# Data Preprocessing

In [2]:
import googlemaps
from datetime import datetime

import os

# Read the API key from the environment (same variable the preprocessing cell
# uses) instead of pasting a key into the notebook.
gmaps = googlemaps.Client(key=os.getenv("GOOGLE_MAPS_API_KEY"))

# Example airport for the geocoding call below; this cell previously used
# `airport_name` without ever defining it, which raises NameError.
airport_name = "LAX"

# Geocoding an address
geocode_result = gmaps.geocode(f'{airport_name} Airport')

# Look up an address with reverse geocoding
reverse_geocode_result = gmaps.reverse_geocode((40.714224, -73.961452))

# Request directions via public transit
now = datetime.now()
directions_result = gmaps.directions("Sydney Town Hall",
                                     "Parramatta, NSW",
                                     mode="transit",
                                     departure_time=now)

# Validate an address with address validation
addressvalidation_result = gmaps.addressvalidation(['1600 Amphitheatre Pk'],
                                                   regionCode='US',
                                                   locality='Mountain View',
                                                   enableUspsCass=True)

# Get an Address Descriptor of a location in the reverse geocoding response
address_descriptor_result = gmaps.reverse_geocode((40.714224, -73.961452), enable_address_descriptor=True)


ModuleNotFoundError: No module named 'googlemaps'

In [None]:
import pandas as pd
import dotenv
import googlemaps
import os
from sklearn.preprocessing import MinMaxScaler
# Load environment variables through python-dotenv before reading the key
dotenv.load_dotenv()

# Build the Google Maps client from the key stored under GOOGLE_MAPS_API_KEY
gmaps_key = os.environ.get("GOOGLE_MAPS_API_KEY")
gmaps = googlemaps.Client(key=gmaps_key)

# Read the ride-share records that the rest of this cell works on
rides = pd.read_csv("RideShares.csv")

# We want to derive new columns from Airport that hold the coordinates of each airport.
## We can use the Google Maps Geocoding API through its Python client: https://github.com/googlemaps/google-maps-services-python

def get_airport_coordinates(airport_name):
    """
    Look up the coordinates of an airport with the Google Maps Geocoding API.

    The query sent to the API is "<airport_name> Airport", issued through the
    module-level `gmaps` client.

    input: airport name we will pass into the API
    output: a tuple that holds both (lat, long); (None, None) when the name is
            missing or blank, when the API returns no result, or when the
            request raises an error
    """
    # A missing value would otherwise be formatted into the query as
    # "nan Airport" / "None Airport" and sent to the API anyway.
    if pd.isna(airport_name) or not str(airport_name).strip():
        return None, None

    try:
        geocode_result = gmaps.geocode(f'{airport_name} Airport')
        if geocode_result:
            # Only the first match in the response is used
            location = geocode_result[0]["geometry"]["location"]
            return location["lat"], location["lng"]
    except Exception as e:
        # Best-effort lookup: report the failure and fall through to the
        # "no coordinates" result so one bad lookup does not stop the caller.
        print(f"Error fetching coordinates for {airport_name}: {e}")
    return None, None
    
# Geocode each distinct airport once and reuse the result for every ride,
# instead of issuing one API request per row.
coords_by_airport = {
    name: get_airport_coordinates(name)
    for name in rides['Airport'].dropna().unique()
}
missing_coords = (None, None)

# append coordinates to new columns (rows without a known airport get NaN)
rides['latitude'] = rides['Airport'].map(
    lambda name: coords_by_airport.get(name, missing_coords)[0]
).astype(float)
rides['longitude'] = rides['Airport'].map(
    lambda name: coords_by_airport.get(name, missing_coords)[1]
).astype(float)

# drop any rows with missing coordinates (if any)
rides.dropna(subset=['latitude', 'longitude'], inplace=True)

# Rescale each coordinate column to the [0, 1] range.
# NOTE(review): latitude and longitude are scaled independently of each other,
# so Euclidean distances on the scaled values are not proportional to distances
# in raw degrees - confirm this is acceptable for the downstream use.
scaler = MinMaxScaler()
rides[["latitude", "longitude"]] = scaler.fit_transform(rides[["latitude", "longitude"]])

In [2]:
!pip install Faker

Collecting Faker
  Using cached faker-37.0.0-py3-none-any.whl.metadata (15 kB)
Using cached faker-37.0.0-py3-none-any.whl (1.9 MB)
Installing collected packages: Faker
Successfully installed Faker-37.0.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [None]:
import csv
import random
from datetime import datetime

import numpy as np
import pandas as pd
from faker import Faker
from scipy.stats import truncnorm

# Load existing data
df = pd.read_csv('RideShares.csv')
# Dates that do not match month/day/year are coerced to NaT rather than raising
df['FlightDate'] = pd.to_datetime(df['FlightDate'], errors='coerce', format='%m/%d/%Y')

fake = Faker()

# Seed every random source the generator draws from so the synthetic data is
# reproducible: the stdlib `random` module, the Faker instance, and NumPy's
# global RNG (scipy's truncnorm.rvs uses it when no random_state is passed).
random.seed(42)
fake.seed_instance(42)
np.random.seed(42)

def generate_data():
    """
    Generate one synthetic customer record based on the real dataset's distributions.

    Reads the module-level `df` (mean/std of its BagNumber column) and draws
    from the module-level `fake` instance and the stdlib `random` module.

    Returns:
        dict with the keys FlightDate (string formatted as "%m/%d/%Y"),
        FlightTime (value of `fake.time()`), Airport, MinWaitTime and
        MaxWaitTime (hours, in quarter-hour steps, MaxWaitTime >= MinWaitTime),
        BagNumber, MaxSpendingRange and DropOffRange.
    """
    # Notes kept from the original author:
    # - min time for range and max time for range (these will be given in time
    #   not time intervals)
    # - prices might vary for ontario vs lax

    # Use real airport choices
    airport = random.choice(['LAX', 'ONT'])

    # min_wait (in .25hr increments): 15 minutes up to 2 hours
    min_wait_minutes = random.choice(range(15, 121, 15))

    # max_wait: up to 5 hours, and never shorter than min_wait. Drawing it
    # independently could produce records whose maximum wait is below the minimum.
    max_wait_minutes = random.choice(range(min_wait_minutes, 301, 15))

    # Both waits are stored in hours
    min_wait = min_wait_minutes / 60
    max_wait = max_wait_minutes / 60

    # max spending range: 75% of records fall in 10-40, the rest in 40-100 (steps of 5)
    if random.random() < 0.75:
        max_spending_range = random.randrange(10, 41, 5)
    else:
        max_spending_range = random.randrange(40, 101, 5)

    # Bags: normal distribution fitted to the real data, truncated below at 0
    def truncated_normal(mean, std, lower=0, upper=float('inf')):
        # truncnorm expects its bounds in units of standard deviations from the mean
        a, b = (lower - mean) / std, (upper - mean) / std
        return truncnorm.rvs(a, b, loc=mean, scale=std)

    mean_bag_number = df['BagNumber'].mean()
    std_bag_number = df['BagNumber'].std()
    # int(...) keeps the bag count a plain integer whatever scalar type round() yields
    bag_no = int(round(truncated_normal(mean_bag_number, std_bag_number)))

    # Dropoff range: one of 0.0, 0.1, ..., 1.0
    dropoff_range = fake.random_int(0, 10) / 10

    # Define date ranges for breaks
    end_of_winter_break_start = datetime(2025, 1, 18)
    end_of_winter_break_end = datetime(2025, 1, 20)
    start_of_winter_break_start = datetime(2024, 12, 11)
    start_of_winter_break_end = datetime(2024, 12, 14)
    spring_break_start = datetime(2024, 3, 14)
    spring_break_end = datetime(2024, 3, 23)
    thanksgiving_start = datetime(2024, 11, 30)
    thanksgiving_end = datetime(2024, 12, 1)

    # Randomly choose one of these four periods (each is equally likely)
    period_choice = random.choice([
        ('spring_break', spring_break_start, spring_break_end),
        ('end_of_winter_break', end_of_winter_break_start, end_of_winter_break_end),
        ('start_of_winter_break', start_of_winter_break_start, start_of_winter_break_end),
        ('thanksgiving', thanksgiving_start, thanksgiving_end),
    ])

    # Sample a date based on the chosen period; the period label is only descriptive
    _period_name, start_date, end_date = period_choice
    flight_date = fake.date_between_dates(start_date, end_date)

    # Convert date to string in the same "%m/%d/%Y" format the real dataset is parsed with
    flight_date_str = flight_date.strftime('%m/%d/%Y')

    customer = {
        'FlightDate': flight_date_str,  # Use the formatted string here
        'FlightTime': fake.time(),
        'Airport': airport,
        'MinWaitTime': min_wait,
        'MaxWaitTime': max_wait,
        'BagNumber': bag_no,
        'MaxSpendingRange': max_spending_range,
        'DropOffRange': dropoff_range
    }

    return customer

# Build the synthetic customer records, one generate_data() call per record
synthetic_customers = [generate_data() for _ in range(10000)]

# Persist the records to CSV: a header row followed by one row per customer
fieldnames = ['FlightDate', 'FlightTime', 'Airport', 'MinWaitTime', 'MaxWaitTime', 'BagNumber', 'MaxSpendingRange', 'DropOffRange']
with open('synthetic_customer_data.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(synthetic_customers)

In [10]:
# Normalize the bag count, spending range, and drop-off range columns (BagNumber, MaxSpendingRange, DropOffRange) to between 0 and 1.
from sklearn.preprocessing import MinMaxScaler

# Read back the synthetic records written to synthetic_customer_data.csv
df2 = pd.read_csv('synthetic_customer_data.csv')
# Numeric columns that will be rescaled; the remaining columns are left untouched
columns_to_normalize = ['BagNumber', 'MaxSpendingRange','DropOffRange']

def normalize_dataframe(df2, columns_to_normalize):
    """
    Return a copy of `df2` with the given columns min-max scaled.

    Args:
        df2: input DataFrame; it is not modified.
        columns_to_normalize: list of column names to rescale with MinMaxScaler.

    Returns:
        A new DataFrame in which the listed columns are replaced by their
        scaled values and all other columns are unchanged.
    """
    # Fit and apply the scaler on the selected columns only
    scaled_values = MinMaxScaler().fit_transform(df2[columns_to_normalize])

    # Work on a copy so the caller's frame keeps its original values
    df_normalized = df2.copy()
    df_normalized[columns_to_normalize] = scaled_values
    return df_normalized

# Scale the selected columns and write the result to CSV without the index column
df2_normalized = normalize_dataframe(df2, columns_to_normalize)
df2_normalized.to_csv("normalized_synthetic_data.csv", index=False)