# Synthetic Data Generation:

In [6]:
import pandas as pd
import dotenv
import googlemaps
import os
from sklearn.preprocessing import MinMaxScaler
# Load variables from a .env file into the environment, then build the
# Google Maps client from the key stored under GOOGLE_MAPS_API_KEY
dotenv.load_dotenv()

gmaps_key = os.environ.get("GOOGLE_MAPS_API_KEY")
gmaps = googlemaps.Client(key=gmaps_key)

# Read the ride-share records that will be augmented with coordinates
rides = pd.read_csv("Synthetic_Data/RideShares.csv")

# Goal: replace the Airport name with new columns holding that airport's coordinates
## The lookup uses the Google Maps Geocoding API through its Python client:
## https://github.com/googlemaps/google-maps-services-python

def get_airport_coordinates(airport_name):
    """
    Look up the coordinates of an airport with the Google Maps geocoding client.

    The query sent to the geocoder is "<airport_name> Airport" and only the
    first result is used. Any exception raised during the lookup is printed
    and treated as "not found".

    input: airport name we will pass into the API
    output: a tuple (lat, lng), or (None, None) when nothing was found or the
            lookup failed
    """
    coordinates = (None, None)
    try:
        matches = gmaps.geocode(f'{airport_name} Airport')
        if matches:
            point = matches[0]["geometry"]["location"]
            coordinates = (point["lat"], point["lng"])
    except Exception as err:
        print(f"Error fetching coordinates for {airport_name}: {err}")
    return coordinates
    
# Geocode each distinct airport only once: get_airport_coordinates calls the
# Geocoding API, so repeating it for every row that shares an airport wastes
# requests. Failed lookups (None, None) are not cached, so later rows retry them.
_airport_coords_cache = {}


def _cached_airport_coordinates(airport_name):
    """Return (lat, lng) for airport_name, reusing earlier successful lookups."""
    if airport_name in _airport_coords_cache:
        return _airport_coords_cache[airport_name]
    coords = get_airport_coordinates(airport_name)
    if coords != (None, None):
        _airport_coords_cache[airport_name] = coords
    return coords


# append coordinates to new columns
rides[['latitude', 'longitude']] = rides['Airport'].apply(lambda x: pd.Series(_cached_airport_coordinates(x)))

# drop any rows with missing coordinates (if any)
rides.dropna(subset=['latitude', 'longitude'], inplace=True)

# Rescale both coordinate columns to a normalized range
# (Euclidean distances can still be computed on the scaled values)
scaler = MinMaxScaler()
rides[["latitude", "longitude"]] = scaler.fit_transform(rides[["latitude", "longitude"]])

In [7]:
import pandas as pd
import dotenv
import googlemaps
import os
from sklearn.preprocessing import MinMaxScaler
# Load variables from a .env file into the environment, then build the
# Google Maps client from the key stored under GOOGLE_MAPS_API_KEY
dotenv.load_dotenv()

gmaps_key = os.environ.get("GOOGLE_MAPS_API_KEY")
gmaps = googlemaps.Client(key=gmaps_key)

# Read the ride-share records that will be augmented with coordinates
rides = pd.read_csv("Synthetic_Data/RideShares.csv")

# Goal: replace the Airport name with new columns holding that airport's coordinates
## The lookup uses the Google Maps Geocoding API through its Python client:
## https://github.com/googlemaps/google-maps-services-python

def get_airport_coordinates(airport_name):
    """
    Look up the coordinates of an airport with the Google Maps geocoding client.

    The query sent to the geocoder is "<airport_name> Airport" and only the
    first result is used. Any exception raised during the lookup is printed
    and treated as "not found".

    input: airport name we will pass into the API
    output: a tuple (lat, lng), or (None, None) when nothing was found or the
            lookup failed
    """
    coordinates = (None, None)
    try:
        matches = gmaps.geocode(f'{airport_name} Airport')
        if matches:
            point = matches[0]["geometry"]["location"]
            coordinates = (point["lat"], point["lng"])
    except Exception as err:
        print(f"Error fetching coordinates for {airport_name}: {err}")
    return coordinates
    
# Geocode each distinct airport only once: get_airport_coordinates calls the
# Geocoding API, so repeating it for every row that shares an airport wastes
# requests. Failed lookups (None, None) are not cached, so later rows retry them.
_airport_coords_cache = {}


def _cached_airport_coordinates(airport_name):
    """Return (lat, lng) for airport_name, reusing earlier successful lookups."""
    if airport_name in _airport_coords_cache:
        return _airport_coords_cache[airport_name]
    coords = get_airport_coordinates(airport_name)
    if coords != (None, None):
        _airport_coords_cache[airport_name] = coords
    return coords


# append coordinates to new columns
rides[['latitude', 'longitude']] = rides['Airport'].apply(lambda x: pd.Series(_cached_airport_coordinates(x)))

# drop any rows with missing coordinates (if any)
rides.dropna(subset=['latitude', 'longitude'], inplace=True)

# Rescale both coordinate columns to a normalized range
# (Euclidean distances can still be computed on the scaled values)
scaler = MinMaxScaler()
rides[["latitude", "longitude"]] = scaler.fit_transform(rides[["latitude", "longitude"]])

In [8]:
import pandas as pd
from faker import Faker
import random
import csv
from datetime import datetime
from scipy.stats import truncnorm
from datetime import timedelta


# Create the Faker instance and fix the seeds of the stdlib and Faker generators
fake = Faker()
random.seed(42)
fake.seed_instance(42)

# Load the real ride-share data; FlightDate values that are not MM/DD/YYYY become NaT
df = pd.read_csv('Synthetic_Data/RideShares.csv')
df['FlightDate'] = pd.to_datetime(df['FlightDate'], format='%m/%d/%Y', errors='coerce')

def generate_data():
    """
    Generate one synthetic customer record.

    Most fields are drawn from fixed choices with the module-level ``random``
    and ``fake`` generators. The bag count is drawn from a normal distribution
    whose mean and standard deviation come from the real dataset's BagNumber
    column (``df``), truncated below at zero and rounded to a whole number.

    output: a dict with the keys FlightDate, FlightTime, Airport, MinWaitTime,
            MaxWaitTime, BagNumber, MaxSpendingRange and DropOffRange
    """
    # NOTE: min/max wait are stored as durations in hours, not as clock times
    # NOTE: prices might vary for Ontario vs LAX; that is not modelled here

    # Airport is picked uniformly from the two supported airports
    airport = random.choice(['LAX', 'ONT'])

    # Minimum wait in hours: 0.25 .. 2.0 in quarter-hour increments
    min_wait = random.choice([x / 60 for x in range(15, 121, 15)])

    # Maximum wait in hours: 2.0 .. 5.0 in quarter-hour increments
    max_wait = random.choice([x / 60 for x in range(120, 301, 15)])

    # Maximum spend in steps of 5: 10..40 with probability 0.75, otherwise 40..100
    if random.random() < 0.75:
        max_spending_range = random.randrange(10, 41, 5)
    else:
        max_spending_range = random.randrange(40, 101, 5)

    def truncated_normal(mean, std, lower=0, upper=float('inf')):
        """Draw one sample from a normal(mean, std) truncated to [lower, upper]."""
        # truncnorm expects its bounds in units of standard deviations from the mean
        a, b = (lower - mean) / std, (upper - mean) / std
        # SciPy would otherwise draw from NumPy's global state, which random.seed()
        # does not touch; seeding from `random` keeps this draw reproducible too.
        return truncnorm.rvs(a, b, loc=mean, scale=std, random_state=random.getrandbits(32))

    # Bag count follows the real data's mean/std, bounded below at zero
    mean_bag_number = df['BagNumber'].mean()
    std_bag_number = df['BagNumber'].std()
    bag_no = round(truncated_normal(mean_bag_number, std_bag_number))

    # Drop-off range: one of 0.0, 0.1, ..., 1.0
    dropoff_range = fake.random_int(0, 10) / 10

    # Flight date falls somewhere in the seven days up to and including today
    today = datetime.today()
    week_ago = today - timedelta(days=7)
    flight_date = fake.date_between_dates(week_ago, today)

    # Store the date as text in the same MM/DD/YYYY layout used when the real data is parsed
    flight_date_str = flight_date.strftime('%m/%d/%Y')

    customer = {
        'FlightDate': flight_date_str,
        'FlightTime': fake.time(),
        'Airport': airport,
        'MinWaitTime': min_wait,
        'MaxWaitTime': max_wait,
        'BagNumber': bag_no,
        'MaxSpendingRange': max_spending_range,
        'DropOffRange': dropoff_range
    }

    return customer

# Build 10,000 synthetic customer records
synthetic_customers = [generate_data() for _ in range(10000)]

# Save the records as CSV, one row per customer, with a header line
with open('Synthetic_Data/synthetic_customer_data.csv', 'w', newline='') as csvfile:
    fieldnames = [
        'FlightDate',
        'FlightTime',
        'Airport',
        'MinWaitTime',
        'MaxWaitTime',
        'BagNumber',
        'MaxSpendingRange',
        'DropOffRange',
    ]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    writer.writeheader()
    writer.writerows(synthetic_customers)

# Data Pre-Processing

In [10]:
# Normalize the numeric feature columns (bag count, spending range, drop-off range, coordinates, wait times and time encodings) to between 0 and 1, then standardize them.
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler

# Input dataset; it is expected to contain every column listed in `features`
df2 = pd.read_csv('Synthetic_Data/attempt1.csv')

# Numeric columns that get rescaled
features = [
    'BagNumber',
    'MaxSpendingRange',
    'DropOffRange',
    'latitude',
    'longitude',
    'MinWaitTime',
    'MaxWaitTime',
    'ElapsedTime',
    'datetime_sin',
    'datetime_cos',
]

def normalize_dataframe(df2, features):
    """
    Return a copy of df2 whose `features` columns are min-max scaled.

    input: df2 - the dataframe to scale (left unmodified)
           features - list of column names to rescale with MinMaxScaler
    output: a new dataframe with the scaled columns; other columns are unchanged
    """
    scaled_values = MinMaxScaler().fit_transform(df2[features])

    result = df2.copy()
    result[features] = scaled_values
    return result

# Min-max scale the features and save the result
normalized_path = "Synthetic_Data/normalized_synthetic_data.csv"
df2_normalized = normalize_dataframe(df2, features)
df2_normalized.to_csv(normalized_path, index=False)


# standard scaler: reload the normalized data that was just written
df3 = pd.read_csv(normalized_path)
def standard_scalar(df3, features):
    """
    Return a copy of df3 whose `features` columns are rescaled with StandardScaler.

    input: df3 - the dataframe to scale (left unmodified)
           features - list of column names to standardize
    output: a new dataframe with the standardized columns; other columns are unchanged
    """
    standardized_values = StandardScaler().fit_transform(df3[features])

    result = df3.copy()
    result[features] = standardized_values
    return result

# Standardize the feature columns and save the result without the index column
df3_standard_scalar = standard_scalar(df3, features)
df3_standard_scalar.to_csv(path_or_buf="Synthetic_Data/standard_scalar_data.csv", index=False)

In [23]:
# Cyclical Encoding for Date & Time
import pandas as pd
import numpy as np
import sklearn.preprocessing

# Load CSV
df = pd.read_csv("Synthetic_Data/normalized_synthetic_data.csv")

# Parse date and time with fixed formats; values that do not match become NaT
# Date: YYYY-MM-DD
# Time: HH:MM:SS
df['FlightDate'] = pd.to_datetime(df['FlightDate'], format='%Y-%m-%d', errors='coerce')
df['FlightTime'] = pd.to_datetime(df['FlightTime'], errors='coerce', format='%H:%M:%S').dt.time

# Drop rows where either FlightDate or FlightTime couldn't be parsed
df.dropna(subset=['FlightDate', 'FlightTime'], inplace=True)

# Without any valid row the min/max timestamps below are NaT and the minute
# count cannot be computed, so fail here with an explicit message instead.
if df.empty:
    raise ValueError("No rows with a valid FlightDate and FlightTime; cannot build the time encoding")

# Combine date and time into one timestamp column (e.g. 2025-01-01 13:00),
# truncated to whole minutes
df['FlightDateTime'] = df['FlightDate'].astype(str) + " " + df['FlightTime'].astype(str)
df['FlightDateTime'] = pd.to_datetime(df['FlightDateTime']).dt.floor('min')

# Drop any rows where conversion failed
df.dropna(subset=['FlightDateTime'], inplace=True)

# The earliest timestamp in the data is the base point (elapsed time 0)
earliest_time = df['FlightDateTime'].min()

# The latest timestamp in the data marks the end of the covered span
latest_time = df['FlightDateTime'].max()

# Number of whole minutes between each row's timestamp and the base point
df["ElapsedTime"] = (((df['FlightDateTime'] - earliest_time).dt.total_seconds()) / 60).astype(int)

# Length of the covered span in minutes
total_minutes = int(((latest_time - earliest_time).total_seconds()) / 60)

# When every row shares the same minute the span is 0; dividing by it would
# write NaN into both encoding columns, so use a period of at least one minute.
cycle_minutes = max(total_minutes, 1)

# Cyclical encoding: map elapsed time onto a circle with sine and cosine.
# NOTE(review): the period equals the full data span, so the earliest and the
# latest timestamps land on the same point of the circle - confirm this is intended.
df['datetime_sin'] = np.sin(2 * np.pi * df['ElapsedTime'] / cycle_minutes)

df['datetime_cos'] = np.cos(2 * np.pi * df['ElapsedTime'] / cycle_minutes)

df.to_csv("Synthetic_Data/attempt1.csv", index=False)



     FlightDate FlightTime Airport  MinWaitTime  MaxWaitTime  BagNumber  \
0    2025-01-18   15:29:44     LAX     0.000000     0.916667        0.2   
1    2024-03-16   22:23:43     LAX     0.142857     0.750000        0.8   
2    2025-01-18   10:57:58     LAX     0.428571     0.666667        0.4   
3    2024-03-15   18:30:12     ONT     0.428571     0.583333        0.8   
4    2024-12-11   14:44:07     LAX     0.857143     0.416667        0.4   
...         ...        ...     ...          ...          ...        ...   
9995 2024-03-20   20:35:31     ONT     0.285714     0.500000        0.6   
9996 2025-01-18   16:55:38     LAX     0.428571     1.000000        0.2   
9997 2024-11-30   20:04:41     LAX     1.000000     0.500000        0.2   
9998 2024-12-11   17:19:43     LAX     0.285714     0.250000        0.6   
9999 2025-01-18   12:25:13     ONT     0.714286     0.916667        0.2   

      MaxSpendingRange  DropOffRange  latitude  longitude  \
0             0.055556           1.0  