In [1]:
# Import 3rd party libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Configure Notebook
%matplotlib inline
plt.style.use('fivethirtyeight')
sns.set_context("notebook")
import warnings
warnings.filterwarnings('ignore')

#### Open the merged dataset of the HH and INDIV datasets.

In [2]:
# Load the combined household (HH) + individual (INDIV) table
merged_input_csv = 'Merged_Deliv_1.csv'
merged_df = pd.read_csv(merged_input_csv)

This step extracts the ZIP codes from the main dataset and maps each one to a simplified city (region) label based on its three-digit prefix.

In [3]:
# The prefix lookup slices the first three characters, so HZIP has to hold text
hzip_as_text = merged_df['HZIP'].astype(str)
merged_df['HZIP'] = hzip_as_text

# Define ZIP code prefixes (first three digits) for California regions.
# NOTE(review): '912' is listed under both 'Pasadena' and 'Inland Empire'. Lookup is
# first-match in dict order, so '912' always resolves to 'Pasadena' and the
# 'Inland Empire' entry never matches -- confirm which prefix was intended there.
zip_prefixes = {
    'Los Angeles': ['900', '902', '903', '904', '905', '906', '907', '908'],
    'Pasadena': ['910', '911', '912', '913', '914', '915', '916', '917', '918'],
    'San Diego': ['919', '920', '921'],
    'Riverside': ['922', '923', '925'],
    'Orange County': ['926', '927', '928'],
    'San Francisco Bay Area': ['940', '941', '943', '944', '945', '946', '947', '948', '949'],
    'San Jose': ['950', '951'],
    'Stockton': ['952'],
    'Sacramento': ['956', '957', '958'],
    'Fresno Area': ['936', '937'],
    'Ventura and Santa Barbara Area': ['930', '931', '932'],
    'Northern California': ['954', '955', '959', '976', '960', '961'],
    'California Central Coast': ['934', '939'],
    'Central California': ['933', '953'],
    'Southern California': ['935'],
    'Inland Empire': ['924', '912']
}


def _build_prefix_index(prefix_map):
    """Invert {city: [prefixes]} into {prefix: city}; the first city listing a prefix wins."""
    index = {}
    for city, prefixes in prefix_map.items():
        for prefix in prefixes:
            # setdefault keeps the earliest city, matching a first-match scan in dict order
            index.setdefault(prefix, city)
    return index


# Built once from zip_prefixes so each lookup is a single dict access instead of a
# scan over every region. Rebuild it if zip_prefixes is edited after this cell runs.
_PREFIX_TO_CITY = _build_prefix_index(zip_prefixes)


# Define a function to find the matching city
def assign_city(zipcode):
    """
    Map a ZIP code to its simplified city/region label.

    Parameters:
    - zipcode: ZIP code, normally a string such as '92061'. Other values are
      converted with str() first, so 92061 and 92061.0 also resolve by prefix.

    Returns:
    - str | None: The region whose prefix list contains the first three characters
      of the ZIP code, or None when nothing matches (including None and 'nan').
    """
    if zipcode is None:
        return None
    prefix = str(zipcode).strip()[:3]
    return _PREFIX_TO_CITY.get(prefix)  # If no match, return None

# Label every row with its region by passing each ZIP code through assign_city
merged_df['simplified city'] = merged_df['HZIP'].map(assign_city)

To merge weather data onto the dataset, we need a date. Hence, we will merge the ASSN_TravelDate.csv dataset to the main dataset to obtain a temporal feature which can be used as a key.

In [4]:
df_dates = pd.read_csv("ASSN_TravelDate.csv")

# Left-join the travel dates onto the main table using the shared "ASSN" column,
# then parse 'TDATE' into real datetimes so it can serve as a merge key later
merged_df1 = (
    merged_df
    .merge(df_dates, how="left", on="ASSN")
    .assign(TDATE=lambda frame: pd.to_datetime(frame['TDATE']))
)

# Preview the first rows of the result
merged_df1.head()

Unnamed: 0,SAMPN,PERNO,RELAT,GEND,AGE,HISP,NTVTY,LIC,USER,TRANS,...,HCTRACT,HPrimaryCity,HHWGT,EXPHHWGT,Merged_BUYER,Merged_HHNOV,Merged_LDPER,simplified city,TDATE,day_name
0,1046924.0,1.0,1.0,2.0,77.0,2.0,1.0,1.0,1.0,2.0,...,19101,PAUMA VALLEY,2.134283,625.389658,"[nan, nan, nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan, nan, nan]",San Diego,2012-07-18,Wednesday
1,1046924.0,2.0,2.0,1.0,77.0,2.0,1.0,1.0,1.0,2.0,...,19101,PAUMA VALLEY,2.134283,625.389658,"[nan, nan, nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan, nan, nan]",San Diego,2012-07-18,Wednesday
2,1047092.0,1.0,1.0,2.0,50.0,2.0,1.0,1.0,1.0,2.0,...,427700,ALAMEDA,0.784909,229.994688,"[2.0, nan, nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan, nan, nan]","[98.0, nan, nan, nan, nan, nan, nan, nan]",San Francisco Bay Area,2012-05-05,Saturday
3,1047092.0,2.0,2.0,1.0,51.0,2.0,1.0,1.0,2.0,2.0,...,427700,ALAMEDA,0.784909,229.994688,"[2.0, nan, nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan, nan, nan]","[98.0, nan, nan, nan, nan, nan, nan, nan]",San Francisco Bay Area,2012-05-05,Saturday
4,1048704.0,1.0,1.0,2.0,36.0,1.0,2.0,2.0,,2.0,...,207,MONTCLAIR,1.853113,543.001077,"[nan, nan, nan, nan, nan, nan, nan, nan]","[2.0, nan, nan, nan, nan, nan, nan, nan]","[nan, nan, nan, nan, nan, nan, nan, nan]",Pasadena,2012-07-05,Thursday


The following function reshapes a raw weather table so that it can be merged onto the main dataset.

In [5]:
def process_weather_city(df, city, utc_offset_hours=-5):
    """
    Prepare one region's raw weather table for merging onto the main dataset.

    Steps:
    1. Read the Unix-second 'time' column as UTC, shift it by ``utc_offset_hours``
       and keep only the calendar date, stored as a timezone-naive 'TDATE'.
    2. Drop the original 'time' column.
    3. Make 'TDATE' the first column.
    4. Add a 'simplified city' column holding ``city``.

    The input frame is not modified; a new DataFrame is returned.

    Parameters:
    - df (pd.DataFrame): The input DataFrame with a 'time' column (Unix time in seconds).
    - city (str): The city name to assign to the 'simplified city' column.
    - utc_offset_hours (int | float): Fixed offset from UTC applied before the date
      is taken. The default of -5 (EST) reproduces the original behaviour.

    Returns:
    - pd.DataFrame: The processed DataFrame.

    Raises:
    - KeyError: If ``df`` has no 'time' column.
    """
    # NOTE(review): the regions handled in this notebook are in California, yet the
    # default offset is EST (UTC-5). Confirm this matches how the weather files were
    # exported before changing it -- a different offset can move rows to another date.
    shifted = pd.to_datetime(df['time'], unit='s', utc=True) + pd.Timedelta(hours=utc_offset_hours)

    # Truncate to midnight and drop the timezone so the key is a plain date-like datetime
    travel_date = shifted.dt.normalize().dt.tz_localize(None)

    # Build the result from a copy: the caller's frame stays untouched and the
    # column assignments below never write into a slice of it
    other_cols = [col for col in df.columns if col not in ('time', 'TDATE')]
    out = df[other_cols].copy()
    out.insert(0, 'TDATE', travel_date)

    # Add the 'simplified city' column
    out['simplified city'] = city

    return out


Import and prepare the weather data to be merged with the dataset.

In [6]:
def _load_weather(csv_name, city):
    """Read one raw weather CSV (read with header=2) and pass it through process_weather_city."""
    raw_weather = pd.read_csv(csv_name, header=2)
    return process_weather_city(raw_weather, city)


# One processed weather frame per simplified city / region
df_LA = _load_weather('LA_raw.csv', 'Los Angeles')
df_PA = _load_weather('Pasadena_raw.csv', 'Pasadena')
df_SD = _load_weather('SanDiego_raw.csv', 'San Diego')
df_RS = _load_weather('Riverside_raw.csv', 'Riverside')
df_OC = _load_weather('OrangeCounty_raw.csv', 'Orange County')
df_SFBA = _load_weather('SFBA_raw.csv', 'San Francisco Bay Area')
df_SJ = _load_weather('SanJose_raw.csv', 'San Jose')
df_Stn = _load_weather('Stockton_raw.csv', 'Stockton')
df_Sc = _load_weather('Sacramento_raw.csv', 'Sacramento')
df_FA = _load_weather('FresnoArea_raw.csv', 'Fresno Area')
df_VSB = _load_weather('VSB_raw.csv', 'Ventura and Santa Barbara Area')
df_NC = _load_weather('NC_raw.csv', 'Northern California')
df_CCCo = _load_weather('CCCoast_raw.csv', 'California Central Coast')
df_CC = _load_weather('CC_raw.csv', 'Central California')
df_SouthC = _load_weather('SouthC_raw.csv', 'Southern California')
df_IE = _load_weather('IE_raw.csv', 'Inland Empire')

Now, we are ready to merge the weather data onto the main dataset. The keys used are the cities and the travel date.

In [7]:
# Stack the regional weather frames into a single table
regional_weather = [
    df_LA, df_PA, df_SD, df_RS, df_OC, df_SFBA, df_SJ, df_Stn,
    df_Sc, df_FA, df_VSB, df_NC, df_CCCo, df_CC, df_SouthC, df_IE,
]
weather_df = pd.concat(regional_weather, ignore_index=True)

# Left-join weather onto the main table, keyed on region and travel date
merged_df_weather = merged_df1.merge(weather_df, how='left', on=['simplified city', 'TDATE'])

# Preview the result
merged_df_weather.head()

Unnamed: 0,SAMPN,PERNO,RELAT,GEND,AGE,HISP,NTVTY,LIC,USER,TRANS,...,weather_code (wmo code),temperature_2m_max (°C),temperature_2m_min (°C),temperature_2m_mean (°C),apparent_temperature_mean (°C),daylight_duration (s),precipitation_sum (mm),rain_sum (mm),snowfall_sum (cm),precipitation_hours (h)
0,1046924.0,1.0,1.0,2.0,77.0,2.0,1.0,1.0,1.0,2.0,...,51.0,24.3,16.2,21.0,21.1,50522.89,0.2,0.2,0.0,1.0
1,1046924.0,2.0,2.0,1.0,77.0,2.0,1.0,1.0,1.0,2.0,...,51.0,24.3,16.2,21.0,21.1,50522.89,0.2,0.2,0.0,1.0
2,1047092.0,1.0,1.0,2.0,50.0,2.0,1.0,1.0,1.0,2.0,...,0.0,22.2,7.6,15.2,12.1,50133.78,0.0,0.0,0.0,0.0
3,1047092.0,2.0,2.0,1.0,51.0,2.0,1.0,1.0,2.0,2.0,...,0.0,22.2,7.6,15.2,12.1,50133.78,0.0,0.0,0.0,0.0
4,1048704.0,1.0,1.0,2.0,36.0,1.0,2.0,2.0,,2.0,...,0.0,26.1,13.7,19.6,20.1,51642.6,0.0,0.0,0.0,0.0


In [8]:
# Persist the weather-enriched dataset.
# NOTE(review): to_csv writes the row index as an extra leading column by default;
# confirm downstream readers expect that before switching to index=False.
weather_output_csv = 'merged_weather_1.csv'
merged_df_weather.to_csv(weather_output_csv)

#### Open the cleaned VEH dataset

In [9]:
# Load the cleaned vehicle (VEH) table.
# NOTE(review): the preview of this frame shows an 'Unnamed: 0' column that looks like
# a previously saved row index -- confirm whether it should be dropped or read as the index.
vehicle_input_csv = 'cleaned_veh_df.csv'
clean_veh = pd.read_csv(vehicle_input_csv)

In [10]:
# Preview the first rows of the vehicle table (head() shows five rows by default)
clean_veh.head()

Unnamed: 0.1,Unnamed: 0,hh_veh_id,Year of vehicle,Vehicle body type,Primary fuel type,Vehicle acquired,Vehicle ownership type,Vehicle transmission type,Vehicle drive type,Vehicle cylinder count,Vehicle type,was vehicle used on travel day,reason why not
0,0,1031985_1,2006.0,1.0,1.0,2.0,1.0,1.0,2.0,4.0,2.0,1,
1,1,1031985_2,1987.0,5.0,1.0,2.0,1.0,,,,2.0,2,
2,2,1032036_1,2007.0,8.0,1.0,1.0,1.0,1.0,1.0,4.0,2.0,1,
3,3,1032053_1,2001.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,2.0,1,
4,4,1032053_2,2003.0,2.0,1.0,2.0,1.0,1.0,3.0,5.0,2.0,1,


In [12]:
# Build the join key '<SAMPN>_<PERNO>': both parts are cast from float to int first
# so the text form carries no trailing '.0'
person_key = (
    merged_df_weather['SAMPN'].astype(int).astype(str)
    + '_'
    + merged_df_weather['PERNO'].astype(int).astype(str)
)
merged_df_weather = merged_df_weather.assign(hh_veh_id=person_key)

# Put 'TDATE' and 'hh_veh_id' first, keep every other column in its existing order,
# and leave out 'SAMPN' / 'PERNO' now that they are folded into the key
leading_cols = ['TDATE', 'hh_veh_id']
dropped_cols = ['SAMPN', 'PERNO']
remaining_cols = [
    col for col in merged_df_weather.columns
    if col not in leading_cols and col not in dropped_cols
]
merged_df_weather = merged_df_weather[leading_cols + remaining_cols]

In [13]:
# Preview the first rows now that 'TDATE' and 'hh_veh_id' lead the columns
merged_df_weather.head()

Unnamed: 0,TDATE,hh_veh_id,RELAT,GEND,AGE,HISP,NTVTY,LIC,USER,TRANS,...,weather_code (wmo code),temperature_2m_max (°C),temperature_2m_min (°C),temperature_2m_mean (°C),apparent_temperature_mean (°C),daylight_duration (s),precipitation_sum (mm),rain_sum (mm),snowfall_sum (cm),precipitation_hours (h)
0,2012-07-18,1046924_1,1.0,2.0,77.0,2.0,1.0,1.0,1.0,2.0,...,51.0,24.3,16.2,21.0,21.1,50522.89,0.2,0.2,0.0,1.0
1,2012-07-18,1046924_2,2.0,1.0,77.0,2.0,1.0,1.0,1.0,2.0,...,51.0,24.3,16.2,21.0,21.1,50522.89,0.2,0.2,0.0,1.0
2,2012-05-05,1047092_1,1.0,2.0,50.0,2.0,1.0,1.0,1.0,2.0,...,0.0,22.2,7.6,15.2,12.1,50133.78,0.0,0.0,0.0,0.0
3,2012-05-05,1047092_2,2.0,1.0,51.0,2.0,1.0,1.0,2.0,2.0,...,0.0,22.2,7.6,15.2,12.1,50133.78,0.0,0.0,0.0,0.0
4,2012-07-05,1048704_1,1.0,2.0,36.0,1.0,2.0,2.0,,2.0,...,0.0,26.1,13.7,19.6,20.1,51642.6,0.0,0.0,0.0,0.0


In [14]:
# Left-join the vehicle table on 'hh_veh_id', expose that key as 'ID', and order the
# rows chronologically by travel date with a fresh 0..n-1 index
merged_df_weather_veh = (
    merged_df_weather
    .merge(clean_veh, how='left', on=['hh_veh_id'])
    .rename(columns={'hh_veh_id': 'ID'})
    .sort_values(by='TDATE', ascending=True)
    .reset_index(drop=True)
)

In [15]:
# Preview the first rows; the frame is sorted by 'TDATE', so these are the earliest dates
merged_df_weather_veh.head()

Unnamed: 0,TDATE,ID,RELAT,GEND,AGE,HISP,NTVTY,LIC,USER,TRANS,...,Vehicle body type,Primary fuel type,Vehicle acquired,Vehicle ownership type,Vehicle transmission type,Vehicle drive type,Vehicle cylinder count,Vehicle type,was vehicle used on travel day,reason why not
0,2012-02-01,1138101_2,9.0,1.0,61.0,2.0,1.0,2.0,,2.0,...,,,,,,,,,,
1,2012-02-01,1120264_1,1.0,1.0,51.0,9.0,1.0,1.0,1.0,2.0,...,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,
2,2012-02-01,1120264_2,2.0,2.0,51.0,9.0,1.0,1.0,2.0,2.0,...,5.0,1.0,1.0,1.0,1.0,2.0,4.0,2.0,1.0,
3,2012-02-01,1120264_3,3.0,1.0,26.0,2.0,1.0,1.0,3.0,2.0,...,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0
4,2012-02-01,1120296_1,1.0,2.0,58.0,1.0,1.0,2.0,,2.0,...,,,,,,,,,,


In [16]:
# Persist the final weather + vehicle dataset (the row index is written as a leading column)
final_output_csv = 'merged_weather_VEH_1.csv'
merged_df_weather_veh.to_csv(final_output_csv)