In [119]:
import pandas as pd
import numpy as np

# Data Wrangling and Creation of Final Dataframe for EDA

Using the preliminary CSV file with household-level, vehicle, weather, and person-level data, we perform further cleaning and merge it with the dataset containing Places.

In [120]:
# low_memory=False makes pandas parse the file in one pass, so columns with uncertain
# datatypes get a single inferred dtype; index_col=False keeps the first CSV column
# as ordinary data rather than using it as the index.
temp_df = pd.read_csv(
    "merged_weather_VEH_1.csv",
    low_memory=False,
    index_col=False,
)

In [121]:
# Preview the freshly loaded table (the notebook renders long frames truncated).
temp_df

Unnamed: 0,Unnamed: 0.1,TDATE,ID,RELAT,GEND,AGE,HISP,NTVTY,LIC,USER,...,Vehicle body type,Primary fuel type,Vehicle acquired,Vehicle ownership type,Vehicle transmission type,Vehicle drive type,Vehicle cylinder count,Vehicle type,was vehicle used on travel day,reason why not
0,0,2012-02-01,1138101_2,9.0,1.0,61.0,2.0,1.0,2.0,,...,,,,,,,,,,
1,1,2012-02-01,1120264_1,1.0,1.0,51.0,9.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,
2,2,2012-02-01,1120264_2,2.0,2.0,51.0,9.0,1.0,1.0,2.0,...,5.0,1.0,1.0,1.0,1.0,2.0,4.0,2.0,1.0,
3,3,2012-02-01,1120264_3,3.0,1.0,26.0,2.0,1.0,1.0,3.0,...,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0
4,4,2012-02-01,1120296_1,1.0,2.0,58.0,1.0,1.0,2.0,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
109108,109108,,7168253_1,1.0,1.0,50.0,2.0,2.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,
109109,109109,,7168253_2,2.0,2.0,49.0,2.0,2.0,1.0,2.0,...,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,
109110,109110,,7168253_3,3.0,2.0,23.0,2.0,1.0,1.0,5.0,...,1.0,1.0,1.0,1.0,1.0,3.0,4.0,2.0,1.0,
109111,109111,,7168253_4,3.0,1.0,22.0,2.0,1.0,1.0,3.0,...,8.0,1.0,1.0,1.0,1.0,1.0,4.0,2.0,2.0,2.0


The dataset "merged_weather" that we are working with at this stage is too large and contains unnecessary columns. Let's first simplify the dataframe by removing the columns listed below. Not all of these columns provide meaningful data, so to further simplify and clean the data, the following processing decisions were made:
- Columns were dropped if they were deemed irrelevant, redundant, or had severe class imbalance that would introduce significant bias or variance errors
- Columns were dropped if they contained information that would result in data leakage. For example, a column recording why a vehicle was not used for a trip lets the model infer directly that a vehicle was not used, thus revealing the outcome. This gives the model an unfair advantage, and it is information that is not available during prediction.

In [134]:
# Names of the columns removed from temp_df before the merge. The order is kept
# exactly as originally written; DataFrame.drop does not depend on it.
columns = [
    'AREA', 'COMPR', 'DOW', 'HCTRACT', 'HSTAT', 'ILANG',
    'Merged_BUYER', 'Merged_CLIP', 'Merged_HHNOV', 'Merged_LDPER', 'Merged_SXST',
    'Merged_TOLLB', 'Merged_TOLLR', 'Merged_TPTYP', 'Merged_WXST',
    'RECDate', 'RELAT', 'SSTAT', 'SXCORD', 'SYCORD',
    # NOTE(review): these look like index columns left over from earlier CSV
    # round-trips -- confirm.
    'Unnamed: 0', 'Unnamed: 0.1',
    'Vehicle drive type', 'day_name', 'simplified city',
    # Presumably the data-leakage columns (why a vehicle was not used / no trip
    # was made) -- verify 'NOGOWHY' against the survey codebook.
    'NOGOWHY', 'reason why not',
]

In [123]:
# Re-bind instead of dropping in place, and skip names that are already absent, so the
# cell can be executed more than once without raising KeyError on the second run.
# Trade-off: a misspelled name in `columns` is silently skipped rather than reported.
temp_df = temp_df.drop(columns=columns, errors="ignore")
temp_df.head()

Unnamed: 0,TDATE,ID,GEND,AGE,HISP,NTVTY,LIC,USER,TRANS,FLEX,...,precipitation_hours (h),Year of vehicle,Vehicle body type,Primary fuel type,Vehicle acquired,Vehicle ownership type,Vehicle transmission type,Vehicle cylinder count,Vehicle type,was vehicle used on travel day
0,2012-02-01,1138101_2,1.0,61.0,2.0,1.0,2.0,,2.0,2.0,...,0.0,,,,,,,,,
1,2012-02-01,1120264_1,1.0,51.0,9.0,1.0,1.0,1.0,2.0,2.0,...,0.0,2009.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0
2,2012-02-01,1120264_2,2.0,51.0,9.0,1.0,1.0,2.0,2.0,2.0,...,0.0,2004.0,5.0,1.0,1.0,1.0,1.0,4.0,2.0,1.0
3,2012-02-01,1120264_3,1.0,26.0,2.0,1.0,1.0,3.0,2.0,2.0,...,0.0,2004.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0
4,2012-02-01,1120296_1,2.0,58.0,1.0,1.0,2.0,,2.0,2.0,...,1.0,,,,,,,,,


In [124]:
# Load the Places table and key it by 'hh_person_id'; the identifier is kept as a
# (possibly repeating) index rather than as a regular column.
df_to_merge = (
    pd.read_csv('cleaned_places_df.csv', low_memory=False)
    .set_index("hh_person_id")
)

In [125]:
# Preview the Places table, indexed by hh_person_id.
df_to_merge

Unnamed: 0_level_0,Total number of people traveling on trip,Number of household members on trip,Assigned travel day,Duration of trip,Duration of activity,Zip code,Trip distance,Time of arrival,Time of departure,city_from_zip,mode_category
hh_person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1031985_1,2.0,0.0,1.0,22.0,231,94510,13.428271,09:02,12:53,San Francisco Bay Area,Auto
1031985_1,2.0,0.0,,20.0,826,94591,12.975526,13:13,02:59,San Francisco Bay Area,Auto
1031985_2,1.0,0.0,1.0,10.0,77,94589,5.125961,09:26,10:43,San Francisco Bay Area,Auto
1031985_2,1.0,0.0,,13.0,963,94591,5.126981,10:56,02:59,San Francisco Bay Area,Auto
1032036_1,3.0,2.0,1.0,20.0,415,92128,3.619057,08:35,15:30,Greater Los Angeles Area,Auto
...,...,...,...,...,...,...,...,...,...,...,...
7211560_4,2.0,1.0,1.0,45.0,286,95117,12.978730,08:09,12:55,San Francisco Bay Area,Auto
7211560_4,2.0,1.0,1.0,8.0,23,95117,2.240663,13:03,13:26,San Francisco Bay Area,Auto
7211560_4,2.0,1.0,,18.0,795,95148,12.284206,13:44,02:59,San Francisco Bay Area,Auto
7211862_3,3.0,0.0,1.0,120.0,118,95661,87.576838,17:20,19:18,Sacramento Area,Auto


In [126]:
# Compare the sizes of the DataFrames
print(f"Number of rows in df_to_merge: {df_to_merge.shape[0]}")
print(f"Number of rows in temp_df: {temp_df.shape[0]}")

# Count the distinct identifiers on each side (index of df_to_merge vs. 'ID' column of temp_df).
# NOTE: this compares counts only; it does not verify that every identifier in
# df_to_merge actually occurs in temp_df['ID'].
print(f"Number of unique IDs in df_to_merge: {df_to_merge.index.nunique()}")
print(f"Number of unique IDs in temp_df: {temp_df['ID'].nunique()}")

Number of rows in df_to_merge: 290067
Number of rows in temp_df: 109113
Number of unique IDs in df_to_merge: 69399
Number of unique IDs in temp_df: 109113


In [127]:
def final_merge(temp_df, df_to_merge):
    """
    Attach the individual-level columns of temp_df to every row of df_to_merge.

    Parameters:
        temp_df (pd.DataFrame): Table with an 'ID' column; among the rows whose ID
            occurs in df_to_merge, each ID must appear only once.
        df_to_merge (pd.DataFrame): Table whose index is named 'hh_person_id' and
            which has a 'Trip distance' column. Index values may repeat.

    Returns:
        pd.DataFrame: The columns of df_to_merge followed by the columns of temp_df
        (without 'ID'), carrying the index of df_to_merge and restricted to rows
        where 'Trip distance' is not null. Rows whose identifier has no match in
        temp_df are kept, with NaN in the temp_df columns.

    Raises:
        ValueError: If an ID that occurs in df_to_merge appears more than once in temp_df.
    """
    # Step 1: Key the individual-level table by its identifier (the input is not modified)
    person_df = temp_df.set_index('ID')

    # Step 2: Keep only the individuals that are referenced by df_to_merge
    trip_ids = df_to_merge.index.get_level_values('hh_person_id')
    person_df = person_df[person_df.index.isin(trip_ids)]
    if not person_df.index.is_unique:
        raise ValueError("temp_df contains duplicate 'ID' values; cannot map them onto df_to_merge")

    # Step 3: Expand to one row per row of df_to_merge. reindex repeats a person's row
    # for every occurrence of the identifier and fills unmatched identifiers with NaN,
    # instead of building one dict per row (which breaks when an identifier is unmatched).
    expanded_individual_df = person_df.reindex(trip_ids)

    # Step 4: Reuse df_to_merge's own index so the column-wise concat aligns
    # row-for-row even though the index labels repeat
    expanded_individual_df.index = df_to_merge.index

    # Step 5: Combine the df_to_merge columns with the expanded individual-level columns
    final_df = pd.concat([df_to_merge, expanded_individual_df], axis=1)

    # Step 6: Keep only rows with a recorded value in 'Trip distance'
    final_df = final_df[final_df['Trip distance'].notnull()]
    return final_df

In [128]:
# Build the merged table: the columns of temp_df attached to every row of df_to_merge.
final_df = final_merge(temp_df,df_to_merge)
# Verify the updated shape and data after filtering
print(f"Final DataFrame shape after filtering for recorded trips: {final_df.shape}")
# print() emits the plain-text repr of the frame, which wraps wide tables across several blocks.
print(final_df.head())

Final DataFrame shape after filtering for recorded trips: (290067, 89)
              Total number of people traveling on trip  \
hh_person_id                                             
1031985_1                                          2.0   
1031985_1                                          2.0   
1031985_2                                          1.0   
1031985_2                                          1.0   
1032036_1                                          3.0   

              Number of household members on trip  Assigned travel day  \
hh_person_id                                                             
1031985_1                                     0.0                  1.0   
1031985_1                                     0.0                  NaN   
1031985_2                                     0.0                  1.0   
1031985_2                                     0.0                  NaN   
1032036_1                                     2.0                  1.0   

   

In [129]:
# Preview the merged table (the notebook renders long frames truncated).
final_df

Unnamed: 0_level_0,Total number of people traveling on trip,Number of household members on trip,Assigned travel day,Duration of trip,Duration of activity,Zip code,Trip distance,Time of arrival,Time of departure,city_from_zip,...,precipitation_hours (h),Year of vehicle,Vehicle body type,Primary fuel type,Vehicle acquired,Vehicle ownership type,Vehicle transmission type,Vehicle cylinder count,Vehicle type,was vehicle used on travel day
hh_person_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1031985_1,2.0,0.0,1.0,22.0,231,94510,13.428271,09:02,12:53,San Francisco Bay Area,...,0.0,2006.0,1.0,1.0,2.0,1.0,1.0,4.0,2.0,1.0
1031985_1,2.0,0.0,,20.0,826,94591,12.975526,13:13,02:59,San Francisco Bay Area,...,0.0,2006.0,1.0,1.0,2.0,1.0,1.0,4.0,2.0,1.0
1031985_2,1.0,0.0,1.0,10.0,77,94589,5.125961,09:26,10:43,San Francisco Bay Area,...,0.0,1987.0,5.0,1.0,2.0,1.0,,,2.0,2.0
1031985_2,1.0,0.0,,13.0,963,94591,5.126981,10:56,02:59,San Francisco Bay Area,...,0.0,1987.0,5.0,1.0,2.0,1.0,,,2.0,2.0
1032036_1,3.0,2.0,1.0,20.0,415,92128,3.619057,08:35,15:30,Greater Los Angeles Area,...,0.0,2007.0,8.0,1.0,1.0,1.0,1.0,4.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7211560_4,2.0,1.0,1.0,45.0,286,95117,12.978730,08:09,12:55,San Francisco Bay Area,...,,,,,,,,,,
7211560_4,2.0,1.0,1.0,8.0,23,95117,2.240663,13:03,13:26,San Francisco Bay Area,...,,,,,,,,,,
7211560_4,2.0,1.0,,18.0,795,95148,12.284206,13:44,02:59,San Francisco Bay Area,...,,,,,,,,,,
7211862_3,3.0,0.0,1.0,120.0,118,95661,87.576838,17:20,19:18,Sacramento Area,...,,,,,,,,,,


In [130]:
# Persist the merged table. to_csv writes the index by default, so 'hh_person_id' becomes
# the first column of the file; pass index_col='hh_person_id' when reading it back.
final_df.to_csv('Final_Merged_df.csv')

In [131]:
# Each row of final_df is one trip, so the size of every 'hh_person_id' group is the
# number of trips made by that agent; the mean of those sizes is the average per agent.
trips_per_agent = final_df.groupby(level='hh_person_id').size()
average_trips_per_agent = trips_per_agent.mean()

print(f"Average number of trips per agent: {average_trips_per_agent:.2f}")

Average number of trips per agent: 4.18


In [132]:
import pandas as pd

def get_missing_value_summary(df, threshold=10):
    """
    Report the columns of a DataFrame whose share of missing values reaches a threshold.

    Parameters:
        df (pd.DataFrame): The DataFrame to inspect.
        threshold (float): Lowest missing-value percentage (inclusive) for a column
            to be reported (default is 10%).

    Returns:
        pd.DataFrame: Two columns, 'Column' and 'Missing Percentage', ordered from the
        highest to the lowest percentage. The row labels are the positions the
        columns had in the report before sorting.
    """
    # Share of null cells per column, expressed in percent
    pct_missing = df.isna().mean().mul(100)

    # Retain the columns at or above the requested percentage
    flagged = pct_missing.loc[pct_missing.ge(threshold)]

    # Assemble the two-column report, then order it by severity
    report = pd.DataFrame(
        {
            'Column': flagged.index,
            'Missing Percentage': flagged.values,
        }
    )
    return report.sort_values(by='Missing Percentage', ascending=False)

In [133]:
# Summarize the columns of final_df in which at least 10% of the values are missing.
missing_summary_df = get_missing_value_summary(final_df, threshold=10)

from IPython.display import display

# Display the DataFrame in the notebook
display(missing_summary_df)

Unnamed: 0,Column,Missing Percentage
23,SMODE,77.597245
22,SZIP,77.595866
21,SCITY,77.595866
20,SCHOL,75.994167
18,TRNSUB,73.437171
8,WKSTAT,69.933498
11,WZIP,54.228161
15,WMODE,54.191618
2,Assigned travel day,47.997187
13,HOURS,45.90043
