In [1]:
# Import 3rd party libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Configure Notebook
# IPython magic: draw matplotlib figures inline in the cell output.
%matplotlib inline
# Plot appearance: matplotlib 'fivethirtyeight' style sheet + seaborn "notebook" context.
plt.style.use('fivethirtyeight')
sns.set_context("notebook")
# NOTE(review): with no category/module filter this silences every warning for the
# rest of the session, including pandas dtype and deprecation warnings — confirm
# that a blanket suppression is really wanted.
import warnings
warnings.filterwarnings('ignore')

#### Open the merged dataset of the HH, PER and weather datasets.

In [2]:
# Load the already-merged HH + PER + weather table; the relative path is resolved
# against the notebook's working directory.
MERGED_WEATHER_CSV = 'merged_weather.csv'
merged_weather = pd.read_csv(MERGED_WEATHER_CSV)

#### Open the cleaned VEH dataset

In [3]:
# Load the cleaned VEH table; the relative path is resolved against the
# notebook's working directory.
CLEAN_VEH_CSV = 'cleaned_veh_df.csv'
clean_veh = pd.read_csv(CLEAN_VEH_CSV)

In [4]:
# Preview the first five rows of the vehicle table
clean_veh.head(n=5)

Unnamed: 0.1,Unnamed: 0,hh_veh_id,Year of vehicle,Vehicle body type,Primary fuel type,Vehicle acquired,Vehicle ownership type,Vehicle transmission type,Vehicle drive type,Vehicle cylinder count,Vehicle type,was vehicle used on travel day,reason why not
0,0,1031985_1,2006.0,1.0,1.0,2.0,1.0,1.0,2.0,4.0,2.0,1,
1,1,1031985_2,1987.0,5.0,1.0,2.0,1.0,,,,2.0,2,
2,2,1032036_1,2007.0,8.0,1.0,1.0,1.0,1.0,1.0,4.0,2.0,1,
3,3,1032053_1,2001.0,1.0,1.0,2.0,1.0,1.0,1.0,2.0,2.0,1,
4,4,1032053_2,2003.0,2.0,1.0,2.0,1.0,1.0,3.0,5.0,2.0,1,


In [5]:
# Build a '<SAMPN>_<PERNO>' string key for every row. Each part is cast to int
# before being turned into text so the key carries no decimal suffix
# (e.g. '1046924_1').
# NOTE(review): the key is assembled from PERNO, yet it is named 'hh_veh_id' and
# is later matched against clean_veh's 'hh_veh_id' — confirm this pairing is the
# intended join logic.
# NOTE: this cell rebinds merged_weather to a frame without SAMPN/PERNO, so
# running it a second time without re-reading the CSV raises KeyError.
sampn_text = merged_weather['SAMPN'].astype(int).astype(str)
perno_text = merged_weather['PERNO'].astype(int).astype(str)
keyed = merged_weather.assign(hh_veh_id=sampn_text + '_' + perno_text)

# Remove the two key parts together with the 'Unnamed: 0' and 'ID' columns
trimmed = keyed.drop(columns=['SAMPN', 'PERNO', 'Unnamed: 0', 'ID'])

# Final column order: the date first, the key second, everything else unchanged
front = ['TDATE', 'hh_veh_id']
merged_weather = trimmed[front + [name for name in trimmed.columns if name not in front]]

In [6]:
# Preview the first five rows after the key was built and the columns reordered
merged_weather.head(n=5)

Unnamed: 0,TDATE,hh_veh_id,RELAT,GEND,AGE,AGEB,HISP,RACE1,RACE2,RACE3,...,weather_code (wmo code),temperature_2m_max (°C),temperature_2m_min (°C),temperature_2m_mean (°C),apparent_temperature_mean (°C),daylight_duration (s),precipitation_sum (mm),rain_sum (mm),snowfall_sum (cm),precipitation_hours (h)
0,2012-07-18,1046924_1,1.0,2.0,77.0,,2.0,1.0,,,...,51.0,24.3,16.2,21.0,21.1,50522.89,0.2,0.2,0.0,1.0
1,2012-07-18,1046924_2,2.0,1.0,77.0,,2.0,1.0,,,...,51.0,24.3,16.2,21.0,21.1,50522.89,0.2,0.2,0.0,1.0
2,2012-05-05,1047092_1,1.0,2.0,50.0,,2.0,1.0,,,...,0.0,22.2,7.6,15.2,12.1,50133.78,0.0,0.0,0.0,0.0
3,2012-05-05,1047092_2,2.0,1.0,51.0,,2.0,1.0,,,...,0.0,22.2,7.6,15.2,12.1,50133.78,0.0,0.0,0.0,0.0
4,2012-07-05,1048704_1,1.0,2.0,36.0,,1.0,1.0,,,...,0.0,26.1,13.7,19.6,20.1,51642.6,0.0,0.0,0.0,0.0


In [7]:
# Left-join the VEH columns onto merged_weather via 'hh_veh_id': every row of
# merged_weather is kept, and rows without a matching key in clean_veh get NaN in
# the vehicle columns. The key is then exposed under the name 'ID', and the rows
# are put in ascending TDATE order with a fresh 0..n-1 index (the TDATE values
# previewed earlier look like ISO 'YYYY-MM-DD' dates, for which ascending order
# is chronological).
# NOTE(review): the clean_veh preview appears to show an extra leading
# 'Unnamed: 0'-style column; nothing here removes it, so it is carried into the
# merged result — confirm whether it should be dropped before the join.
merged_df_weather_veh = (
    merged_weather
    .merge(clean_veh, how='left', on=['hh_veh_id'])
    .rename(columns={'hh_veh_id': 'ID'})
    .sort_values(by='TDATE')
    .reset_index(drop=True)
)

In [8]:
# Preview the first five rows of the joined, date-sorted table
merged_df_weather_veh.head(n=5)

Unnamed: 0,TDATE,ID,RELAT,GEND,AGE,AGEB,HISP,RACE1,RACE2,RACE3,...,Vehicle body type,Primary fuel type,Vehicle acquired,Vehicle ownership type,Vehicle transmission type,Vehicle drive type,Vehicle cylinder count,Vehicle type,was vehicle used on travel day,reason why not
0,2012-02-01,1138101_2,9.0,1.0,61.0,,2.0,1.0,,,...,,,,,,,,,,
1,2012-02-01,1120264_1,1.0,1.0,51.0,,9.0,99.0,,,...,1.0,1.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,
2,2012-02-01,1120264_2,2.0,2.0,51.0,,9.0,99.0,,,...,5.0,1.0,1.0,1.0,1.0,2.0,4.0,2.0,1.0,
3,2012-02-01,1120264_3,3.0,1.0,26.0,,2.0,99.0,,,...,1.0,1.0,1.0,1.0,2.0,1.0,2.0,2.0,2.0,2.0
4,2012-02-01,1120296_1,1.0,2.0,58.0,,1.0,97.0,,,...,,,,,,,,,,


In [9]:
# Write the joined table to CSV in the notebook's working directory.
# NOTE(review): to_csv is called with its defaults, so the row index is written as
# an extra unnamed first column (it comes back as 'Unnamed: 0' on read_csv).
# Passing index=False would avoid that, but check any reader that expects to
# drop 'Unnamed: 0' before changing it.
MERGED_WEATHER_VEH_CSV = 'merged_weather_VEH.csv'
merged_df_weather_veh.to_csv(MERGED_WEATHER_VEH_CSV)