In [46]:
import pandas as pd
import numpy as np
import pickle

PREPROCESSING
---

1. Collisions dataset preprocessing
---

In [47]:
collisions = pd.read_csv('collisions.csv')
print('The collisions raw dataset shape is: ', collisions.shape)
print('The collisions raw dataset columns are: ', collisions.columns)

  collisions = pd.read_csv('collisions.csv')


The collisions raw dataset shape is:  (2041789, 29)
The collisions raw dataset columns are:  Index(['CRASH DATE', 'CRASH TIME', 'BOROUGH', 'ZIP CODE', 'LATITUDE',
       'LONGITUDE', 'LOCATION', 'ON STREET NAME', 'CROSS STREET NAME',
       'OFF STREET NAME', 'NUMBER OF PERSONS INJURED',
       'NUMBER OF PERSONS KILLED', 'NUMBER OF PEDESTRIANS INJURED',
       'NUMBER OF PEDESTRIANS KILLED', 'NUMBER OF CYCLIST INJURED',
       'NUMBER OF CYCLIST KILLED', 'NUMBER OF MOTORIST INJURED',
       'NUMBER OF MOTORIST KILLED', 'CONTRIBUTING FACTOR VEHICLE 1',
       'CONTRIBUTING FACTOR VEHICLE 2', 'CONTRIBUTING FACTOR VEHICLE 3',
       'CONTRIBUTING FACTOR VEHICLE 4', 'CONTRIBUTING FACTOR VEHICLE 5',
       'COLLISION_ID', 'VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2',
       'VEHICLE TYPE CODE 3', 'VEHICLE TYPE CODE 4', 'VEHICLE TYPE CODE 5'],
      dtype='object')


In [48]:
# Drop the columns we are not using

columns_to_remove = ['LOCATION', 'ON STREET NAME', 'CROSS STREET NAME',
       'OFF STREET NAME', 'NUMBER OF PEDESTRIANS INJURED',
       'NUMBER OF PEDESTRIANS KILLED', 'CONTRIBUTING FACTOR VEHICLE 1',
       'CONTRIBUTING FACTOR VEHICLE 2', 'CONTRIBUTING FACTOR VEHICLE 3',
       'CONTRIBUTING FACTOR VEHICLE 4', 'CONTRIBUTING FACTOR VEHICLE 5']
collisions = collisions.drop(columns=columns_to_remove)
print('The collisions raw dataset columns are: ', collisions.columns)

The collisions raw dataset columns are:  Index(['CRASH DATE', 'CRASH TIME', 'BOROUGH', 'ZIP CODE', 'LATITUDE',
       'LONGITUDE', 'NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED',
       'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED',
       'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED',
       'COLLISION_ID', 'VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2',
       'VEHICLE TYPE CODE 3', 'VEHICLE TYPE CODE 4', 'VEHICLE TYPE CODE 5'],
      dtype='object')


In [49]:
# Affected persons (killed + injured) per collision
collisions['PERSONS_AFFECTED'] = collisions['NUMBER OF PERSONS INJURED'] + collisions['NUMBER OF PERSONS KILLED']
# Affected persons driving a 2-wheel vehicle in a collision -> Will help to visualize the proportionality
collisions['2-WHEELS_AFFECTED'] = collisions['NUMBER OF CYCLIST INJURED'] + collisions['NUMBER OF CYCLIST KILLED'] + collisions['NUMBER OF MOTORIST INJURED'] + collisions['NUMBER OF MOTORIST KILLED']

columns_to_remove = ['NUMBER OF PERSONS INJURED', 'NUMBER OF PERSONS KILLED',
       'NUMBER OF CYCLIST INJURED', 'NUMBER OF CYCLIST KILLED',
       'NUMBER OF MOTORIST INJURED', 'NUMBER OF MOTORIST KILLED']
collisions = collisions.drop(columns=columns_to_remove)
print('The collisions raw dataset columns are: ', collisions.columns)

The collisions raw dataset columns are:  Index(['CRASH DATE', 'CRASH TIME', 'BOROUGH', 'ZIP CODE', 'LATITUDE',
       'LONGITUDE', 'COLLISION_ID', 'VEHICLE TYPE CODE 1',
       'VEHICLE TYPE CODE 2', 'VEHICLE TYPE CODE 3', 'VEHICLE TYPE CODE 4',
       'VEHICLE TYPE CODE 5', 'PERSONS_AFFECTED', '2-WHEELS_AFFECTED'],
      dtype='object')


In [50]:
# Need to cathegorize the crash time of a collision

# Convert to datetime format
collisions['CRASH TIME'] = pd.to_datetime(collisions['CRASH TIME'])

collisions['HOUR'] = collisions['CRASH TIME'].dt.hour

# Function to categorize hours into three ranges of time
def categorize_time(hour):
    if 0 <= hour < 8:
        return 'Night'
    elif 8 <= hour < 16:
        return 'Day'
    else:
        return 'Evening'

# Apply the categorization function to create a new column
collisions['CAT_CRASH_TIME'] = collisions['HOUR'].apply(categorize_time)

columns_to_remove = ['CRASH TIME', 'HOUR']
collisions = collisions.drop(columns=columns_to_remove)
print('The collisions raw dataset columns are: ', collisions.columns)

The collisions raw dataset columns are:  Index(['CRASH DATE', 'BOROUGH', 'ZIP CODE', 'LATITUDE', 'LONGITUDE',
       'COLLISION_ID', 'VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2',
       'VEHICLE TYPE CODE 3', 'VEHICLE TYPE CODE 4', 'VEHICLE TYPE CODE 5',
       'PERSONS_AFFECTED', '2-WHEELS_AFFECTED', 'CAT_CRASH_TIME'],
      dtype='object')


In [51]:
def filter_collisions(collisions):
    """
        Filters the collisions dataset for the years 2018 and 2020.
    """

    # Converts the 'CRASH DATE' column to a datetime format.
    collisions['CRASH DATE'] = pd.to_datetime(collisions['CRASH DATE'], format='%m/%d/%Y')

    # Filters the datasets for the summer months of 2018 and 2020.
    def filter_summer_collisions(data, year):
        return data[
            ((data['CRASH DATE'].dt.year == year) & (data['CRASH DATE'].dt.month >= 6) & (data['CRASH DATE'].dt.month <= 8))
        ]

    summer_collisions_2018 = filter_summer_collisions(collisions, 2018)
    summer_collisions_2020 = filter_summer_collisions(collisions, 2020)

    return summer_collisions_2018, summer_collisions_2020

In [52]:
# Gets the summer collisions for 2018 and 2020 datasets.
summer_collisions_2018, summer_collisions_2020 = filter_collisions(collisions)
summer_collisions_2018['SUMMER'] = '2018'
summer_collisions_2020['SUMMER'] = '2020'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  summer_collisions_2018['SUMMER'] = '2018'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  summer_collisions_2020['SUMMER'] = '2020'


In [53]:
# Differentiates the information contained in the columns of the dataset into the following categories:
# Columns that identify collision in time.
time_columns = [
    'CRASH DATE',
    'CAT_CRASH_TIME',
    'BOROUGH',
    'ZIP CODE',
    'COLLISION_ID']

# Columns about coordinate location of the collision.
coordinates = [
    'LATITUDE',
    'LONGITUDE'
]

In [54]:
def manage_missing_values(summer_collisions):
    """
        Processes the datasets with specified criteria on information category selection (column selection).
    """

    # Replaces 'Unspecified' with NaN values.
    summer_collisions = summer_collisions.replace('Unspecified', np.nan)

    # Drops rows with missing values in categories time_columns, coordinates.
    summer_collisions = summer_collisions.dropna(subset=time_columns, how='any')
    summer_collisions = summer_collisions.dropna(subset=coordinates, how='any')

    return summer_collisions

In [55]:
# Gets the minimum missing values dataframe, with all information needed to conduct the visualizations, for both summer_collisions_2018 and summer_collisions_2020.
summer_collisions_2018 = manage_missing_values(summer_collisions_2018)
summer_collisions_2020 = manage_missing_values(summer_collisions_2020)

In [56]:
# To manage the vehicle types and their missing values
# Concatenating values of columns 1 to 5 into a single column while excluding NaN values
summer_collisions_2018['VEHICLE TYPE'] = summer_collisions_2018.iloc[:,6:11].apply(lambda row: [value for value in row.tolist() if not pd.isnull(value)], axis=1)
summer_collisions_2020['VEHICLE TYPE'] = summer_collisions_2020.iloc[:,6:11].apply(lambda row: [value for value in row.tolist() if not pd.isnull(value)], axis=1)


# Drop the original columns
summer_collisions_2018 = summer_collisions_2018.drop(columns=['VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2',
                      'VEHICLE TYPE CODE 3', 'VEHICLE TYPE CODE 4', 'VEHICLE TYPE CODE 5'])
summer_collisions_2020 = summer_collisions_2020.drop(columns=['VEHICLE TYPE CODE 1', 'VEHICLE TYPE CODE 2',
                      'VEHICLE TYPE CODE 3', 'VEHICLE TYPE CODE 4', 'VEHICLE TYPE CODE 5'])

In [57]:
def add_daytype(summer_collisions):
    """
        Adds a column to the datasets indicating wheter is weekday or weekend.
    """

    # Adds a new column indicating wheter is weekday (1) or weekend (0), recoded as 'Weekday' and 'Weekend' respectively.
    summer_collisions['DAY TYPE'] = (summer_collisions['CRASH DATE'].dt.dayofweek // 5 == 1).astype(int)
    summer_collisions['DAY TYPE'] = summer_collisions['DAY TYPE'].map({0: 'Weekday', 1: 'Weekend'})

    return summer_collisions

In [58]:
# Adds the day type column into both summer_collisions_2018 and summer_collisions_2020.
summer_collisions_2018 = add_daytype(summer_collisions_2018)
summer_collisions_2020 = add_daytype(summer_collisions_2020)

In [59]:
summer_collisions_2018['ZIP CODE'] = pd.to_numeric(summer_collisions_2018['ZIP CODE'], errors='coerce', downcast='integer')
summer_collisions_2020['ZIP CODE'] = pd.to_numeric(summer_collisions_2020['ZIP CODE'], errors='coerce', downcast='integer')

In [60]:
# We don't actually care about the entire date, just the month of the summer
summer_collisions_2018['MONTH_YEAR'] = summer_collisions_2018['CRASH DATE'].dt.strftime('%B %Y')
summer_collisions_2020['MONTH_YEAR'] = summer_collisions_2020['CRASH DATE'].dt.strftime('%B %Y')

In [61]:
# Reset index for summer_collisions_2018 dataset.
summer_collisions_2018.reset_index(drop=True, inplace=True)

# Reset index for summer_collisions_2020 dataset.
summer_collisions_2020.reset_index(drop=True, inplace=True)

In [62]:
summer_collisions_2018[:10]

Unnamed: 0,CRASH DATE,BOROUGH,ZIP CODE,LATITUDE,LONGITUDE,COLLISION_ID,PERSONS_AFFECTED,2-WHEELS_AFFECTED,CAT_CRASH_TIME,SUMMER,VEHICLE TYPE,DAY TYPE,MONTH_YEAR
0,2018-08-30,BROOKLYN,11204,40.61662,-73.99972,3971526,0.0,0,Day,2018,"[Sedan, Sedan]",Weekday,August 2018
1,2018-08-31,BRONX,10472,40.827168,-73.870125,3973140,0.0,0,Night,2018,"[Sedan, Sedan]",Weekday,August 2018
2,2018-08-28,BROOKLYN,11211,40.70654,-73.95041,3969590,0.0,0,Evening,2018,"[Station Wagon/Sport Utility Vehicle, Station ...",Weekday,August 2018
3,2018-08-29,QUEENS,11694,40.586067,-73.82263,3970294,1.0,1,Day,2018,"[Station Wagon/Sport Utility Vehicle, Sedan]",Weekday,August 2018
4,2018-08-05,QUEENS,11370,40.762665,-73.8876,3989501,1.0,1,Night,2018,[Sedan],Weekend,August 2018
5,2018-08-05,MANHATTAN,10075,40.77364,-73.95986,3955175,1.0,1,Evening,2018,"[Taxi, Station Wagon/Sport Utility Vehicle]",Weekend,August 2018
6,2018-08-28,BRONX,10454,40.805016,-73.92109,3969257,0.0,0,Day,2018,"[Station Wagon/Sport Utility Vehicle, Sedan]",Weekday,August 2018
7,2018-08-10,QUEENS,11416,40.682415,-73.84451,3960223,0.0,0,Evening,2018,"[Sedan, Sedan]",Weekday,August 2018
8,2018-08-18,STATEN ISLAND,10305,40.589695,-74.067276,3963981,0.0,0,Evening,2018,[Sedan],Weekend,August 2018
9,2018-08-18,BROOKLYN,11214,40.59214,-73.98841,3963924,0.0,0,Evening,2018,"[Station Wagon/Sport Utility Vehicle, Station ...",Weekend,August 2018


In [63]:
def save(data, file_path):
    """
        Saves the dataset as a pickle file.
    """

    with open(file_path, 'wb') as file:
        pickle.dump(data, file)
    print(f"The dataset has been saved in pickle format at {file_path}.")

save(summer_collisions_2018, 'summer_collisions_2018.pkl')
save(summer_collisions_2020, 'summer_collisions_2020.pkl')

The dataset has been saved in pickle format at summer_collisions_2018.pkl.
The dataset has been saved in pickle format at summer_collisions_2020.pkl.


Weather dataset
---

In [65]:
# Loads the weather dataset.
weather = pd.read_csv('weather.csv')
print('The weather raw dataset shape is: ', weather.shape)
print('The weather raw dataset columns are: ', weather.columns)

The weather raw dataset shape is:  (90667, 66)
The weather raw dataset columns are:  Index(['STATION', 'NAME', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'DATE', 'AWND',
       'AWND_ATTRIBUTES', 'DAPR', 'DAPR_ATTRIBUTES', 'DASF', 'DASF_ATTRIBUTES',
       'MDPR', 'MDPR_ATTRIBUTES', 'MDSF', 'MDSF_ATTRIBUTES', 'PGTM',
       'PGTM_ATTRIBUTES', 'PRCP', 'PRCP_ATTRIBUTES', 'PSUN', 'PSUN_ATTRIBUTES',
       'SNOW', 'SNOW_ATTRIBUTES', 'SNWD', 'SNWD_ATTRIBUTES', 'TAVG',
       'TAVG_ATTRIBUTES', 'TMAX', 'TMAX_ATTRIBUTES', 'TMIN', 'TMIN_ATTRIBUTES',
       'TOBS', 'TOBS_ATTRIBUTES', 'TSUN', 'TSUN_ATTRIBUTES', 'WDF2',
       'WDF2_ATTRIBUTES', 'WDF5', 'WDF5_ATTRIBUTES', 'WESD', 'WESD_ATTRIBUTES',
       'WESF', 'WESF_ATTRIBUTES', 'WSF2', 'WSF2_ATTRIBUTES', 'WSF5',
       'WSF5_ATTRIBUTES', 'WT01', 'WT01_ATTRIBUTES', 'WT02', 'WT02_ATTRIBUTES',
       'WT03', 'WT03_ATTRIBUTES', 'WT04', 'WT04_ATTRIBUTES', 'WT05',
       'WT05_ATTRIBUTES', 'WT06', 'WT06_ATTRIBUTES', 'WT08', 'WT08_ATTRIBUTES',
       'WT

  weather = pd.read_csv('weather.csv')


In [66]:
def filter_weather(weather):
    """
        Filters the weather dataset for the years 2018 and 2020.
    """
    
    # Convert 'DATE' column to datetime format.
    weather['DATE'] = pd.to_datetime(weather['DATE'], errors='coerce')

    # Filters the datasets for the summer months of 2018 and 2020.
    def filter_summer_weather(data, year):
        return data[
            ((data['DATE'].dt.year == year) & (data['DATE'].dt.month >= 6) & (data['DATE'].dt.month <= 8))
        ]

    summer_weather_2018 = filter_summer_weather(weather, 2018)
    summer_weather_2020 = filter_summer_weather(weather, 2020)

    return summer_weather_2018, summer_weather_2020

# Gets the summer weather for 2018 and 2020 datasets.
summer_weather_2018, summer_weather_2020 = filter_weather(weather)

In [67]:
# Differentiates the information contained in the columns of the dataset into the following categories:
# Columns that identify the weather station and the date of the observation.
observation_columns = [
    'STATION',
    'NAME',
    'DATE'
]

# Columns about coordinate location of the observation.
coordinates = [
    'LATITUDE',
    'LONGITUDE',
    'ELEVATION'
]

# Columns about the weather conditions (snowfall).
snowfall_columns = [
    'SNOW', # Snowfall.
    'SNWD'  # Snow depth.
]

# Columns about the weather conditions (temperature).
temperature_columns = [
    'TAVG', # Average temperature.
    'TMAX', # Maximum temperature.
    'TMIN'  # Minimum temperature.
]

# Columns about the weather conditions (wind).
wind_columns = [
    'AWND', # Average daily wind speed.
    'WDF2', # Direction of the fastest 2-minute wind.
    'WDF5', # Direction of the fastest 5-second wind.
    'WSF2', # Fastest 2-minute wind speed.
    'WSF5'  # Fastest 5-second wind speed.
]

# Columns about the weather conditions (precipitation).
# Consider only 'PRCP' column.

In [68]:
def filter_columns_of_interest(summer_weather, columns_of_interest):
    """
        Filters the summer collisions to keep only the columns of interest.
    """

    # Gets the columns of no interest in.
    columns_to_drop = [col for col in summer_weather.columns if col not in columns_of_interest]

    # Drops the columns not in columns_of_interest.
    summer_weather = summer_weather.drop(columns=columns_to_drop)

    return summer_weather

In [69]:
# Defines columns of interest.
columns_of_interest = observation_columns + coordinates + snowfall_columns + temperature_columns + wind_columns + ['PRCP']

# Filter columns of interest for both summer_collisions_2018 and summer_collisions_2020.
summer_weather_2018 = filter_columns_of_interest(summer_weather_2018, columns_of_interest)
summer_weather_2020 = filter_columns_of_interest(summer_weather_2020, columns_of_interest)

In [70]:
summer_weather_2020.columns

Index(['STATION', 'NAME', 'LATITUDE', 'LONGITUDE', 'ELEVATION', 'DATE', 'AWND',
       'PRCP', 'SNOW', 'SNWD', 'TAVG', 'TMAX', 'TMIN', 'WDF2', 'WDF5', 'WSF2',
       'WSF5'],
      dtype='object')

In [71]:
def replace_nan_with_zero(summer_weather):
    # Replaces all NaN values with 0 values.
   summer_weather.fillna(0, inplace=True)
   return summer_weather

summer_weather_2018 = replace_nan_with_zero(summer_weather_2018)
summer_weather_2020 = replace_nan_with_zero(summer_weather_2020)

In [72]:
summer_weather_2020.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,AWND,PRCP,SNOW,SNWD,TAVG,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5
880,USC00280907,"BOONTON 1 SE, NJ US",40.89174,-74.39635,85.3,2020-06-01,0.0,0.0,0.0,0.0,0.0,20.0,8.9,0.0,0.0,0.0,0.0
881,USC00280907,"BOONTON 1 SE, NJ US",40.89174,-74.39635,85.3,2020-06-02,0.0,0.0,0.0,0.0,0.0,21.1,8.9,0.0,0.0,0.0,0.0
882,USC00280907,"BOONTON 1 SE, NJ US",40.89174,-74.39635,85.3,2020-06-03,0.0,18.5,0.0,0.0,0.0,22.8,10.0,0.0,0.0,0.0,0.0
883,USC00280907,"BOONTON 1 SE, NJ US",40.89174,-74.39635,85.3,2020-06-04,0.0,5.1,0.0,0.0,0.0,26.1,15.6,0.0,0.0,0.0,0.0
884,USC00280907,"BOONTON 1 SE, NJ US",40.89174,-74.39635,85.3,2020-06-05,0.0,9.1,0.0,0.0,0.0,28.9,16.7,0.0,0.0,0.0,0.0


In [80]:
def encode_weather_conditions(summer_weather):
    # Creates new columns to define labels for weather conditions
    summer_weather['WINDY'] = (summer_weather['AWND'].gt(0) | summer_weather['WDF2'].gt(0) | summer_weather['WDF5'].gt(0) | summer_weather['WSF2'].gt(0) | summer_weather['WSF5'].gt(0))
    summer_weather['RAINY'] = summer_weather['PRCP'].gt(0)  # Considers only non-zero precipitation as rainy
    summer_weather['SNOWY'] = (summer_weather['SNOW'].gt(0) | summer_weather['SNWD'].gt(0))

    return summer_weather

In [74]:
summer_weather_2018 = encode_weather_conditions(summer_weather_2018)
summer_weather_2020 = encode_weather_conditions(summer_weather_2020)

In [75]:
# Averages = 0 doesn't inspire confidence, s.t. if TAVG or AWND = 0 -> remove
summer_weather_2018 = summer_weather_2018[(summer_weather_2018['TAVG'] != 0) & (summer_weather_2018['AWND'] != 0)]
summer_weather_2020 = summer_weather_2020[(summer_weather_2020['TAVG'] != 0) & (summer_weather_2020['AWND'] != 0)]

In [76]:
summer_weather_2020.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,AWND,PRCP,SNOW,SNWD,...,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,WINDY,RAINY,SNOWY,SUNNY
3834,USW00014732,"LAGUARDIA AIRPORT, NY US",40.77945,-73.88027,3.0,2020-06-01,4.5,0.0,0.0,0.0,...,22.2,12.2,320.0,320.0,9.8,13.4,True,False,False,False
3835,USW00014732,"LAGUARDIA AIRPORT, NY US",40.77945,-73.88027,3.0,2020-06-02,3.7,0.0,0.0,0.0,...,24.4,17.2,270.0,250.0,8.1,10.3,True,False,False,False
3836,USW00014732,"LAGUARDIA AIRPORT, NY US",40.77945,-73.88027,3.0,2020-06-03,4.0,4.8,0.0,0.0,...,27.8,17.2,350.0,250.0,10.7,14.3,True,True,False,False
3837,USW00014732,"LAGUARDIA AIRPORT, NY US",40.77945,-73.88027,3.0,2020-06-04,3.2,3.6,0.0,0.0,...,31.1,20.6,210.0,210.0,8.1,10.7,True,True,False,False
3838,USW00014732,"LAGUARDIA AIRPORT, NY US",40.77945,-73.88027,3.0,2020-06-05,3.9,6.9,0.0,0.0,...,28.3,20.6,170.0,170.0,8.1,10.3,True,True,False,False


In [77]:
summer_weather_2018.head()

Unnamed: 0,STATION,NAME,LATITUDE,LONGITUDE,ELEVATION,DATE,AWND,PRCP,SNOW,SNWD,...,TMAX,TMIN,WDF2,WDF5,WSF2,WSF5,WINDY,RAINY,SNOWY,SUNNY
3103,USW00014732,"LAGUARDIA AIRPORT, NY US",40.77945,-73.88027,3.0,2018-06-01,2.9,1.3,0.0,0.0,...,27.8,17.2,170.0,50.0,5.8,8.9,True,True,False,False
3104,USW00014732,"LAGUARDIA AIRPORT, NY US",40.77945,-73.88027,3.0,2018-06-02,4.1,0.3,0.0,0.0,...,30.6,19.4,110.0,100.0,10.3,12.1,True,True,False,False
3105,USW00014732,"LAGUARDIA AIRPORT, NY US",40.77945,-73.88027,3.0,2018-06-03,5.8,7.6,0.0,0.0,...,19.4,11.7,100.0,90.0,10.7,12.5,True,True,False,False
3106,USW00014732,"LAGUARDIA AIRPORT, NY US",40.77945,-73.88027,3.0,2018-06-04,4.4,8.9,0.0,0.0,...,22.2,11.1,50.0,50.0,8.9,10.3,True,True,False,False
3107,USW00014732,"LAGUARDIA AIRPORT, NY US",40.77945,-73.88027,3.0,2018-06-05,4.2,0.0,0.0,0.0,...,25.0,15.0,290.0,290.0,10.7,14.8,True,False,False,False


In [78]:
# Reset index for summer_collisions_2018 dataset.
summer_weather_2018.reset_index(drop=True, inplace=True)

# Reset index for summer_collisions_2020 dataset.
summer_weather_2020.reset_index(drop=True, inplace=True)

In [79]:
def save(data, file_path):
    """
        Saves the dataset as a pickle file.
    """

    with open(file_path, 'wb') as file:
        pickle.dump(data, file)
    print(f"The dataset has been saved in pickle format at {file_path}.")

save(summer_weather_2018, 'summer_weather_2018.pkl')
save(summer_weather_2020, 'summer_weather_2020.pkl')

The dataset has been saved in pickle format at summer_weather_2018.pkl.
The dataset has been saved in pickle format at summer_weather_2020.pkl.
