Import data from US_Accidents_March23_1_percent.csv, clean it, and export it to new CSV files.  
One file has every row containing a NaN value deleted: US_Accidents_March23_1_percent_cleaned.csv  
Another file has all NaN values converted to 0: One_Percent_Null_To_Zero.csv  
We can use US_Accidents_March23_1_percent_cleaned.csv for our graphs and One_Percent_Null_To_Zero.csv if we want all data.  
Most of the NaN values were found in latitude and longitude cells, so it's OK if we delete those records.

# Import Licensed_drivers_By_State.csv, clean it, and build a DataFrame.

In [None]:
# Import Dependencies
import os
import pandas as pd
from pathlib import Path


In [None]:
# Load Licensed_drivers_By_State.csv into a DataFrame and preview the first rows.
Drivers_path = Path("data/Licensed_drivers_By_State.csv")
Drivers_df = pd.read_csv(filepath_or_buffer=Drivers_path)
Drivers_df.head(n=5)

In [None]:
# Lookup table: two-letter state abbreviation -> full state name.
# The reversed table (full name -> abbreviation) is what lets both CSV files
# share the same abbreviated State column.
# NOTE(review): 'DC' (District of Columbia) is not listed here, so a lookup of
# that name in the reversed table finds nothing -- confirm this is intended.
state_mapping = {
    'AL': 'Alabama',
    'AK': 'Alaska',
    'AZ': 'Arizona',
    'AR': 'Arkansas',
    'CA': 'California',
    'CO': 'Colorado',
    'CT': 'Connecticut',
    'DE': 'Delaware',
    'FL': 'Florida',
    'GA': 'Georgia',
    'HI': 'Hawaii',
    'ID': 'Idaho',
    'IL': 'Illinois',
    'IN': 'Indiana',
    'IA': 'Iowa',
    'KS': 'Kansas',
    'KY': 'Kentucky',
    'LA': 'Louisiana',
    'ME': 'Maine',
    'MD': 'Maryland',
    'MA': 'Massachusetts',
    'MI': 'Michigan',
    'MN': 'Minnesota',
    'MS': 'Mississippi',
    'MO': 'Missouri',
    'MT': 'Montana',
    'NE': 'Nebraska',
    'NV': 'Nevada',
    'NH': 'New Hampshire',
    'NJ': 'New Jersey',
    'NM': 'New Mexico',
    'NY': 'New York',
    'NC': 'North Carolina',
    'ND': 'North Dakota',
    'OH': 'Ohio',
    'OK': 'Oklahoma',
    'OR': 'Oregon',
    'PA': 'Pennsylvania',
    'RI': 'Rhode Island',
    'SC': 'South Carolina',
    'SD': 'South Dakota',
    'TN': 'Tennessee',
    'TX': 'Texas',
    'UT': 'Utah',
    'VT': 'Vermont',
    'VA': 'Virginia',
    'WA': 'Washington',
    'WV': 'West Virginia',
    'WI': 'Wisconsin',
    'WY': 'Wyoming',
}

# Invert the table by pairing each full name with its abbreviation.
reverse_state_mapping = dict(zip(state_mapping.values(), state_mapping.keys()))

In [None]:
# Replace each full state name with its two-letter abbreviation.
# Names that are absent from reverse_state_mapping come out as NaN.
state_abbreviations = Drivers_df['State'].map(reverse_state_mapping)
Drivers_df['State'] = state_abbreviations
Drivers_df.head()

In [None]:
# Sum the 'Drivers' values for every (State, Year) pair.
drivers_per_state_year = Drivers_df.groupby(['State', 'Year'])['Drivers'].sum()

# Turn the group keys back into columns and name the summed column
# 'Total_Drivers' directly, instead of renaming it afterwards.
state_population = drivers_per_state_year.reset_index(name='Total_Drivers')

state_population

In [None]:
# Keep only the years 2016-2023 (both ends inclusive) to match the accidents data.
in_accident_years = Drivers_df['Year'].between(2016, 2023)
Drivers_filtered = Drivers_df[in_accident_years]
Drivers_filtered.head()

In [None]:
# Show column dtypes and non-null counts for the year-filtered drivers data.
Drivers_filtered.info()

In [None]:
# Select only the 2018 rows. Each state has several age groups, so it appears
# on multiple rows.
is_2018 = Drivers_df['Year'] == 2018
Drivers_2018 = Drivers_df.loc[is_2018]
Drivers_2018.head()

In [None]:
# Total number of drivers per state.
# Only 2018 is used because it is the most complete year; the driver count is
# assumed to stay roughly the same from year to year. The totals are needed to
# calculate accidents per population.
# as_index=False keeps 'State' as a regular column, so the result already has
# exactly the columns ['State', 'Drivers'].
Drivers_by_State = Drivers_2018.groupby('State', as_index=False)['Drivers'].sum()

Drivers_by_State.head()

In [None]:
# Write the per-state driver totals to a CSV file (without the index column).
# Use this data for SQL.
Drivers_by_State.to_csv('data/Licensed_Drivers_clean.csv', index=False)

## Working on Accidents data

In [None]:
# Load the 1-percent accidents sample into a DataFrame and preview the first rows.
US_Accidents_path = Path("data/US_Accidents_March23_1_percent.csv")
US_Accidents_df = pd.read_csv(filepath_or_buffer=US_Accidents_path)
US_Accidents_df.head(n=5)

In [None]:
# Export the raw accidents sample as JSON.
# Source CSV and destination JSON locations.
csv_file_path = 'data/US_Accidents_March23_1_percent.csv'
json_file_path = 'data/US_Accidents_March23_1_percent_JSON.json'

# Read the CSV and write it back out as one JSON record per line.
df = pd.read_csv(csv_file_path)
df.to_json(path_or_buf=json_file_path, orient='records', lines=True)

In [None]:
# List all columns of the raw accidents data with their dtypes and non-null counts.
US_Accidents_df.info()

In [None]:
# Columns that are not needed and are removed from the working copy.
columns_to_delete = [
    'Source',
    'End_Lat',
    'End_Lng',
    'Distance(mi)',
    'Country',
    'Weather_Timestamp',
    'Civil_Twilight',
    'Nautical_Twilight',
    'Astronomical_Twilight',
]

# Work on a copy so the originally loaded US_Accidents_df stays untouched.
US_Accidents_df_copy = US_Accidents_df.copy()
US_Accidents_df_copy = US_Accidents_df_copy.drop(columns_to_delete, axis='columns')
US_Accidents_df_copy.head()

In [None]:
# Drop every row that has a NaN in at least one column.
US_Accidents_cleaned = US_Accidents_df_copy.dropna(axis=0, how='any')
US_Accidents_cleaned.head()

In [None]:
# Show column dtypes and non-null counts after the NaN rows were dropped.
US_Accidents_cleaned.info()

In [None]:
# Write the NaN-free accidents data to a new CSV file (without the index column).
US_Accidents_cleaned.to_csv("data/US_Accidents_March23_1_percent_cleaned.csv", index=False)

In [None]:
# Export the cleaned accidents data as JSON.
# Source CSV and destination JSON locations.
csv_file_path = 'data/US_Accidents_March23_1_percent_cleaned.csv'
json_file_path = 'data/US_Accidents_March23_1_percent_cleaned.json'

# Read the cleaned CSV and write it back out as one JSON record per line.
df = pd.read_csv(csv_file_path)
df.to_json(path_or_buf=json_file_path, orient='records', lines=True)

In [None]:
# Alternative to dropping rows: dropping NaN removes the whole row, so this
# variant keeps every row and replaces each NaN with 0 instead.
US_Accidents_Null_To_Zero = US_Accidents_df_copy.copy().fillna(value=0)
US_Accidents_Null_To_Zero.head()



In [None]:
# Show column dtypes and non-null counts after NaN values were replaced with 0.
US_Accidents_Null_To_Zero.info()

In [None]:
# Write the zero-filled accidents data (NaN replaced with 0) to a new CSV file,
# without the index column.
US_Accidents_Null_To_Zero.to_csv("data/One_Percent_Null_To_Zero.csv", index=False)

In [None]:
# Export the zero-filled accidents data as JSON.
# Load the zero-filled CSV. The name must match the case used when the file was
# written ("One_Percent_..."); the previous lowercase "percent" spelling raises
# FileNotFoundError on case-sensitive filesystems.
csv_file_path = 'data/One_Percent_Null_To_Zero.csv'
df = pd.read_csv(csv_file_path)

# Convert to JSON, one record per line. The output file name is kept as it was
# so anything that already reads this JSON file keeps working.
json_file_path = 'data/One_percent_Null_To_Zero_JSON.json'
df.to_json(json_file_path, orient='records', lines=True)

In [None]:
# Reload the zero-filled accidents data and append two derived columns:
#   Date - Start_Time converted to datetime format
#   Year - the year component of Date
accidents_df = pd.read_csv("data/One_Percent_Null_To_Zero.csv")
start_times = pd.to_datetime(accidents_df['Start_Time'], format='mixed')
accidents_df['Date'] = start_times
accidents_df['Year'] = accidents_df['Date'].dt.year

print(accidents_df.columns)
accidents_df.head()

In [None]:
# Write the accidents data, including the added Date and Year columns, to a CSV
# file (without the index column). Use this data for SQL.
accidents_df.to_csv('data/Accidents.csv', index=False)