# Import dependencies

In [1]:
# Dependencies
import pandas as pd
import numpy as np
import requests
from census import Census

# Census API Key
from config import api_key
#change year as needed
c = Census(api_key, year=2015)

# SQLAlchemy
from sqlalchemy import create_engine

# Kamran - Accidents from 2016-2018

# Store CSV into DataFrame

In [2]:
accident_csv_file = "Resources/US_Accidents_June20.csv"
accident_data_df = pd.read_csv(accident_csv_file)
accident_data_df.head()

Unnamed: 0,ID,Source,TMC,Severity,Start_Time,End_Time,Start_Lat,Start_Lng,End_Lat,End_Lng,...,Roundabout,Station,Stop,Traffic_Calming,Traffic_Signal,Turning_Loop,Sunrise_Sunset,Civil_Twilight,Nautical_Twilight,Astronomical_Twilight
0,A-1,MapQuest,201.0,3,2016-02-08 05:46:00,2016-02-08 11:00:00,39.865147,-84.058723,,,...,False,False,False,False,False,False,Night,Night,Night,Night
1,A-2,MapQuest,201.0,2,2016-02-08 06:07:59,2016-02-08 06:37:59,39.928059,-82.831184,,,...,False,False,False,False,False,False,Night,Night,Night,Day
2,A-3,MapQuest,201.0,2,2016-02-08 06:49:27,2016-02-08 07:19:27,39.063148,-84.032608,,,...,False,False,False,False,True,False,Night,Night,Day,Day
3,A-4,MapQuest,201.0,3,2016-02-08 07:23:34,2016-02-08 07:53:34,39.747753,-84.205582,,,...,False,False,False,False,False,False,Night,Day,Day,Day
4,A-5,MapQuest,201.0,2,2016-02-08 07:39:07,2016-02-08 08:09:07,39.627781,-84.188354,,,...,False,False,False,False,True,False,Day,Day,Day,Day


# state_df

In [3]:
state_csv_file = "Resources/state_csvData.csv"
state_df = pd.read_csv(state_csv_file)
state_df.head()

Unnamed: 0,State,Abbrev,Code
0,Alabama,Ala.,AL
1,Alaska,Alaska,AK
2,Arizona,Ariz.,AZ
3,Arkansas,Ark.,AR
4,California,Calif.,CA


In [4]:
# Rename columns to match accident df for future merging
state_df = state_df.drop(columns={"Abbrev"})
state_df = state_df.rename(columns={"State": "state_name",
                                   "Code": "State"})
state_df.head()

Unnamed: 0,state_name,State
0,Alabama,AL
1,Alaska,AK
2,Arizona,AZ
3,Arkansas,AR
4,California,CA


# accident_df

In [5]:
# view data before transformation
accident_data_df.nunique()

ID                       3513617
Source                         3
TMC                           21
Severity                       4
Start_Time               3200042
End_Time                 3246120
Start_Lat                1124695
Start_Lng                1113407
End_Lat                   375074
End_Lng                   383569
Distance(mi)               13476
Description              1780092
Number                     40365
Street                    176262
Side                           3
City                       11895
County                      1724
State                         49
Zipcode                   418780
Country                        1
Timezone                       4
Airport_Code                2001
Weather_Timestamp         546086
Temperature(F)               831
Wind_Chill(F)                974
Humidity(%)                  100
Pressure(in)                1022
Visibility(mi)                85
Wind_Direction                24
Wind_Speed(mph)              160
Precipitat

In [6]:
accident_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3513617 entries, 0 to 3513616
Data columns (total 49 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   ID                     object 
 1   Source                 object 
 2   TMC                    float64
 3   Severity               int64  
 4   Start_Time             object 
 5   End_Time               object 
 6   Start_Lat              float64
 7   Start_Lng              float64
 8   End_Lat                float64
 9   End_Lng                float64
 10  Distance(mi)           float64
 11  Description            object 
 12  Number                 float64
 13  Street                 object 
 14  Side                   object 
 15  City                   object 
 16  County                 object 
 17  State                  object 
 18  Zipcode                object 
 19  Country                object 
 20  Timezone               object 
 21  Airport_Code           object 
 22  Weather_Timestamp 

# Create new accident df

In [7]:
new_accident_data_df = accident_data_df[['ID', 'Start_Time', 'City', 'County', 'State', 'Zipcode', 'Street', 'Timezone', 'Temperature(F)', 'Visibility(mi)', 'Weather_Condition']].copy()
new_accident_data_df.head()

Unnamed: 0,ID,Start_Time,City,County,State,Zipcode,Street,Timezone,Temperature(F),Visibility(mi),Weather_Condition
0,A-1,2016-02-08 05:46:00,Dayton,Montgomery,OH,45424,I-70 E,US/Eastern,36.9,10.0,Light Rain
1,A-2,2016-02-08 06:07:59,Reynoldsburg,Franklin,OH,43068-3402,Brice Rd,US/Eastern,37.9,10.0,Light Rain
2,A-3,2016-02-08 06:49:27,Williamsburg,Clermont,OH,45176,State Route 32,US/Eastern,36.0,10.0,Overcast
3,A-4,2016-02-08 07:23:34,Dayton,Montgomery,OH,45417,I-75 S,US/Eastern,35.1,9.0,Mostly Cloudy
4,A-5,2016-02-08 07:39:07,Dayton,Montgomery,OH,45459,Miamisburg Centerville Rd,US/Eastern,36.0,6.0,Mostly Cloudy


In [8]:
# us_state_abbrev = {
#    'Alabama': 'AL',
#    'Alaska': 'AK',
#    'Arizona': 'AZ',
#    'Arkansas': 'AR',
#    'California': 'CA',
#    'Colorado': 'CO',
#    'Connecticut': 'CT',
#    'Delaware': 'DE',
#    'Florida': 'FL',
#    'Georgia': 'GA',
#    'Hawaii': 'HI',
#    'Idaho': 'ID',
#    'Illinois': 'IL',
#    'Indiana': 'IN',
#    'Iowa': 'IA',
#    'Kansas': 'KS',
#    'Kentucky': 'KY',
#    'Louisiana': 'LA',
#    'Maine': 'ME',
#    'Maryland': 'MD',
#    'Massachusetts': 'MA',
#    'Michigan': 'MI',
#    'Minnesota': 'MN',
#    'Mississippi': 'MS',
#    'Missouri': 'MO',
#    'Montana': 'MT',
#    'Nebraska': 'NE',
#    'Nevada': 'NV',
#    'New Hampshire': 'NH',
#    'New Jersey': 'NJ',
#    'New Mexico': 'NM',
#    'New York': 'NY',
#    'North Carolina': 'NC',
#    'North Dakota': 'ND',
#    'Ohio': 'OH',
#    'Oklahoma': 'OK',
#    'Oregon': 'OR',
#    'Pennsylvania': 'PA',
#    'Rhode Island': 'RI',
#    'South Carolina': 'SC',
#    'South Dakota': 'SD',
#    'Tennessee': 'TN',
#    'Texas': 'TX',
#    'Utah': 'UT',
#    'Vermont': 'VT',
#    'Virginia': 'VA',
#    'Washington': 'WA',
#    'West Virginia': 'WV',
#    'Wisconsin': 'WI',
#    'Wyoming': 'WY',
# }
# for key, value in us_state_abbrev.items():
#     new_accident_data_df.loc[new_accident_data_df['State']==key,'State'] = value
    
# new_accident_data_df = new_accident_data_df.append(value, ignore_index=True)
# new_accident_data_df.head()

# merge state and accident df

In [9]:
# Merge accident_df with state_df to obtain state_name for future merging with population df
clean_accident_data_df = pd.merge(new_accident_data_df, state_df, on= "State")
clean_accident_data_df.head()

Unnamed: 0,ID,Start_Time,City,County,State,Zipcode,Street,Timezone,Temperature(F),Visibility(mi),Weather_Condition,state_name
0,A-1,2016-02-08 05:46:00,Dayton,Montgomery,OH,45424,I-70 E,US/Eastern,36.9,10.0,Light Rain,Ohio
1,A-2,2016-02-08 06:07:59,Reynoldsburg,Franklin,OH,43068-3402,Brice Rd,US/Eastern,37.9,10.0,Light Rain,Ohio
2,A-3,2016-02-08 06:49:27,Williamsburg,Clermont,OH,45176,State Route 32,US/Eastern,36.0,10.0,Overcast,Ohio
3,A-4,2016-02-08 07:23:34,Dayton,Montgomery,OH,45417,I-75 S,US/Eastern,35.1,9.0,Mostly Cloudy,Ohio
4,A-5,2016-02-08 07:39:07,Dayton,Montgomery,OH,45459,Miamisburg Centerville Rd,US/Eastern,36.0,6.0,Mostly Cloudy,Ohio


In [10]:
# convert start_time from object to datetime 
clean_accident_data_df['Start_Time'] = pd.to_datetime(clean_accident_data_df['Start_Time'])
clean_accident_data_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3513617 entries, 0 to 3513616
Data columns (total 12 columns):
 #   Column             Dtype         
---  ------             -----         
 0   ID                 object        
 1   Start_Time         datetime64[ns]
 2   City               object        
 3   County             object        
 4   State              object        
 5   Zipcode            object        
 6   Street             object        
 7   Timezone           object        
 8   Temperature(F)     float64       
 9   Visibility(mi)     float64       
 10  Weather_Condition  object        
 11  state_name         object        
dtypes: datetime64[ns](1), float64(2), object(9)
memory usage: 348.5+ MB


In [11]:
clean_accident_data_df.nunique()

ID                   3513617
Start_Time           3200042
City                   11895
County                  1724
State                     49
Zipcode               418780
Street                176262
Timezone                   4
Temperature(F)           831
Visibility(mi)            85
Weather_Condition        127
state_name                49
dtype: int64

In [12]:
# Limit data from 2016-2020 to 2016 only
clean_accident_data_df = clean_accident_data_df[(clean_accident_data_df['Start_Time']>= "2016-01-01") & 
                                            (clean_accident_data_df['Start_Time']<= "2018-12-31")]
clean_accident_data_df.nunique()

ID                   2018197
Start_Time           1875832
City                   10793
County                  1658
State                     49
Zipcode               265911
Street                128158
Timezone                   4
Temperature(F)           780
Visibility(mi)            71
Weather_Condition         95
state_name                49
dtype: int64

In [13]:
clean_accident_data_df.head()

Unnamed: 0,ID,Start_Time,City,County,State,Zipcode,Street,Timezone,Temperature(F),Visibility(mi),Weather_Condition,state_name
0,A-1,2016-02-08 05:46:00,Dayton,Montgomery,OH,45424,I-70 E,US/Eastern,36.9,10.0,Light Rain,Ohio
1,A-2,2016-02-08 06:07:59,Reynoldsburg,Franklin,OH,43068-3402,Brice Rd,US/Eastern,37.9,10.0,Light Rain,Ohio
2,A-3,2016-02-08 06:49:27,Williamsburg,Clermont,OH,45176,State Route 32,US/Eastern,36.0,10.0,Overcast,Ohio
3,A-4,2016-02-08 07:23:34,Dayton,Montgomery,OH,45417,I-75 S,US/Eastern,35.1,9.0,Mostly Cloudy,Ohio
4,A-5,2016-02-08 07:39:07,Dayton,Montgomery,OH,45459,Miamisburg Centerville Rd,US/Eastern,36.0,6.0,Mostly Cloudy,Ohio


In [14]:
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!! Didn't set ID as Index, ID is acting weird (ask team members) !!!!!!!!!!!!!!!!!!!!!!!!!!
clean_accident_data_df = clean_accident_data_df.sort_values(by='ID')
clean_accident_data_df.head()

Unnamed: 0,ID,Start_Time,City,County,State,Zipcode,Street,Timezone,Temperature(F),Visibility(mi),Weather_Condition,state_name
0,A-1,2016-02-08 05:46:00,Dayton,Montgomery,OH,45424,I-70 E,US/Eastern,36.9,10.0,Light Rain,Ohio
9,A-10,2016-02-08 08:10:04,Westerville,Franklin,OH,43081,Westerville Rd,US/Eastern,37.4,3.0,Light Rain,Ohio
99,A-100,2016-02-11 08:13:24,Dayton,Montgomery,OH,45410,US-35 E,US/Eastern,7.5,10.0,Scattered Clouds,Ohio
68791,A-1000,2016-06-23 10:31:12,El Dorado Hills,El Dorado,CA,95762,Latrobe Rd,US/Pacific,77.0,10.0,Clear,California
77791,A-10000,2017-01-06 16:22:04,West Sacramento,Yolo,CA,95691,I-80 W,US/Pacific,46.0,10.0,Clear,California


# Check to see if all 2016 data is present

In [15]:
# Sort by datetime
clean_accident_data_df = clean_accident_data_df.sort_values('Start_Time')
clean_accident_data_df.head()

Unnamed: 0,ID,Start_Time,City,County,State,Zipcode,Street,Timezone,Temperature(F),Visibility(mi),Weather_Condition,state_name
45629,A-2478859,2016-02-08 00:37:08,Dublin,Franklin,OH,43017,Outerbelt E,US/Eastern,42.1,10.0,Light Rain,Ohio
0,A-1,2016-02-08 05:46:00,Dayton,Montgomery,OH,45424,I-70 E,US/Eastern,36.9,10.0,Light Rain,Ohio
45630,A-2478860,2016-02-08 05:56:20,Dayton,Montgomery,OH,45424,I-70 E,US/Eastern,36.9,10.0,Light Rain,Ohio
1,A-2,2016-02-08 06:07:59,Reynoldsburg,Franklin,OH,43068-3402,Brice Rd,US/Eastern,37.9,10.0,Light Rain,Ohio
45631,A-2478861,2016-02-08 06:15:39,Cincinnati,Hamilton,OH,45203,I-75 S,US/Eastern,36.0,10.0,Overcast,Ohio


In [16]:
clean_accident_data_df.tail()

Unnamed: 0,ID,Start_Time,City,County,State,Zipcode,Street,Timezone,Temperature(F),Visibility(mi),Weather_Condition,state_name
1120468,A-3229937,2018-12-30 23:49:45,Orlando,Orange,FL,32832,Narcoossee Rd,US/Eastern,70.0,10.0,Scattered Clouds,Florida
1120467,A-3229936,2018-12-30 23:49:45,Orlando,Orange,FL,32832,Narcoossee Rd,US/Eastern,70.0,10.0,Scattered Clouds,Florida
3478703,A-3230370,2018-12-30 23:51:03,Wyoming,Chisago,MN,55092,I-35 N,US/Central,30.0,10.0,Overcast,Minnesota
2786921,A-3230568,2018-12-30 23:57:58,Astoria,Clatsop,OR,97103-2301,43rd St,US/Pacific,33.1,2.5,Clear,Oregon
1120485,A-3230429,2018-12-30 23:58:45,Defuniak Springs,Walton,FL,32435,I-10 E,US/Central,66.4,4.0,Overcast,Florida


In [17]:
clean_accident_data_df.count()

ID                   2018197
Start_Time           2018197
City                 2018133
County               2018197
State                2018197
Zipcode              2017659
Street               2018197
Timezone             2016487
Temperature(F)       1979434
Visibility(mi)       1971104
Weather_Condition    1970670
state_name           2018197
dtype: int64

In [18]:
clean_accident_data_df.nunique()

ID                   2018197
Start_Time           1875832
City                   10793
County                  1658
State                     49
Zipcode               265911
Street                128158
Timezone                   4
Temperature(F)           780
Visibility(mi)            71
Weather_Condition         95
state_name                49
dtype: int64

# Test to see if correct data is included

In [19]:
# clean_accident_data_df.loc[clean_accident_data_df['ID']=='A-9152']

In [20]:
# clean_accident_data_df.loc[clean_accident_data_df['ID']=='A-59918']

In [21]:
# clean_accident_data_df.loc[clean_accident_data_df['ID']=='A-9153']

In [22]:
# clean_accident_data_df.loc[clean_accident_data_df['Start_Time']=='2016-12-31 23:42:02']

In [23]:
clean_accident_data_df['normalised_date'] = clean_accident_data_df['Start_Time'].dt.normalize()
clean_accident_data_df.head()

Unnamed: 0,ID,Start_Time,City,County,State,Zipcode,Street,Timezone,Temperature(F),Visibility(mi),Weather_Condition,state_name,normalised_date
45629,A-2478859,2016-02-08 00:37:08,Dublin,Franklin,OH,43017,Outerbelt E,US/Eastern,42.1,10.0,Light Rain,Ohio,2016-02-08
0,A-1,2016-02-08 05:46:00,Dayton,Montgomery,OH,45424,I-70 E,US/Eastern,36.9,10.0,Light Rain,Ohio,2016-02-08
45630,A-2478860,2016-02-08 05:56:20,Dayton,Montgomery,OH,45424,I-70 E,US/Eastern,36.9,10.0,Light Rain,Ohio,2016-02-08
1,A-2,2016-02-08 06:07:59,Reynoldsburg,Franklin,OH,43068-3402,Brice Rd,US/Eastern,37.9,10.0,Light Rain,Ohio,2016-02-08
45631,A-2478861,2016-02-08 06:15:39,Cincinnati,Hamilton,OH,45203,I-75 S,US/Eastern,36.0,10.0,Overcast,Ohio,2016-02-08


In [24]:
clean_accident_data_df.loc[clean_accident_data_df['normalised_date']=='2018-12-30']

Unnamed: 0,ID,Start_Time,City,County,State,Zipcode,Street,Timezone,Temperature(F),Visibility(mi),Weather_Condition,state_name,normalised_date
59215,A-3229569,2018-12-30 00:00:03,Cincinnati,Hamilton,OH,45215,I-75 S,US/Eastern,35.1,10.0,Overcast,Ohio,2018-12-30
388708,A-1487670,2018-12-30 00:00:56,Santa Ana,Orange,CA,92701,I-5 N,US/Pacific,48.0,10.0,Clear,California,2018-12-30
2303361,A-3229570,2018-12-30 00:07:45,Aldie,Loudoun,VA,20105-1903,James Monroe Hwy,US/Eastern,33.8,10.0,Clear,Virginia,2018-12-30
388709,A-1487671,2018-12-30 00:07:49,Los Angeles,Los Angeles,CA,90012,W 3rd St,US/Pacific,46.0,10.0,Clear,California,2018-12-30
2712753,A-3230036,2018-12-30 00:10:00,Seattle,King,WA,98155,I-5 N,US/Pacific,44.1,10.0,Overcast,Washington,2018-12-30
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1120468,A-3229937,2018-12-30 23:49:45,Orlando,Orange,FL,32832,Narcoossee Rd,US/Eastern,70.0,10.0,Scattered Clouds,Florida,2018-12-30
1120467,A-3229936,2018-12-30 23:49:45,Orlando,Orange,FL,32832,Narcoossee Rd,US/Eastern,70.0,10.0,Scattered Clouds,Florida,2018-12-30
3478703,A-3230370,2018-12-30 23:51:03,Wyoming,Chisago,MN,55092,I-35 N,US/Central,30.0,10.0,Overcast,Minnesota,2018-12-30
2786921,A-3230568,2018-12-30 23:57:58,Astoria,Clatsop,OR,97103-2301,43rd St,US/Pacific,33.1,2.5,Clear,Oregon,2018-12-30


# Reorganize df to look pretty

In [25]:
clean_accident_data_df = clean_accident_data_df[['ID', 'Start_Time', 'City', 'County', 'State', 'state_name', 'Zipcode', 'Street', 'Timezone', 'Temperature(F)', 'Visibility(mi)', 'Weather_Condition']]
clean_accident_data_df.head()

Unnamed: 0,ID,Start_Time,City,County,State,state_name,Zipcode,Street,Timezone,Temperature(F),Visibility(mi),Weather_Condition
45629,A-2478859,2016-02-08 00:37:08,Dublin,Franklin,OH,Ohio,43017,Outerbelt E,US/Eastern,42.1,10.0,Light Rain
0,A-1,2016-02-08 05:46:00,Dayton,Montgomery,OH,Ohio,45424,I-70 E,US/Eastern,36.9,10.0,Light Rain
45630,A-2478860,2016-02-08 05:56:20,Dayton,Montgomery,OH,Ohio,45424,I-70 E,US/Eastern,36.9,10.0,Light Rain
1,A-2,2016-02-08 06:07:59,Reynoldsburg,Franklin,OH,Ohio,43068-3402,Brice Rd,US/Eastern,37.9,10.0,Light Rain
45631,A-2478861,2016-02-08 06:15:39,Cincinnati,Hamilton,OH,Ohio,45203,I-75 S,US/Eastern,36.0,10.0,Overcast


In [26]:
# clean_accident_data_df.set_index("ID", inplace=True)

In [27]:
# clean_accident_data_df.head()

In [28]:
clean_accident_data_df = clean_accident_data_df.rename(columns={"State": "state_abbrev",
                                                               "state_name":"State",
                                                               "Start_Time": "Date_Time"})


# Tunde - Population from 2015-2018

In [36]:
#make API call to Census API for all US county estimated population for 2015
c = Census(api_key, year=2015)

In [37]:
county_pop_data= c.acs5.get(("NAME","B01001_001E"), {"for": "county:*"})
#2015

CensusException: <html><head><title>Error report</title></head><body><h1>HTTP Status 404 - /data/2015/acs5</h1></body></html>

In [35]:
county_pop15_df = pd.DataFrame(county_pop_data)
county_pop15_df.rename(columns = {"B01001_001E":"2015"}, inplace=True)
county_pop15_df.head()

CensusException: <html><head><title>Error report</title></head><body><h1>HTTP Status 404 - /data/2015/acs5</h1></body></html>

In [None]:
#2015
county_pop15_df = pd.DataFrame(county_pop_data)
county_pop15_df.rename(columns = {"B01001_001E":"2015"}, inplace=True)
county_pop15_df.head()

In [None]:
#2016
c = Census(api_key, year=2016)
county_pop_data= c.acs5.get(("NAME","B01001_001E"), {"for": "county:*"})

In [None]:
county_pop16_df = pd.DataFrame(county_pop_data)
county_pop16_df.rename(columns = {"B01001_001E":"2016"}, inplace=True)
county_pop16_df.head()

In [None]:
#2017
c = Census(api_key, year=2017)
county_pop_data= c.acs5.get(("NAME","B01001_001E"), {"for": "county:*"})

In [None]:
county_pop17_df = pd.DataFrame(county_pop_data)
county_pop17_df.rename(columns = {"B01001_001E":"2017"}, inplace=True)
county_pop17_df.head()

In [None]:
#2018
c = Census(api_key, year=2018)
county_pop_data= c.acs5.get(("NAME","B01001_001E"), {"for": "county:*"})

In [None]:
county_pop18_df = pd.DataFrame(county_pop_data)
county_pop18_df.rename(columns = {"B01001_001E":"2018"}, inplace=True)
county_pop18_df.head()

In [None]:
df_1516 = pd.merge(county_pop15_df, county_pop16_df, on="NAME")

In [None]:
df_1718 = pd.merge(county_pop17_df, county_pop18_df, on="NAME")

In [None]:
county_pop_df = pd.merge(df_1516, df_1718, on="NAME")

In [None]:
county_pop_df.head()

In [None]:
county_pop_df.drop(columns=["state_x_x","county_x_x","state_y_x","county_y_x","state_x_y","county_x_y"], inplace=True)


In [None]:
county_pop_df.rename(columns = {"NAME":"county", "state_y_y":"state_id", "county_y_y":"county_id"}, inplace=True)

In [None]:
county_pop_df = county_pop_df[["county","county_id","state_id","2015","2016","2017","2018"]]
county_pop_df.head()

In [None]:
county_pop_df.info()

In [None]:
county_pop_df.count()

In [None]:
#make another call for state population just to get a list of states since county infor only have state census id
state_pop_data= c.acs5.get(("NAME","B01001_001E"), {"for": "state:*"})

In [None]:
#pass on the state pop data to a df
state_pop_df = pd.DataFrame(state_pop_data)
state_pop_df.head()

In [None]:
#drop population column from the state pop data df
state_pop1_df = state_pop_df.drop(columns=["B01001_001E"])

In [None]:
state_pop1_df.head()

In [None]:
#rename "Name" column to prevent duplication when merging with county pop df
state_pop1_df.rename(columns = {"NAME":"state", "state":"state_id"}, inplace=True)

In [None]:
state_pop1_df.head()

In [None]:
#merge state and county df on the "state" the numeric identifier for states
county_pop1_df = pd.merge(county_pop_df, state_pop1_df, on="state_id")

In [None]:
county_pop1_df.head()

In [None]:
#reorder columns 
county_pop1_df = county_pop1_df[["county","county_id","state","state_id","2015","2016","2017","2018"]]

In [None]:
county_pop1_df.info()

In [None]:
#convert population columns to numeric
county_pop1_df[["2015","2016","2017","2018"]] = county_pop1_df[["2015","2016","2017","2018"]].apply(pd.to_numeric)

In [None]:
county_pop1_df.info()

In [None]:
#import US state 2 letter abbreviation dictionary to create a state table 
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'American Samoa': 'AS',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'District of Columbia': 'DC',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Guam': 'GU',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Northern Mariana Islands':'MP',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Puerto Rico': 'PR',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virgin Islands': 'VI',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}

In [None]:
state_series = pd.Series(us_state_abbrev)

In [None]:
# print(state_series)

In [None]:
state_df = state_series.to_frame()

In [None]:
state_df.head()

In [None]:
state_df.reset_index(inplace=True)

In [None]:
state_df.head()

In [None]:
state_df.rename(columns={"index":"state",0:"statecode"}, inplace=True)

In [None]:
state1_df = pd.merge(state_df,state_pop1_df, on="state" )

In [None]:
#County Estimated Population 2015 to 2018
county_pop1_df.head()

In [None]:
#STATE TABLE
state1_df.head()

# Kamran edit population

In [None]:
county_pop1_df.rename(columns = {"2015":"2015 Population", 
                                "2016":"2016 Population",
                                "2017":"2017 Population",
                                "2018":"2018 Population"}, inplace=True)
county_pop1_df.head(300)

In [None]:
clean_pop_df = county_pop1_df.copy()
clean_pop_df.head()

In [None]:
# Cleaned county column
clean_pop_df['county'] = clean_pop_df['county'].str.split(",", expand=True)[0]
clean_pop_df['county'] = clean_pop_df['county'].str.replace('County', '')
clean_pop_df.head()

In [None]:
clean_pop_df.head(50)

In [None]:
# clean_pop_df = clean_pop_df.drop(columns={"state"})
clean_pop_df = clean_pop_df.rename(columns={"county": "County",
                                           "state": "State"})
clean_pop_df.head()

## Extract,Clean,Transform DL data 

In [None]:
#Extract Licensed Drivers by state (DL) csv into pandas DF for year 2014,2015,2016
DL_DF = pd.read_csv("Resources/DL.csv", usecols=[0,66,67,68])
DL_DF

In [None]:
# clean Licensed Drivers by state (DL)
DL_DF_Clean = DL_DF.rename(columns={'2014':'num_licenced_driver_2014',
                                   '2015':'num_licenced_driver_2015',
                                   '2016':'num_licenced_driver_2016',
                                    'STATE':'state_name'})
DL_DF_Clean                       

## Extract,Clean,Transform RMV data 

In [None]:
#Extract Registered Motor Vehicles(RMV) csv into pandas DF
RMV_DF = pd.read_csv("Resources/RMV.csv", usecols=[0,15], skiprows=4)
RMV_DF

In [None]:
# clean egistered Motor Veicheles(RMV)
RMV_DF_Clean = RMV_DF.rename(columns={'Unnamed: 0':'state_name','Unnamed: 15':'num_reg_vehicle'})
RMV_DF_Clean

In [None]:
RMV_DF_Clean.head()

In [None]:
RMV_DF_Clean.to_csv("Resources/reg_vehicle.csv", index=False)

In [None]:
DL_DF_Clean.head()

In [None]:
DL_DF_Clean.to_csv("Resources/license.csv", index=False)

In [None]:
clean_pop_df.head()

In [None]:
clean_pop_df_rename = clean_pop_df.copy()
clean_pop_df_rename = clean_pop_df_rename.rename(columns= {"2015 Population": "population_2015",
                                   "2016 Population": "population_2016",
                                   "2017 Population": "population_2017",
                                   "2018 Population": "population_2018",
                                   "County": "county"})
# clean_pop_df_rename = clean_pop_df_rename.drop(columns= {"State", "state_id", "county_id"}, inplace=True)

In [None]:
clean_pop_df_rename.head()

In [None]:
clean_pop_df_rename = clean_pop_df_rename.sort_values(by='State')

In [None]:
clean_pop_df_rename.head()

In [None]:
clean_pop_df_rename.drop(columns= ["State", "state_id", "county_id"], inplace=True)

In [None]:
clean_pop_df_rename.head()

In [None]:
clean_pop_df_rename= clean_pop_df_rename[["county", "population_2015", "population_2016","population_2017", "population_2018"]]
clean_pop_df_rename.head()

In [None]:
clean_pop_df_rename.nunique()

In [None]:
clean_pop_df_rename = clean_pop_df_rename.sort_values

In [None]:
clean_pop_df_rename.loc[clean_pop_df_rename['county']=='Clark']

In [None]:
clean_pop_df_rename.to_csv("Resources/population.csv", index=False)

In [None]:
clean_accident_data_df.head()

In [None]:
clean_accident_data_df_rename = clean_accident_data_df.copy()
clean_accident_data_df_rename = clean_accident_data_df_rename.rename(columns= {"ID": "id",
                                                                              "Date_Time": "datetime",
                                                                              "City": "city",
                                                                              "County": "county",
                                                                              "State": "state_name",
                                                                              "Zipcode": "zipcode",
                                                                              "Timezone": "timezone",
                                                                              "Temperature(F)": "temperature",
                                                                               "Visibility(mi)": "visibility",
                                                                               "Weather_Condition": "weather_condition"})
# drop street and state abbrev


In [None]:
clean_accident_data_df_rename.drop(columns= ["state_abbrev", "Street"], inplace=True)

In [None]:
clean_accident_data_df_rename.head()

In [None]:
# population_id_df = clean_pop_df_rename.copy()
# population_id_df = population_id_df[['id', 'county']]
# population_id_df.head()

In [None]:
# population_id_df.nunique()

In [None]:
# population_id_df.count()

In [None]:
# population_id_df = population_id_df.rename(columns={'id': "population_id"})
# population_id_df.head()

In [None]:
# clean_accident_data_df_rename = pd.merge(clean_accident_data_df_rename, population_id_df, on= "county", how='left')
# clean_accident_data_df_rename.head()

In [None]:
clean_accident_data_df_rename = clean_accident_data_df_rename.sort_values(by='datetime')
clean_accident_data_df_rename.head()

In [None]:
clean_accident_data_df_rename = clean_accident_data_df_rename[['id', 'datetime', 'state_name', 'city', 'county', 'zipcode', 'timezone', 'temperature', 'visibility', 'weather_condition']]
clean_accident_data_df_rename.head()

In [None]:
clean_accident_data_df_rename.nunique()

In [None]:
clean_accident_data_df_rename.to_csv("Resources/accident.csv", index=False)

In [None]:
clean_accident_data_df_rename.tail()