### Loading data

In [1]:
import pandas as pd
# Source data: Google COVID-19 Community Mobility Reports, downloadable from
# https://www.google.com/covid19/mobility/index.html?hl=en
#
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. NOTE(review): the truncated warning left in this
# cell's output looks like the tail of a mixed-type DtypeWarning, which this
# option avoids -- confirm on a fresh run.
df = pd.read_csv('Global_Mobility_Report.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [2]:
# Eyeball ten randomly chosen rows of the raw mobility data
df.sample(n=10)

Unnamed: 0,country_region_code,country_region,sub_region_1,sub_region_2,iso_3166_2_code,census_fips_code,date,retail_and_recreation_percent_change_from_baseline,grocery_and_pharmacy_percent_change_from_baseline,parks_percent_change_from_baseline,transit_stations_percent_change_from_baseline,workplaces_percent_change_from_baseline,residential_percent_change_from_baseline
668626,US,United States,Virginia,Virginia Beach,,51810.0,2020-04-25,-41.0,-11.0,2.0,-31.0,-32.0,12.0
288131,US,United States,Alaska,Ketchikan Gateway,,2130.0,2020-07-13,,,,,-25.0,
696713,UY,Uruguay,,,,,2020-03-19,-56.0,-23.0,-67.0,-49.0,-27.0,17.0
440918,US,United States,Maryland,Somerset County,,24039.0,2020-05-16,,,,,-14.0,
106493,HN,Honduras,Lempira Department,,HN-LE,,2020-03-01,,,-11.0,,,
617539,US,United States,Texas,Camp County,,48063.0,2020-05-28,,,,,-21.0,
348427,US,United States,Georgia,Seminole County,,13253.0,2020-07-15,,,,,-22.0,
323157,US,United States,Florida,Gadsden County,,12039.0,2020-07-14,-12.0,-1.0,,-9.0,-31.0,11.0
605930,US,United States,Tennessee,Johnson County,,47091.0,2020-03-15,-1.0,39.0,,,,
569629,US,United States,Oklahoma,Stephens County,,40137.0,2020-07-11,-6.0,,,,-9.0,


### Cleaning steps

In [3]:
# One pipeline, four steps:
#   1. keep only the rows for the United States,
#   2. discard the identifier columns the analysis does not use,
#   3. give the remaining columns short, easy-to-type names,
#   4. drop every row whose state or county is missing.
df = (
    df
    .loc[lambda frame: frame['country_region'] == "United States"]
    .drop(columns=[
        'country_region_code',
        'country_region',
        'iso_3166_2_code',
        'census_fips_code',
    ])
    .rename(columns={
        'sub_region_1': 'state',
        'sub_region_2': 'county',
        'retail_and_recreation_percent_change_from_baseline': 'retail',
        'grocery_and_pharmacy_percent_change_from_baseline': 'grocery',
        'parks_percent_change_from_baseline': 'parks',
        'transit_stations_percent_change_from_baseline': 'transit',
        'workplaces_percent_change_from_baseline': 'workplaces',
        'residential_percent_change_from_baseline': 'residential',
    })
    .dropna(subset=['state', 'county'])
)

In [4]:
# Lookup table from full state name to two-letter upper-case code, used to
# convert the `state` columns so the data can be plotted on a map.
# Covers the 50 states plus the District of Columbia. Every code is upper-case
# (the District was previously 'dc', inconsistent with all other entries).
state_codes = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR',
    'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT',
    'Delaware': 'DE', 'District of Columbia': 'DC', 'Florida': 'FL',
    'Georgia': 'GA', 'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL',
    'Indiana': 'IN', 'Iowa': 'IA', 'Kansas': 'KS', 'Kentucky': 'KY',
    'Louisiana': 'LA', 'Maine': 'ME', 'Maryland': 'MD',
    'Massachusetts': 'MA', 'Michigan': 'MI', 'Minnesota': 'MN',
    'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT',
    'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH',
    'New Jersey': 'NJ', 'New Mexico': 'NM', 'New York': 'NY',
    'North Carolina': 'NC', 'North Dakota': 'ND', 'Ohio': 'OH',
    'Oklahoma': 'OK', 'Oregon': 'OR', 'Pennsylvania': 'PA',
    'Rhode Island': 'RI', 'South Carolina': 'SC', 'South Dakota': 'SD',
    'Tennessee': 'TN', 'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT',
    'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV',
    'Wisconsin': 'WI', 'Wyoming': 'WY',
}

# Replace full state names with their two-letter codes.
# Check first and fail loudly: a name missing from state_codes still raises
# KeyError, but the message now lists every offending name at once.
_unknown_states = (
    df.loc[~df['state'].isin(list(state_codes)), 'state'].unique().tolist()
)
if _unknown_states:
    raise KeyError(f"No state code defined for: {_unknown_states}")
df['state'] = df['state'].map(state_codes)

### Double check before exporting - everything looks fine

In [5]:
# Spot-check ten random rows of the cleaned frame before it is exported
df.sample(n=10)

Unnamed: 0,state,county,date,retail,grocery,parks,transit,workplaces,residential
365703,IL,Lake County,2020-05-08,-38.0,-1.0,27.0,-41.0,-50.0,22.0
678195,WV,Kanawha County,2020-02-26,14.0,5.0,8.0,4.0,2.0,-1.0
286634,AL,Walker County,2020-05-15,-4.0,11.0,,-20.0,-22.0,8.0
595356,SD,Brown County,2020-04-22,-26.0,,,,-34.0,14.0
484476,MO,Dunklin County,2020-02-22,2.0,7.0,,,9.0,
537101,NC,Martin County,2020-02-15,-2.0,1.0,,,1.0,
374535,IN,Carroll County,2020-03-24,-36.0,,,,-38.0,
435489,ME,Aroostook County,2020-02-19,-4.0,-4.0,,,-12.0,3.0
303016,CA,Fresno County,2020-03-02,15.0,9.0,23.0,5.0,-3.0,0.0
358677,ID,Valley County,2020-04-26,,,,,-51.0,


### Exporting to the final csv file

In [6]:
# Write the cleaned mobility data to disk, leaving the row index out of the file
df.to_csv(path_or_buf='US_Mobility_Report.csv', index=False)

### Let's move on to the COVID data

In [7]:
# Source data: https://github.com/nytimes/covid-19-data
covid_df = pd.read_csv(filepath_or_buffer='us-states.csv')
# Show ten random rows as a quick sanity check of what was loaded
covid_df.sample(n=10)

Unnamed: 0,date,state,fips,cases,deaths
5950,2020-06-18,Washington,53,28663,1246
1098,2020-03-22,New York,36,15188,142
6436,2020-06-27,Rhode Island,44,16661,927
4582,2020-05-25,Arkansas,5,6029,117
4715,2020-05-27,Missouri,29,12624,705
3218,2020-04-30,Illinois,17,52918,2361
4154,2020-05-17,Indiana,18,28419,1751
6043,2020-06-20,North Carolina,37,51640,1239
2718,2020-04-21,Florida,12,27861,866
934,2020-03-19,New Jersey,34,735,9


In [8]:
# Convert full state names to two-letter codes so the data can be plotted on
# the map. Series.map with a dict leaves any name absent from state_codes as a
# missing value rather than raising, which keeps the lenient lookup this cell
# used before (`state_codes.get`).
# NOTE(review): rows whose state has no code are exported with an empty
# `state` field -- confirm that is intended.
covid_df['state'] = covid_df['state'].map(state_codes)
# Remove the fips column; reassign instead of mutating the frame in place
covid_df = covid_df.drop(columns=['fips'])

In [9]:
# Write the cleaned case/death counts to disk, leaving the row index out of the file
covid_df.to_csv(path_or_buf='covid_cases.csv', index=False)