# Data Cleanup

In [66]:
# Import modules
import os
import pandas as pd

## Build the library dataset

In [76]:
# Input file locations. Every CSV is read from the same resources folder,
# so the folder is named once and each path is built from it.
RESOURCES_DIR = "../Resources"

# Weekly COVID-19 figures by ZIP code
covid_path = f"{RESOURCES_DIR}/Weekly_COVID-19_Cases__Tests__and_Deaths_by_ZIP_Code_pulled_272022.csv"

# Library circulation and visitor counts, one file per measure and year
library_circulation_2019_path = f"{RESOURCES_DIR}/Libraries_-_2019_Circulation_by_Location_pulled_272022.csv"
library_visitors_2019_path = f"{RESOURCES_DIR}/Libraries_-_2019_Visitors_by_Location_pulled_272022.csv"
library_circulation_2020_path = f"{RESOURCES_DIR}/Libraries_-_2020_Circulation_by_Location_pulled_272022.csv"
library_visitors_2020_path = f"{RESOURCES_DIR}/Libraries_-_2020_Visitors_by_Location_pulled_272022.csv"
library_circulation_2021_path = f"{RESOURCES_DIR}/Libraries_-_2021_Circulation_by_Location_pulled_272022.csv"
library_visitors_2021_path = f"{RESOURCES_DIR}/Libraries_-_2021_Visitors_by_Location_pulled_272022.csv"

In [82]:
# Load datasets.
# ZIP codes are identifiers rather than quantities, so each file reads its ZIP
# column as text. The COVID file calls that column "ZIP Code" (it is renamed to
# "ZIP" further down); reading it as text as well keeps its values comparable
# with the ZIP column of the library frames instead of leaving the dtype to
# whatever read_csv infers.
covid_df = pd.read_csv(covid_path, dtype={'ZIP Code': object})
library_circulation_2019_df = pd.read_csv(library_circulation_2019_path, dtype={'ZIP': object})
library_visitors_2019_df = pd.read_csv(library_visitors_2019_path, dtype={'ZIP': object})
library_circulation_2020_df = pd.read_csv(library_circulation_2020_path, dtype={'ZIP': object})
library_visitors_2020_df = pd.read_csv(library_visitors_2020_path, dtype={'ZIP': object})
library_circulation_2021_df = pd.read_csv(library_circulation_2021_path, dtype={'ZIP': object})
library_visitors_2021_df = pd.read_csv(library_visitors_2021_path, dtype={'ZIP': object})

In [83]:
# The circulation files also hold rows with no ZIP code (online renewal, media
# download, etc.). Only those rows should be removed, so the drop is keyed on the
# ZIP column alone: dropna(how='any') would additionally discard any row that
# has a ZIP but a blank in some other column.
library_circulation_2019_df = library_circulation_2019_df.dropna(subset=['ZIP'])
library_circulation_2020_df = library_circulation_2020_df.dropna(subset=['ZIP'])
library_circulation_2021_df = library_circulation_2021_df.dropna(subset=['ZIP'])

In [84]:
# Rename the COVID frame's "ZIP Code" header to "ZIP"
covid_df = covid_df.rename({"ZIP Code": "ZIP"}, axis="columns")
covid_df.head()

Unnamed: 0,ZIP,Week Number,Week Start,Week End,Cases - Weekly,Cases - Cumulative,Case Rate - Weekly,Case Rate - Cumulative,Tests - Weekly,Tests - Cumulative,...,Test Rate - Cumulative,Percent Tested Positive - Weekly,Percent Tested Positive - Cumulative,Deaths - Weekly,Deaths - Cumulative,Death Rate - Weekly,Death Rate - Cumulative,Population,Row ID,ZIP Code Location
0,60612,15,4/5/2020,4/11/2020,69.0,202.0,201.0,588.7,284.0,803,...,2340.4,0.3,0.3,3,7,8.7,20.4,34311,60612-2020-15,POINT (-87.687011 41.88004)
1,60612,16,4/12/2020,4/18/2020,66.0,268.0,192.0,781.1,286.0,1089,...,3173.9,0.3,0.3,3,10,8.7,29.1,34311,60612-2020-16,POINT (-87.687011 41.88004)
2,60604,4,1/23/2022,1/29/2022,8.0,322.0,1023.0,41176.5,246.0,11461,...,1465601.0,0.0,0.0,0,0,0.0,0.0,782,60604-2022-4,POINT (-87.629029 41.878153)
3,60615,24,6/7/2020,6/13/2020,12.0,377.0,29.0,907.1,538.0,3986,...,9590.3,0.0,0.1,0,15,0.0,36.1,41563,60615-2020-24,POINT (-87.602725 41.801993)
4,60615,25,6/14/2020,6/20/2020,13.0,390.0,31.0,938.3,551.0,4537,...,10916.0,0.0,0.1,1,16,2.4,38.5,41563,60615-2020-25,POINT (-87.602725 41.801993)


In [85]:
# Tag each frame with the measure it holds ("Type") and the year it covers ("Year").
# A scalar is broadcast to every row, so no helper list of repeated values is
# needed, and .assign returns a new frame rather than writing into the existing
# one. "Type" is added before "Year", so the column order is unchanged.
library_circulation_2019_df = library_circulation_2019_df.assign(Type="Circulation", Year=2019)
library_visitors_2019_df = library_visitors_2019_df.assign(Type="Visitors", Year=2019)
library_circulation_2020_df = library_circulation_2020_df.assign(Type="Circulation", Year=2020)
library_visitors_2020_df = library_visitors_2020_df.assign(Type="Visitors", Year=2020)
library_circulation_2021_df = library_circulation_2021_df.assign(Type="Circulation", Year=2021)
library_visitors_2021_df = library_visitors_2021_df.assign(Type="Visitors", Year=2021)

In [98]:
# Stack circulation and visitors for each year.
# "Type" differs between the circulation and visitors frames and "Year" differs
# between years, so a row from one frame can never match a row from another:
# an outer merge on all shared columns only ever appends rows here. pd.concat
# does that directly, keeps the union of columns (missing ones filled with NaN),
# and does not depend on join-key dtypes or on merge's sort behaviour.
merged_2019 = pd.concat(
    [library_circulation_2019_df, library_visitors_2019_df], ignore_index=True
)
merged_2020 = pd.concat(
    [library_circulation_2020_df, library_visitors_2020_df], ignore_index=True
)
merged_2021 = pd.concat(
    [library_circulation_2021_df, library_visitors_2021_df], ignore_index=True
)

# Stack all years together
# NOTE(review): the combined frame carries LOCATION, BRANCH and Location as three
# separate columns - presumably the same field under different headers in the
# yearly files. Confirm, and harmonise the headers before grouping by branch.
merged = pd.concat([merged_2019, merged_2020, merged_2021], ignore_index=True)
merged.head()

Unnamed: 0,LOCATION,ADDRESS,CITY,ZIP,JANUARY,FEBRUARY,MARCH,APRIL,MAY,JUNE,...,AUGUST,SEPTEMBER,OCTOBER,NOVEMBER,DECEMBER,YTD,Type,Year,BRANCH,Location
0,Albany Park,3401 W. Foster Ave.,Chicago,60625,8214,7614,8460,7414,7044,7970,...,9069,7053,8012,6571,5924,93009,Circulation,2019,,
1,Altgeld,13281 S. Corliss Ave.,Chicago,60827,378,326,332,270,320,325,...,229,234,377,369,500,4014,Circulation,2019,,
2,Archer Heights,5055 S. Archer Ave.,Chicago,60632,5365,5019,5417,5349,4833,4627,...,4793,4537,5263,4051,3415,58025,Circulation,2019,,
3,Austin,5615 W. Race Ave.,Chicago,60644,215,590,706,736,806,983,...,1236,1193,1361,1286,1082,11328,Circulation,2019,,
4,Austin-Irving,6100 W. Irving Park Rd.,Chicago,60634,9678,9764,10845,10541,9763,11442,...,11063,9379,9932,8567,6492,119100,Circulation,2019,,


In [97]:
# Export data: one file per year, plus the all-years frame in its own file.
merged_2019.to_csv("../Data/merged_2019.csv")
merged_2020.to_csv("../Data/merged_2020.csv")
merged_2021.to_csv("../Data/merged_2021.csv")
# The combined frame must not reuse a per-year file name: writing anything else
# to merged_2021.csv would overwrite the 2021 export made on the line above.
merged.to_csv("../Data/merged.csv")