# Cleaning and Preprocessing

In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

import seaborn as sns
sns.set_style('whitegrid')

import statsmodels.formula.api as smf


The City of Toronto experienced a high rate of covid cases through the COVID pandemic. The rate of covid cases hit some neighbourhoods harder than others. As a result, the city made the census 2016 neighbourhood data available that provide insights into the demographics of the neighbhour that might make a neighbourhood more vulnerable. These indicators include education levels, income levels, multigenerational housing, crowded housing, immigrants, and visible minorities. 

The goal of this data project is to determine which, if any of those indicators can be used to predict the total rate of covid cases betweeen March 2020 and June 2021 and determine if some indicators make a neighbourhood more vulnerable. If successful, these data can be used to predict other rates of covid in other cities as the pandemic continues or could be used to predict vulnerability of neighbourhoods during a future pandemic of a contagious disease.

# Reading in the City of Toronto Data on Neighbourhood Demographics and the COVID case counts as of July 2021

Data was taken from https://www.toronto.ca/city-government/data-research-maps/neighbourhoods-communities/neighbourhood-profiles/

The data consisted of 2 data sets: 
1) _Case Data_ (cov_cases):  Total number of Positive Covid Cases in the city of Toronto, Canada, separated by the city's 140 neighbourhoods. 

2) _Neighbourhood Indicators_ (cov_ind): The 140 neighbourhoods represent very different demographics in terms of education, income, immigrants, crowded housing, multigenerational housing, and visible minorities, which is available from the 2016 Canada Census. The concern is that any or all of these variables may point to a vulnerability of specific groups of people in getting COVID. 

Data from these two datasets needed be sliced and then joined so that the neighbhour indicators could be used in a regression model to predict rates of covid cases/100 000 people.

The hypothesis for this project is that the neighbourhood indicators can be used to predict rate of positive COVID cases. If a model can be generated to predict covid cases, then it could be used for the future to quickly identify vulnerable populations for in other cities in Canada, should that demographic information be available. 

Below is the original data, with columns transposed so that the Indicator Value variable can be sliced to generate specific data for each of the following variables:
    
    Low Education (Low_Edu)
    Low Income (Low_Inc)
    Multigeneration Housing (MultiGen_House)
    Crowded Housing (Crowded_House)
    Immigrants (Immigrants)
    Visibile Minorities (Vis_Minorites)

In [9]:
cov_ind = pd.read_csv('CityofToronto_COVID-19_NeighbourhoodData.xlsx - Socio demographic indicators.csv')
cov_ind.head()

Unnamed: 0,Neighbourhood Number,Neighbourhood Name,Indicator,Indicator Value
0,1,West Humber-Clairville (1),Have a low education level (do not have any ce...,21%
1,1,West Humber-Clairville (1),Live in low income (after-tax Low Income Measu...,16%
2,1,West Humber-Clairville (1),Are immigrants,58%
3,1,West Humber-Clairville (1),Identify as racialized (visible minority),82%
4,1,West Humber-Clairville (1),Live in multigenerational housing (3 or more g...,20%


In [11]:
cov_ind_val = cov_ind.transpose()
cov_ind_val.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,830,831,832,833,834,835,836,837,838,839
Neighbourhood Number,1,1,1,1,1,1,2,2,2,2,...,139,139,139,139,140,140,140,140,140,140
Neighbourhood Name,West Humber-Clairville (1),West Humber-Clairville (1),West Humber-Clairville (1),West Humber-Clairville (1),West Humber-Clairville (1),West Humber-Clairville (1),Mount Olive-Silverstone-Jamestown (2),Mount Olive-Silverstone-Jamestown (2),Mount Olive-Silverstone-Jamestown (2),Mount Olive-Silverstone-Jamestown (2),...,Scarborough Village (139),Scarborough Village (139),Scarborough Village (139),Scarborough Village (139),Guildwood (140),Guildwood (140),Guildwood (140),Guildwood (140),Guildwood (140),Guildwood (140)
Indicator,Have a low education level (do not have any ce...,Live in low income (after-tax Low Income Measu...,Are immigrants,Identify as racialized (visible minority),Live in multigenerational housing (3 or more g...,Live in crowded housing (unsuitable housing),Have a low education level (do not have any ce...,Live in low income (after-tax Low Income Measu...,Are immigrants,Identify as racialized (visible minority),...,Are immigrants,Identify as racialized (visible minority),Live in multigenerational housing (3 or more g...,Live in crowded housing (unsuitable housing),Have a low education level (do not have any ce...,Live in low income (after-tax Low Income Measu...,Are immigrants,Identify as racialized (visible minority),Live in multigenerational housing (3 or more g...,Live in crowded housing (unsuitable housing)
Indicator Value,21%,16%,58%,82%,20%,17%,31%,28%,65%,87%,...,55%,73%,8%,22%,13%,8%,32%,30%,8%,4%


In [12]:
cov_cases = pd.read_csv('CityofToronto_COVID-19_NeighbourhoodData.xlsx - All Cases and Rates by Neighbou.csv')
cov_cases.head()

Unnamed: 0,Neighbourhood ID,Neighbourhood Name,"Rate per 100,000 people",Case Count
0,69.0,Blake-Jones,3313.058108,256
1,125.0,Ionview,4948.317572,675
2,89.0,Runnymede-Bloor West Village,1996.027805,201
3,14.0,Islington-City Centre West,5631.75253,2476
4,120.0,Clairlea-Birchmount,7908.390157,2134


In [14]:
cov_cases.index.names = [None]
cov_cases.head(30)

Unnamed: 0,Neighbourhood ID,Neighbourhood Name,"Rate per 100,000 people",Case Count
0,69.0,Blake-Jones,3313.058108,256
1,125.0,Ionview,4948.317572,675
2,89.0,Runnymede-Bloor West Village,1996.027805,201
3,14.0,Islington-City Centre West,5631.75253,2476
4,120.0,Clairlea-Birchmount,7908.390157,2134
5,44.0,Flemingdon Park,8991.018101,1972
6,53.0,Henry Farm,7835.654773,1232
7,96.0,Casa Loma,3309.628009,363
8,133.0,Centennial Scarborough,5059.122886,676
9,85.0,South Parkdale,6599.844386,1442


# Sorting by Neighbourhood ID 
This was done so that later two datasets could be merged on the NeighbourhoodID column. 

In [15]:
cov_cases = cov_cases.sort_values(by=['Neighbourhood ID'])
cov_cases.head()

Unnamed: 0,Neighbourhood ID,Neighbourhood Name,"Rate per 100,000 people",Case Count
31,1.0,West Humber-Clairville,12250.84054,4081
16,2.0,Mount Olive-Silverstone-Jamestown,12283.78952,4048
75,3.0,Thistletown-Beaumond Heights,11949.80695,1238
56,4.0,Rexdale-Kipling,8148.922025,858
49,5.0,Elms-Old Rexdale,10279.18782,972


# Renaming the column names
The original data was provided with column names that were ofte more than one word. Therefore, they were renamed to all be one-word names.

In [16]:
cov_cases.columns=["NeighbourhoodID","Neighbourhood_Name","Rate","Case_Count"]
cov_cases

Unnamed: 0,NeighbourhoodID,Neighbourhood_Name,Rate,Case_Count
31,1.0,West Humber-Clairville,12250.840540,4081
16,2.0,Mount Olive-Silverstone-Jamestown,12283.789520,4048
75,3.0,Thistletown-Beaumond Heights,11949.806950,1238
56,4.0,Rexdale-Kipling,8148.922025,858
49,5.0,Elms-Old Rexdale,10279.187820,972
...,...,...,...,...
73,137.0,Woburn,8632.326821,4617
18,138.0,Eglinton East,8210.396909,1870
37,139.0,Scarborough Village,11055.967470,1849
33,140.0,Guildwood,5344.358173,530


# Slicing the Data of Interest
This next part is a series of codes to slice the covid indicator data into their respective indicators. 
This was done in order to append them to the covid cases data file so that the neighbourhood indicators could be used in a model to predict case counts. After slices the rows were transposed into columns. 
A neighbourhood ID column was generated in order to merge each new indicator dataframe with the case counts data. The range function was used to do that.

In [56]:
# This creates a variable of all the indicator values only for all neighbourhoods
#cov_ind_val.iloc[3:4]
cov_ind_val = cov_ind_val.iloc[3:4]
cov_ind_val.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,830,831,832,833,834,835,836,837,838,839
Indicator Value,21%,16%,58%,82%,20%,17%,31%,28%,65%,87%,...,55%,73%,8%,22%,13%,8%,32%,30%,8%,4%


# Low Education

In [57]:
#Low Education Indicator
#Slicing to save a variable as Low Education Indicator
#cov_ind_val.iloc[:1,::6]
cov_ind_lo_edu = cov_ind_val.iloc[::1,::6]
cov_ind_lo_edu

Unnamed: 0,0,6,12,18,24,30,36,42,48,54,...,780,786,792,798,804,810,816,822,828,834
Indicator Value,21%,31%,25%,22%,24%,24%,19%,18%,13%,10%,...,17%,20%,12%,15%,17%,22%,20%,22%,23%,13%


In [58]:
cov_ind_lo_edu2 = cov_ind_lo_edu.transpose()
cov_ind_lo_edu2

Unnamed: 0,Indicator Value
0,21%
6,31%
12,25%
18,22%
24,24%
...,...
810,22%
816,20%
822,22%
828,23%


In [60]:
cov_ind_lo_edu2.columns=["Low_Edu"]
cov_ind_lo_edu2

Unnamed: 0,Low_Edu
0,21%
6,31%
12,25%
18,22%
24,24%
...,...
810,22%
816,20%
822,22%
828,23%


# Low Income

In [62]:
#Low Income Indicator
#Slicing to save a variable as Low Income Indicator
#cov_ind_val.iloc[::1,1::6]
cov_ind_lo_inc = cov_ind_val.iloc[::1,1::6]
cov_ind_lo_inc

Unnamed: 0,1,7,13,19,25,31,37,43,49,55,...,781,787,793,799,805,811,817,823,829,835
Indicator Value,16%,28%,18%,19%,23%,25%,15%,15%,11%,7%,...,10%,20%,6%,11%,24%,25%,27%,26%,32%,8%


In [113]:
cov_ind_lo_inc2 = cov_ind_lo_inc.transpose()
cov_ind_lo_inc2

Unnamed: 0,Indicator Value
1,16%
7,28%
13,18%
19,19%
25,23%
...,...
811,25%
817,27%
823,26%
829,32%


In [114]:
cov_ind_lo_inc2['NeighbourhoodID'] = range(1,141)
cov_ind_lo_inc2

Unnamed: 0,Indicator Value,NeighbourhoodID
1,16%,1
7,28%,2
13,18%,3
19,19%,4
25,23%,5
...,...,...
811,25%,136
817,27%,137
823,26%,138
829,32%,139


In [117]:
cov_ind_lo_inc2.columns=["Low_Inc", "NeighbourhoodID"]
cov_ind_lo_inc2

Unnamed: 0,Low_Inc,NeighbourhoodID
1,16%,1
7,28%,2
13,18%,3
19,19%,4
25,23%,5
...,...,...
811,25%,136
817,27%,137
823,26%,138
829,32%,139


# Immigrants

In [71]:
#Immigrants
#Slicing to save a variable as Immigrants
#cov_ind_val.iloc[::1,2::6]
cov_ind_imm = cov_ind_val.iloc[::,2::6]
cov_ind_imm

Unnamed: 0,2,8,14,20,26,32,38,44,50,56,...,782,788,794,800,806,812,818,824,830,836
Indicator Value,58%,65%,54%,48%,50%,52%,45%,46%,39%,30%,...,56%,61%,37%,56%,55%,50%,57%,58%,55%,32%


In [72]:
cov_ind_imm2 = cov_ind_imm.transpose()
cov_ind_imm2

Unnamed: 0,Indicator Value
2,58%
8,65%
14,54%
20,48%
26,50%
...,...
812,50%
818,57%
824,58%
830,55%


In [74]:
cov_ind_imm2.columns=["Immigrants"]
cov_ind_imm2

Unnamed: 0,Immigrants
2,58%
8,65%
14,54%
20,48%
26,50%
...,...
812,50%
818,57%
824,58%
830,55%


In [75]:
cov_ind_imm2['NeighbourhoodID'] = range(1,141)
cov_ind_imm2

Unnamed: 0,Immigrants,NeighbourhoodID
2,58%,1
8,65%,2
14,54%,3
20,48%,4
26,50%,5
...,...,...
812,50%,136
818,57%,137
824,58%,138
830,55%,139


# Visible Minorities

In [76]:
#Visible Minorities
#Slicing to save a variable as Vis Min
#cov_ind_val.iloc[::1,3::6]
cov_ind_vis_min = cov_ind_val.iloc[::,3::6]
cov_ind_vis_min

Unnamed: 0,3,9,15,21,27,33,39,45,51,57,...,783,789,795,801,807,813,819,825,831,837
Indicator Value,82%,87%,64%,52%,67%,60%,40%,27%,21%,19%,...,81%,90%,44%,75%,75%,63%,76%,78%,73%,30%


In [77]:
cov_ind_vis_min2 = cov_ind_vis_min.transpose()
cov_ind_vis_min2

Unnamed: 0,Indicator Value
3,82%
9,87%
15,64%
21,52%
27,67%
...,...
813,63%
819,76%
825,78%
831,73%


In [78]:
cov_ind_vis_min2.columns=["Vis_Minorities"]
cov_ind_vis_min2

Unnamed: 0,Vis_Minorities
3,82%
9,87%
15,64%
21,52%
27,67%
...,...
813,63%
819,76%
825,78%
831,73%


In [79]:
cov_ind_vis_min2['NeighbourhoodID'] = range(1,141)
cov_ind_vis_min2

Unnamed: 0,Vis_Minorities,NeighbourhoodID
3,82%,1
9,87%,2
15,64%,3
21,52%,4
27,67%,5
...,...,...
813,63%,136
819,76%,137
825,78%,138
831,73%,139


# Multigenerational Housing

In [80]:
#Multigen Housing
#Slicing to save a variable as Multigen Housing
#cov_ind_val.iloc[::,4::6]
cov_ind_mgh = cov_ind_val.iloc[::,4::6]
cov_ind_mgh

Unnamed: 0,4,10,16,22,28,34,40,46,52,58,...,784,790,796,802,808,814,820,826,832,838
Indicator Value,20%,15%,15%,10%,13%,10%,9%,8%,6%,6%,...,22%,19%,11%,22%,13%,11%,14%,11%,8%,8%


In [82]:
cov_ind_mgh2 = cov_ind_mgh.transpose()
cov_ind_mgh2

Unnamed: 0,Indicator Value
4,20%
10,15%
16,15%
22,10%
28,13%
...,...
814,11%
820,14%
826,11%
832,8%


In [83]:
cov_ind_mgh2.columns=["MultiGen_House"]
cov_ind_mgh2

Unnamed: 0,MultiGen_House
4,20%
10,15%
16,15%
22,10%
28,13%
...,...
814,11%
820,14%
826,11%
832,8%


In [84]:
cov_ind_mgh2['NeighbourhoodID'] = range(1,141)
cov_ind_mgh2

Unnamed: 0,MultiGen_House,NeighbourhoodID
4,20%,1
10,15%,2
16,15%,3
22,10%,4
28,13%,5
...,...,...
814,11%,136
820,14%,137
826,11%,138
832,8%,139


# Crowded Housing

In [85]:
#Crowded Housing
#Slicing to save a variable as Crowded Housing
#cov_ind_val.iloc[::,5::6]
cov_ind_crh = cov_ind_val.iloc[::,5::6]
cov_ind_crh

Unnamed: 0,5,11,17,23,29,35,41,47,53,59,...,785,791,797,803,809,815,821,827,833,839
Indicator Value,17%,31%,16%,14%,18%,21%,11%,9%,8%,3%,...,11%,17%,3%,8%,17%,12%,19%,23%,22%,4%


In [86]:
cov_ind_crh2 = cov_ind_crh.transpose()
cov_ind_crh2

Unnamed: 0,Indicator Value
5,17%
11,31%
17,16%
23,14%
29,18%
...,...
815,12%
821,19%
827,23%
833,22%


In [87]:
cov_ind_crh2.columns=["Crowded_House"]
cov_ind_crh2

Unnamed: 0,Crowded_House
5,17%
11,31%
17,16%
23,14%
29,18%
...,...
815,12%
821,19%
827,23%
833,22%


In [88]:
cov_ind_crh2['NeighbourhoodID'] = range(1,141)
cov_ind_crh2

Unnamed: 0,Crowded_House,NeighbourhoodID
5,17%,1
11,31%,2
17,16%,3
23,14%,4
29,18%,5
...,...,...
815,12%,136
821,19%,137
827,23%,138
833,22%,139


In [89]:
cov_ind_lo_edu2 = cov_ind_lo_edu.transpose()
cov_ind_lo_edu2

Unnamed: 0,Indicator Value
0,21%
6,31%
12,25%
18,22%
24,24%
...,...
810,22%
816,20%
822,22%
828,23%


In [90]:
cov_ind_lo_edu2.columns=["Low_Edu"]
cov_ind_lo_edu2

Unnamed: 0,Low_Edu
0,21%
6,31%
12,25%
18,22%
24,24%
...,...
810,22%
816,20%
822,22%
828,23%


In [91]:
cov_ind_lo_edu2['NeighbourhoodID'] = range(1,141)
cov_ind_lo_edu2

Unnamed: 0,Low_Edu,NeighbourhoodID
0,21%,1
6,31%,2
12,25%,3
18,22%,4
24,24%,5
...,...,...
810,22%,136
816,20%,137
822,22%,138
828,23%,139


# Merging the Data
Each new covid indicator dataset (e.g., cov_ind_vis_min2) was separately merged (left_joined) 
#Note: I really need to find a for loop to do this more efficiently!

In [107]:
cov_cases.head()

Unnamed: 0,NeighbourhoodID,Neighbourhood_Name,Rate,Case_Count
31,1.0,West Humber-Clairville,12250.84054,4081
16,2.0,Mount Olive-Silverstone-Jamestown,12283.78952,4048
75,3.0,Thistletown-Beaumond Heights,11949.80695,1238
56,4.0,Rexdale-Kipling,8148.922025,858
49,5.0,Elms-Old Rexdale,10279.18782,972


In [120]:
left_joined = pd.merge(cov_cases,       # the "left" dataframe, this is the basic assumption
         cov_ind_lo_edu2,        # the "right" dataframe
        how = 'left',   # which observations to keep? Here we are specifying that we keep the "left" dataset
         on = 'NeighbourhoodID'  # the join key
        )
left_joined

Unnamed: 0,NeighbourhoodID,Neighbourhood_Name,Rate,Case_Count,Low_Edu
0,1.0,West Humber-Clairville,12250.840540,4081,21%
1,2.0,Mount Olive-Silverstone-Jamestown,12283.789520,4048,31%
2,3.0,Thistletown-Beaumond Heights,11949.806950,1238,25%
3,4.0,Rexdale-Kipling,8148.922025,858,22%
4,5.0,Elms-Old Rexdale,10279.187820,972,24%
...,...,...,...,...,...
136,137.0,Woburn,8632.326821,4617,20%
137,138.0,Eglinton East,8210.396909,1870,22%
138,139.0,Scarborough Village,11055.967470,1849,23%
139,140.0,Guildwood,5344.358173,530,13%


In [125]:
left_joined = pd.merge(left_joined,       # the "left" dataframe, this is the basic assumption
         #cov_ind_vis_min2,        # the "right" dataframe
         #cov_ind_lo_inc2,
         #cov_ind_mgh2,
         #cov_ind_crh2,
         #cov_ind_imm2,
        how = 'left',   # which observations to keep? Here we are specifying that we keep the "left" dataset
         on = 'NeighbourhoodID'  # the join key
        )
left_joined

Unnamed: 0,NeighbourhoodID,Neighbourhood_Name,Rate,Case_Count,Low_Edu,Low_Inc,MultiGen_House,Crowded_House,Immigrants,Vis_Minorities
0,1.0,West Humber-Clairville,12250.840540,4081,21%,16%,20%,17%,58%,82%
1,2.0,Mount Olive-Silverstone-Jamestown,12283.789520,4048,31%,28%,15%,31%,65%,87%
2,3.0,Thistletown-Beaumond Heights,11949.806950,1238,25%,18%,15%,16%,54%,64%
3,4.0,Rexdale-Kipling,8148.922025,858,22%,19%,10%,14%,48%,52%
4,5.0,Elms-Old Rexdale,10279.187820,972,24%,23%,13%,18%,50%,67%
...,...,...,...,...,...,...,...,...,...,...
136,137.0,Woburn,8632.326821,4617,20%,27%,14%,19%,57%,76%
137,138.0,Eglinton East,8210.396909,1870,22%,26%,11%,23%,58%,78%
138,139.0,Scarborough Village,11055.967470,1849,23%,32%,8%,22%,55%,73%
139,140.0,Guildwood,5344.358173,530,13%,8%,8%,4%,32%,30%


# Merged Data
This is the new merged data including the rates, case counts, and all of the niehgbourhood indicators.

In [137]:
left_joined2 = left_joined.copy()
left_joined2.head()

Unnamed: 0,NeighbourhoodID,Neighbourhood_Name,Rate,Case_Count,Low_Edu,Low_Inc,MultiGen_House,Crowded_House,Immigrants,Vis_Minorities
0,1.0,West Humber-Clairville,12250.84054,4081,21%,16%,20%,17%,58%,82%
1,2.0,Mount Olive-Silverstone-Jamestown,12283.78952,4048,31%,28%,15%,31%,65%,87%
2,3.0,Thistletown-Beaumond Heights,11949.80695,1238,25%,18%,15%,16%,54%,64%
3,4.0,Rexdale-Kipling,8148.922025,858,22%,19%,10%,14%,48%,52%
4,5.0,Elms-Old Rexdale,10279.18782,972,24%,23%,13%,18%,50%,67%


# More Data Cleaning Needed 

## Converting Objects to Integers
There were problems when exploring the data so I came back and realized that the data were not integers. So I had to convert them. Then the stats worked. 

In [148]:
print(left_joined.dtypes)

NeighbourhoodID       float64
Neighbourhood_Name     object
Rate                  float64
Case_Count              int64
Low_Edu                object
Low_Inc                object
MultiGen_House         object
Crowded_House          object
Immigrants             object
Vis_Minorities         object
dtype: object


In [154]:
# covert string to an integer
left_joined2['Low_Edu'].str[:1]
print(left_joined2.dtypes) 
left_joined2.head()
# show the dataframe
#print("-"*25)

NeighbourhoodID       float64
Neighbourhood_Name     object
Rate                  float64
Case_Count              int64
Low_Edu                object
Low_Inc                object
MultiGen_House         object
Crowded_House          object
Immigrants             object
Vis_Minorities         object
dtype: object


Unnamed: 0,NeighbourhoodID,Neighbourhood_Name,Rate,Case_Count,Low_Edu,Low_Inc,MultiGen_House,Crowded_House,Immigrants,Vis_Minorities
0,1.0,West Humber-Clairville,12250.84054,4081,21%,16%,20%,17%,58%,82%
1,2.0,Mount Olive-Silverstone-Jamestown,12283.78952,4048,31%,28%,15%,31%,65%,87%
2,3.0,Thistletown-Beaumond Heights,11949.80695,1238,25%,18%,15%,16%,54%,64%
3,4.0,Rexdale-Kipling,8148.922025,858,22%,19%,10%,14%,48%,52%
4,5.0,Elms-Old Rexdale,10279.18782,972,24%,23%,13%,18%,50%,67%


## Dropping a final Row 
I discovered that the final row in the dataset was NaN and should not be included, as there are only 140 neighbourhoods. So I dropped it.

In [158]:
left_joined.head(141)

Unnamed: 0,NeighbourhoodID,Neighbourhood_Name,Rate,Case_Count,Low_Edu,Low_Inc,MultiGen_House,Crowded_House,Immigrants,Vis_Minorities
0,1.0,West Humber-Clairville,12250.840540,4081,21%,16%,20%,17%,58%,82%
1,2.0,Mount Olive-Silverstone-Jamestown,12283.789520,4048,31%,28%,15%,31%,65%,87%
2,3.0,Thistletown-Beaumond Heights,11949.806950,1238,25%,18%,15%,16%,54%,64%
3,4.0,Rexdale-Kipling,8148.922025,858,22%,19%,10%,14%,48%,52%
4,5.0,Elms-Old Rexdale,10279.187820,972,24%,23%,13%,18%,50%,67%
...,...,...,...,...,...,...,...,...,...,...
136,137.0,Woburn,8632.326821,4617,20%,27%,14%,19%,57%,76%
137,138.0,Eglinton East,8210.396909,1870,22%,26%,11%,23%,58%,78%
138,139.0,Scarborough Village,11055.967470,1849,23%,32%,8%,22%,55%,73%
139,140.0,Guildwood,5344.358173,530,13%,8%,8%,4%,32%,30%


In [160]:
left_joined.drop(left_joined.index[140],inplace=True)
left_joined.head(141)

Unnamed: 0,NeighbourhoodID,Neighbourhood_Name,Rate,Case_Count,Low_Edu,Low_Inc,MultiGen_House,Crowded_House,Immigrants,Vis_Minorities
0,1.0,West Humber-Clairville,12250.840540,4081,21%,16%,20%,17%,58%,82%
1,2.0,Mount Olive-Silverstone-Jamestown,12283.789520,4048,31%,28%,15%,31%,65%,87%
2,3.0,Thistletown-Beaumond Heights,11949.806950,1238,25%,18%,15%,16%,54%,64%
3,4.0,Rexdale-Kipling,8148.922025,858,22%,19%,10%,14%,48%,52%
4,5.0,Elms-Old Rexdale,10279.187820,972,24%,23%,13%,18%,50%,67%
...,...,...,...,...,...,...,...,...,...,...
135,136.0,West Hill,6929.030374,1898,22%,25%,11%,12%,50%,63%
136,137.0,Woburn,8632.326821,4617,20%,27%,14%,19%,57%,76%
137,138.0,Eglinton East,8210.396909,1870,22%,26%,11%,23%,58%,78%
138,139.0,Scarborough Village,11055.967470,1849,23%,32%,8%,22%,55%,73%


## Removing the % signs

In [None]:
#this was done to remove the percentage signs
cov_data.replace('%', '', regex=True, inplace=True) #end
cov_data.values

# Saving as New Datafile
The new cleaner data was exported to the new file to a CSV.
Note: there is something wrong with this command because I always have to go and change the file name.

In [132]:
left_joined.to_csv (r'C:\Desktop\Brainstation\Data Science\Project\covid_dataframe.csv', index = False, header=True)

# Reading Back In
This was done to confirm that the data were saved properly

In [18]:
cov_data = pd.read_csv('covid_dataframe.csv')
cov_data.head()

Unnamed: 0,NeighbourhoodID,Neighbourhood_Name,Rate,Case_Count,Low_Edu,Low_Inc,MultiGen_House,Crowded_House,Immigrants,Vis_Minorities
0,1.0,West Humber-Clairville,12250.84054,4081,21,16,20,17,58,82
1,2.0,Mount Olive-Silverstone-Jamestown,12283.78952,4048,31,28,15,31,65,87
2,3.0,Thistletown-Beaumond Heights,11949.80695,1238,25,18,15,16,54,64
3,4.0,Rexdale-Kipling,8148.922025,858,22,19,10,14,48,52
4,5.0,Elms-Old Rexdale,10279.18782,972,24,23,13,18,50,67


In [19]:
cov_data = pd.DataFrame(cov_data)
cov_data

Unnamed: 0,NeighbourhoodID,Neighbourhood_Name,Rate,Case_Count,Low_Edu,Low_Inc,MultiGen_House,Crowded_House,Immigrants,Vis_Minorities
0,1.0,West Humber-Clairville,12250.840540,4081,21,16,20,17,58,82
1,2.0,Mount Olive-Silverstone-Jamestown,12283.789520,4048,31,28,15,31,65,87
2,3.0,Thistletown-Beaumond Heights,11949.806950,1238,25,18,15,16,54,64
3,4.0,Rexdale-Kipling,8148.922025,858,22,19,10,14,48,52
4,5.0,Elms-Old Rexdale,10279.187820,972,24,23,13,18,50,67
...,...,...,...,...,...,...,...,...,...,...
135,136.0,West Hill,6929.030374,1898,22,25,11,12,50,63
136,137.0,Woburn,8632.326821,4617,20,27,14,19,57,76
137,138.0,Eglinton East,8210.396909,1870,22,26,11,23,58,78
138,139.0,Scarborough Village,11055.967470,1849,23,32,8,22,55,73


In [20]:
cov_data.dtypes

NeighbourhoodID       float64
Neighbourhood_Name     object
Rate                  float64
Case_Count              int64
Low_Edu                 int64
Low_Inc                 int64
MultiGen_House          int64
Crowded_House           int64
Immigrants              int64
Vis_Minorities          int64
dtype: object

# The Data Look Good!
imho ;)