In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# DATA CLEANING FOR CASES, DEATHS AND RECOVERED CASES  - MODEL 1 AND 2

In [2]:
# DATASET CONTAINS NUMBER OF CONFIRMED CASES, DEATHS AND RECOVERED CASES AS OF 3/22/20 FOR ALL COUNTRIES/REGIONS. 
df = pd.read_csv('../csv-files/CASES.csv')
df.head()

Unnamed: 0,FIPS,Admin2,Province_State,Country_Region,Last_Update,Lat,Long_,Confirmed,Deaths,Recovered,Active,Combined_Key
0,45001.0,Abbeville,South Carolina,US,2020-03-27 22:14:55,34.223334,-82.461707,4,0,0,0,"Abbeville, South Carolina, US"
1,22001.0,Acadia,Louisiana,US,2020-03-27 22:14:55,30.295065,-92.414197,8,1,0,0,"Acadia, Louisiana, US"
2,51001.0,Accomack,Virginia,US,2020-03-27 22:14:55,37.767072,-75.632346,2,0,0,0,"Accomack, Virginia, US"
3,16001.0,Ada,Idaho,US,2020-03-27 22:14:55,43.452658,-116.241552,54,0,0,0,"Ada, Idaho, US"
4,19001.0,Adair,Iowa,US,2020-03-27 22:14:55,41.330756,-94.471059,1,0,0,0,"Adair, Iowa, US"


In [3]:
cases = df[["Country_Region", "Confirmed", "Deaths", "Recovered"]]
cases = cases.rename(columns={"Country_Region":"COUNTRY", "Confirmed":"CONFIRMED CASES", 
                              "Deaths":"DEATHS", "Recovered":"RECOVERED CASES"})
cases.head()

Unnamed: 0,COUNTRY,CONFIRMED CASES,DEATHS,RECOVERED CASES
0,US,4,0,0
1,US,8,1,0
2,US,2,0,0
3,US,54,0,0
4,US,1,0,0


In [4]:
# GROUP BY COUNTRY AND COUNT THE TOTAL AMOUNT OF CASES IN EACH COUNTRY
casesFinal = cases.groupby(['COUNTRY']).sum()
allcases = casesFinal.fillna(0)
allcases

Unnamed: 0_level_0,CONFIRMED CASES,DEATHS,RECOVERED CASES
COUNTRY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Afghanistan,110,4,2
Albania,186,8,31
Algeria,409,26,29
Andorra,267,3,1
Angola,4,0,0
...,...,...,...
Venezuela,107,1,31
Vietnam,163,0,20
West Bank and Gaza,91,1,17
Zambia,22,0,0


In [5]:
# SAVE DATA AS CSV FILE FOR TABLEAU CHARTS
allcases.to_csv(r'../csv-files/output_data/cases.csv')

# CONFIRMED CASES AND DEATHS - MODEL 1

In [6]:
# MODEL TO ANALYZE RELATIONSHIP BETWEEN THE NUMBER OF CONFIRMED CASES 
# AND THE TOTAL NUMBER OF DEATHS CASES PER COUNTRY

In [7]:
X = allcases["CONFIRMED CASES"].values.reshape(-1, 1)
y = allcases["DEATHS"].values.reshape(-1, 1)

print("Shape: ", X.shape, y.shape)

('Shape: ', (176, 1), (176, 1))


In [8]:
# Split the data into training and testing
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [9]:
from sklearn.preprocessing import StandardScaler

# Create a StandardScater model and fit it to the training data
X_scaler = StandardScaler().fit(X_train)
y_scaler = StandardScaler().fit(y_train)



In [10]:
# Transform the training and testing data using the X_scaler and y_scaler models
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)
y_train_scaled = y_scaler.transform(y_train)
y_test_scaled = y_scaler.transform(y_test)

In [11]:
# Create a LinearRegression model and fit it to the scaled training data
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X_train_scaled, y_train_scaled)

training_score = model.score(X_train_scaled, y_train_scaled)
training_score

0.6529045690174721

# CONFIRMED CASES AND RECOVERED CASES - MODEL 2

In [12]:
# MODEL TO ANALYZE RELATIONSHIP BETWEEN THE NUMBER OF CONFIRMED CASES 
# AND THE TOTAL NUMBER OF RECOVERED CASES PER COUNTRY

In [13]:
# Assign the data to X and y

X1 = allcases["CONFIRMED CASES"].values.reshape(-1, 1)
y1 = allcases["RECOVERED CASES"].values.reshape(-1, 1)

print("Shape: ", X1.shape, y1.shape)

('Shape: ', (176, 1), (176, 1))


In [14]:
# Split the data into training and testing
from sklearn.model_selection import train_test_split
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, random_state=42)

In [15]:
from sklearn.preprocessing import StandardScaler

# Create a StandardScater model and fit it to the training data
X1_scaler = StandardScaler().fit(X1_train)
y1_scaler = StandardScaler().fit(y1_train)

In [16]:
# Transform the training and testing data using the X_scaler and y_scaler models
X1_train_scaled = X1_scaler.transform(X1_train)
X1_test_scaled = X1_scaler.transform(X1_test)
y1_train_scaled = y1_scaler.transform(y1_train)
y1_test_scaled = y1_scaler.transform(y1_test)

In [17]:
# Create a LinearRegression model and fit it to the scaled training data
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X1_train_scaled, y1_train_scaled)

training_score = model.score(X1_train_scaled, y1_train_scaled)
training_score

0.37253223124822454

# DATA CLEANING FOR GENDER COMPARISON - MODELS 3 AND 4

In [18]:
# FOR MODELS 3 AND 4 WE ARE COMPARING THE TOTAL FEMALE/MALE DEATHS TO THE TOTAL NUMBER OF DEATHS 
# FOR COUNTRIES CHINA, ITALY, GERMANY, IRAN, AND FRANCE

In [19]:
# Create a DataFrame of frames using a dictionary of lists
gender_df = pd.DataFrame({
    "COUNTRY": [1, 2, 3, 4, 5, 6],
    "COUNTRY_NAME": ["China", "France", "Germany", "Iran", "Italy", "Spain"],
    "TOTAL_DEATHS": [3296, 1997, 351, 2378, 9134, 5138],
    "MALE_DEATHS": [2109, 1158, 232, 1403, 6485, 3340],
    "FEMALE_DEATHS": [1187, 839, 119, 975, 2649, 1798]
})
gender_df

Unnamed: 0,COUNTRY,COUNTRY_NAME,FEMALE_DEATHS,MALE_DEATHS,TOTAL_DEATHS
0,1,China,1187,2109,3296
1,2,France,839,1158,1997
2,3,Germany,119,232,351
3,4,Iran,975,1403,2378
4,5,Italy,2649,6485,9134
5,6,Spain,1798,3340,5138


In [20]:
gender_df.to_csv(r'../csv-files/output_data/gender.csv')

# FEMALE DEATHS AND TOTAL DEATHS - MODEL 3

In [21]:
# Assign the data to X and y

X2 = gender_df["FEMALE_DEATHS"].values.reshape(-1, 1)
y2 = gender_df["TOTAL_DEATHS"].values.reshape(-1, 1)

print("Shape: ", X1.shape, y1.shape)

('Shape: ', (176, 1), (176, 1))


In [22]:
# Split the data into training and testing
from sklearn.model_selection import train_test_split
X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, random_state=42)

In [23]:
from sklearn.preprocessing import StandardScaler

# Create a StandardScater model and fit it to the training data
X2_scaler = StandardScaler().fit(X2_train)
y2_scaler = StandardScaler().fit(y2_train)

In [24]:
# Transform the training and testing data using the X_scaler and y_scaler models
X2_train_scaled = X2_scaler.transform(X2_train)
X2_test_scaled = X2_scaler.transform(X2_test)
y2_train_scaled = y2_scaler.transform(y2_train)
y2_test_scaled = y2_scaler.transform(y2_test)

In [25]:
# Create a LinearRegression model and fit it to the scaled training data
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X2_train_scaled, y2_train_scaled)

training_score = model.score(X2_train_scaled, y2_train_scaled)
training_score

0.9771478661316445

# MALE DEATHS AND TOTAL DEATHS - MODEL 4

In [26]:
# Assign the data to X and y

X3 = gender_df["MALE_DEATHS"].values.reshape(-1, 1)
y3 = gender_df["TOTAL_DEATHS"].values.reshape(-1, 1)

print("Shape: ", X1.shape, y1.shape)

('Shape: ', (176, 1), (176, 1))


In [27]:
# Split the data into training and testing
from sklearn.model_selection import train_test_split
X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, random_state=42)

In [28]:
from sklearn.preprocessing import StandardScaler

# Create a StandardScater model and fit it to the training data
X3_scaler = StandardScaler().fit(X3_train)
y3_scaler = StandardScaler().fit(y3_train)

In [29]:
# Transform the training and testing data using the X_scaler and y_scaler models
X3_train_scaled = X3_scaler.transform(X3_train)
X3_test_scaled = X3_scaler.transform(X3_test)
y3_train_scaled = y3_scaler.transform(y3_train)
y3_test_scaled = y3_scaler.transform(y3_test)

In [30]:
# Create a LinearRegression model and fit it to the scaled training data
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(X3_train_scaled, y3_train_scaled)

training_score = model.score(X3_train_scaled, y3_train_scaled)
training_score

0.9963894691655624