In [0]:
from google.colab import drive
drive.mount('/content/drive')

In [0]:
%cd /content/drive/Shared\ drives/Covid-19/

In [0]:
import datetime
import time
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing
from xgboost import plot_importance, plot_tree

## Build the hospitalization DataFrame (`df_hosp`)

In [0]:
# Load the daily per-state data and the abbreviation -> state-name lookup table.
df_hosp = pd.read_csv('datasets/us_states_covid19_daily.csv')
df_abbrev = pd.read_csv('datasets/states2abbrev.csv')

# Abbreviation -> full name. The name column's header in the CSV carries a
# leading space (' NAME'), so the key below must keep it.
map_abbrev = dict(zip(df_abbrev['ABBREVIATION'], df_abbrev[' NAME']))

# The original per-row dict lookup raised KeyError on an unmapped abbreviation;
# keep failing loudly instead of letting .map() silently produce NaN.
unmapped = [abbr for abbr in df_hosp['state'].unique() if abbr not in map_abbrev]
if unmapped:
    raise KeyError(f"No full state name for abbreviation(s): {unmapped}")

# Whole-column replacement. The previous loop first did a chained
# `df_hosp.iloc[index]['state'] = ...`, which writes to a temporary object, and
# then repeated the assignment through .loc one row at a time.
df_hosp['state'] = df_hosp['state'].map(map_abbrev)

# Reformat the date column from YYYYMMDD to YYYY-MM-DD.
date_text = df_hosp['date'].astype(str)
df_hosp['date'] = date_text.str[0:4] + "-" + date_text.str[4:6] + "-" + date_text.str[6:8]

df_hosp.to_csv('datasets/df_hosp.csv', index=False)

## Build the enriched DataFrame (population, smoking and country-level features)

In [0]:
# Training set; the Date column is converted to datetime64 right after loading.
df = pd.read_csv('datasets/train.csv').assign(
    Date=lambda frame: pd.to_datetime(frame['Date'])
)
# Last distinct date in order of appearance in the file.
train_last_date = df['Date'].unique()[-1]
print("Dataset has training data untill : {}".format(train_last_date))

In [0]:
wpop = pd.read_csv('datasets/WPP2019_PopulationByAgeSex_Medium.csv')

# Maps location names used in the UN WPP population file to the names that the
# training set uses in Country_Region. Names absent from this dict are left
# unchanged by rename_countries().
country_mapper = {
    'Iran (Islamic Republic of)': "Iran",
    'Bolivia (Plurinational State of)': 'Bolivia',
    'Brunei Darussalam': 'Brunei',
    # UN "Congo" is the Republic of the Congo (capital Brazzaville); the
    # Democratic Republic of the Congo has its capital in Kinshasa. The two
    # targets were previously swapped.
    'Congo': "Congo (Brazzaville)",
    'Democratic Republic of the Congo': "Congo (Kinshasa)",
    "Côte d'Ivoire": "Cote d'Ivoire",
    "Republic of Korea": "Korea, South",
    "Republic of Moldova": "Moldova",
    'Réunion': "Reunion",
    'Russian Federation': "Russia",
    'China, Taiwan Province of China': "Taiwan*",
    "United Republic of Tanzania": "Tanzania",
    "Bahamas": "The Bahamas",
    # Single entry: an earlier duplicate key ("Gambia" -> "Gambia, The") was
    # always overridden by this one, so it has been removed.
    "Gambia": "The Gambia",
    "United States of America (and dependencies)": "US",
    "Venezuela (Bolivarian Republic of)": "Venezuela",
    'Viet Nam': "Vietnam",
}

def rename_countries(x, country_dict):
    """Return the replacement for ``x`` found in ``country_dict``, else ``x``.

    A key that is missing, or whose mapped value is ``None``, leaves ``x``
    unchanged.
    """
    replacement = country_dict.get(x)
    return x if replacement is None else replacement

# Keep only the rows for 2020 and align location names with the training set.
wpop = wpop[wpop['Time'] == 2020].reset_index(drop=True)
wpop['Location'] = wpop['Location'].apply(lambda x: rename_countries(x, country_mapper))
train_countries = df['Country_Region'].unique()
clean_wpop = wpop[wpop['Location'].isin(train_countries)].reset_index()

# One record per country: the PopTotal value of every age group.
population_distribution = []
for country, gpdf in clean_wpop.groupby("Location"):
    aux = {f"age_{age_grp}": tot for age_grp, tot in zip(gpdf.AgeGrp, gpdf.PopTotal)}
    aux["Country_Region"] = country
    population_distribution.append(aux)

df_pop_distrib = pd.DataFrame(population_distribution)

# add missing countries with median values
# The median profile is the same for every missing country, so it is computed
# once instead of once per country inside the loop.
median_profile = df_pop_distrib.drop('Country_Region', axis=1).median(axis=0).to_dict()
countries_with_data = set(df_pop_distrib['Country_Region'])
no_data = [
    {**median_profile, "Country_Region": country}
    for country in train_countries
    if country not in countries_with_data
]
df_no_data = pd.DataFrame(no_data)

# ignore_index gives the combined frame a unique index; without it both parts
# start at 0 and the index-aligned operations below run on duplicate labels.
df_pop_distrib = pd.concat([df_pop_distrib, df_no_data], axis=0, ignore_index=True)

# normalize features: each age group becomes a share of the row total
age_counts = df_pop_distrib.drop("Country_Region", axis=1)
row_totals = age_counts.sum(axis=1)
norm_pop_distrib = age_counts.div(row_totals, axis=0)
norm_pop_distrib['total_pop'] = row_totals
norm_pop_distrib["Country_Region"] = df_pop_distrib["Country_Region"]

del df_pop_distrib
del df_no_data
del clean_wpop
del wpop

# NOTE: this rebinds df; running the cell a second time without reloading df
# would merge the same columns again.
df = df.merge(norm_pop_distrib, on="Country_Region", how='left')

In [0]:
smokers = pd.read_csv('datasets/share-of-adults-who-smoke.csv')
smokers = smokers[smokers.Year == 2016].reset_index(drop=True)

smoking_col = 'Smoking prevalence, total (ages 15+) (% of adults)'

# Entity name in the smoking file -> Country_Region name in the training set.
smokers_country_dict = {
    # NOTE(review): a regional aggregate is used as the US value — confirm the
    # file has no dedicated United States row that should be used instead.
    'North America': "US",
    'Gambia': "The Gambia",
    'Bahamas': "The Bahamas",
    # These two keys previously contained literal quote characters
    # ("'South Korea'", "'Czech Republic'") and therefore could never match.
    'South Korea': "Korea, South",
    'Czech Republic': "Czechia",
    'Congo': "Congo (Brazzaville)",
    # 'Papua New Guinea' -> "Guinea" was removed: it relabelled one country's
    # figure as another country's and could produce a second "Guinea" row.
}

smokers['Entity'] = smokers['Entity'].apply(lambda x: rename_countries(x, smokers_country_dict))

# Countries without a smoking figure receive the mean of the available ones.
# The mean does not depend on the country, so it is computed once.
mean_prevalence = smokers[smoking_col].mean()
entities_with_data = set(smokers['Entity'])
no_datas_smoker = [
    {smoking_col: mean_prevalence, 'Entity': country}
    for country in df['Country_Region'].unique()
    if country not in entities_with_data
]
no_data_smoker_df = pd.DataFrame(no_datas_smoker)

clean_smoke_data = (
    pd.concat([smokers, no_data_smoker_df], axis=0)[['Entity', smoking_col]]
    .rename(columns={"Entity": "Country_Region", smoking_col: "smokers_perc"})
    # One row per country, so the left merge below cannot multiply rows of df.
    .drop_duplicates(subset="Country_Region", keep="first")
)

df = df.merge(clean_smoke_data, on="Country_Region", how='left')

In [0]:
def concat_country_province(country, province):
    """Build the combined region key.

    Returns ``country + "_" + province`` when ``province`` is a string;
    any non-string province (for example a NaN float) yields ``country`` alone.
    """
    if isinstance(province, str):
        return "_".join((country, province))
    return country

# Concatenate region and province for training.
# Each row is addressed by column label: integer keys (x[0], x[1]) on a
# label-indexed Series are deprecated positional access in pandas 2.x.
# NOTE: this overwrites Country_Region in place, so running the cell twice
# appends the province suffix twice.
df["Country_Region"] = df[["Country_Region", "Province_State"]].apply(
    lambda row: concat_country_province(row["Country_Region"], row["Province_State"]),
    axis=1,
)

In [0]:
# NOTE(review): absolute path into a personal Drive folder, unlike the relative
# 'datasets/...' paths used by the other cells — confirm it is reachable.
country_info = pd.read_csv('/content/drive/My Drive/covid-19-nachi/week4/countryinfo/covid19countryinfo.csv')
country_info = country_info[~country_info.country.isnull()].reset_index(drop=True)
country_info.drop([c for c in country_info.columns if c.startswith("Unnamed")], axis=1, inplace=True)
country_info.drop(columns=['pop', 'sex0', 'sex14', 'sex25', 'sex54', 'sex64', 'sex65plus', 'medianage', "smokers", "sexratio"],
                  inplace=True)
# Columns with dates
country_info["quarantine"] = pd.to_datetime(country_info["quarantine"])
# country_info["restrictions"] = pd.to_datetime(country_info["restrictions"])
country_info["schools"] = pd.to_datetime(country_info["schools"])

# Provinces/states of the training set that appear under their own name in
# country_info.country. (Names that do not match exactly are skipped; a fuzzy
# matcher could be used here to find candidates for them.)
info_names = set(country_info.country.unique())
same_state = [state for state in df["Province_State"].unique() if state in info_names]

# state name -> the "<country>_<state>" key used in df["Country_Region"]
country_to_state_country = {}
for state in same_state:
    region = df[df["Province_State"] == state]["Country_Region"].unique()[0]
    suffix = "_" + state
    # Country_Region already ends in "_<state>" once the concatenation cell has
    # run; appending the suffix unconditionally produced keys such as
    # "US_Alabama_Alabama" that never matched in the merge below.
    country_to_state_country[state] = region if region.endswith(suffix) else region + suffix

country_info['country'] = country_info.country.apply(lambda x: rename_countries(x, country_to_state_country))

cols_median = ["density", "urbanpop", "hospibed", "lung", "femalelung", "malelung"]
# .copy() so the fill below writes to an independent frame, not a slice of country_info
coutry_merge_info = country_info[["country"] + cols_median].copy()
coutry_merge_info.loc[:, cols_median] = coutry_merge_info.loc[:, cols_median].apply(lambda x: x.fillna(x.median()), axis=0)


merged = df.merge(coutry_merge_info, left_on="Country_Region", right_on="country", how="left")
# Regions with no match in country_info get the column median as well.
merged.loc[:, cols_median] = merged.loc[:, cols_median].apply(lambda x: x.fillna(x.median()), axis=0)

country_dates_info = country_info[["country", "quarantine", "schools"]]

def update_dates(a_df, col_update):
    """
    Replace a sparse marker column by a per-region running count, in place.

    For every row, ``a_df[col_update]`` becomes the number of non-null values
    of that column seen so far (the row itself included) among the rows with
    the same ``Country_Region``, in row order. With a single non-null marker
    per region this gives 0 before the marker row and 1 from it onwards.

    The count is computed with an index-aligned grouped cumsum, so each row
    receives its own value whatever the row order. The previous implementation
    concatenated the groups in sorted key order and assigned by position, which
    is only correct when the frame is already sorted by ``Country_Region``.

    Returns None; ``a_df`` is modified.
    """
    has_marker = a_df[col_update].notnull().astype(int)
    a_df[col_update] = has_marker.groupby(a_df["Country_Region"]).cumsum()


# For each measure, attach its start date to the matching (region, date) row,
# then let update_dates() rewrite the column as a running per-region count.
for col in ("quarantine", "schools"):
    print(merged.shape)
    start_dates = country_dates_info[["country", col]]
    merged = merged.merge(
        start_dates,
        how="left",
        left_on=["Country_Region", "Date"],
        right_on=["country", col],
    )
    update_dates(merged, col)

# Remove the suffixed "country_*" key columns left behind by the merges.
drop_country_cols = [name for name in merged.columns if name.startswith("country_")]
merged = merged.drop(columns=drop_country_cols)

In [0]:
merged=merged[merged['Country_Region'].str.startswith('US_')]
merged

In [0]:
merged.to_csv('datasets/enriched_covid_19_week_418.csv', index=None)

## Build `df_merged` (daily US state counts joined with lockdown-policy features)

In [0]:
df_train=pd.read_csv("datasets/train.csv")

In [0]:
df_US=df_train[df_train['Country_Region']=='US']

In [0]:
df_lockdown=pd.read_csv("datasets/USA-COVID19LockdownData.csv")

In [0]:
# Column layout of the per-state, per-day table; the frame starts out empty.
merged_df_columns = [
    'Date', 'Date_index', 'State',
    'Confirmed', 'Deaths', 'Cumulative Confirmed', 'Cumulative Deaths',
    'State of Emergency Declared', 'Stay at home ordered',
    'Gatherings banned', 'Out-of-state Travel Restrictions',
    'Schools closed', 'Daycares Closed',
    'Bars and Restaurants Closed', 'Non-essential retails closed',
]
df_merged = pd.DataFrame(columns=merged_df_columns)
df_merged
#rough work area below
# Scratch checks of the string handling used when parsing lockdown dates.
string = "2020/03/03"
regexp = re.compile(r'[0-9]{4}/[0-9]{2}/[0-9]{2}')
print('matched' if regexp.search(string) else "not")
str1 = "0 March 26 adv"
str1_words = str1.split()
str2 = str1_words[2] + "-" + str1_words[1][0:3]
str2
strrt = "asfd-vzcx"
strrt.split('-')[1]
# print(time.mktime(datetime.datetime.strptime(string.replace("-","/"), "%Y/%m/%d").timetuple()))

matched


'vzcx'

In [0]:
def getMonthNum(monthName):
  """Return the two-digit month number ('01'..'12') for a month abbreviation.

  Accepts exactly the twelve three-letter English abbreviations 'Jan'..'Dec';
  any other key raises KeyError.
  """
  abbreviations = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')
  number_by_abbreviation = {
      abbreviation: "%02d" % position
      for position, abbreviation in enumerate(abbreviations, start=1)
  }
  return number_by_abbreviation[monthName]

# Build df_merged: one row per (US state, date) of df_US with daily new counts,
# cumulative counts and lockdown-policy features taken from df_lockdown.
# Relies on df_US, df_lockdown, getMonthNum and the empty df_merged frame
# (used only for its column layout) defined in earlier cells.

# First word of the 'Gatherings banned' cell -> code. A row is skipped when the
# emergency is in effect and the word is not listed here.
GATHERING_BAN_CODES = {'All': 1, '6': 2, '10': 3, '11': 4, '25': 10, '50': 20, 'No': 20}
# First word of the 'Out-of-state Travel Restrictions' cell -> code. Words not
# listed here are stored unchanged.
TRAVEL_RESTRICTION_CODES = {'Mandatory': 1, 'Travel': 2, 'Limited': 3, 'Screened': 4, 'No': 20}
# Yes/other columns, in the order they appear in the output row.
CLOSURE_COLUMNS = ['Schools closed', 'Daycares Closed',
                   'Bars and Restaurants Closed', 'Non-essential retails closed']


def _to_timestamp(date_text):
    """Convert a 'YYYY/MM/DD' string to local-time epoch seconds."""
    return time.mktime(datetime.datetime.strptime(date_text, "%Y/%m/%d").timetuple())


def _day_month_to_timestamp(day_month):
    """Convert a 'DD-Mon' word such as '26-Mar' to the timestamp of that day in 2020."""
    parts = day_month.split('-')
    return _to_timestamp("2020/" + getMonthNum(parts[1]) + "/" + parts[0])


def _lockdown_words(state, column):
    """Whitespace-split printed form of df_lockdown[column] for one state.

    State names in df_lockdown are matched with a leading non-breaking space.
    Word [1] is the first word of the state's cell (word [0] is the row label).
    NOTE(review): when no row matches, the printed empty Series presumably
    yields 'Name:' at [1], which falls through to the default branches below.
    """
    return str(df_lockdown[df_lockdown['State'] == ('\xa0' + state)][column]).split()


def _state_policy(state):
    """Collect the date-independent lockdown facts for one state."""
    emergency_word = _lockdown_words(state, 'State of Emergency Declared')[1]
    # A word without '-' is not a 'DD-Mon' date and gives timestamp 0, so the
    # emergency counts as in effect on every date. The previous comparison with
    # "No information available" could never be true for a single word, which
    # left the timestamp of an earlier row/state in place.
    emergency_ts = _day_month_to_timestamp(emergency_word) if "-" in emergency_word else 0
    if state == 'Virgin Islands':
        # Hard-coded date for this territory (its rows are dropped at the end).
        emergency_ts = _to_timestamp('2020/03/23')

    stay_words = _lockdown_words(state, 'Stay at home ordered')
    stay_word = stay_words[1]
    if stay_word in ("March", "April"):
        # 'March 26' style cell -> '26-Mar'
        stay_word = stay_words[2] + "-" + stay_word[0:3]
    if stay_word not in ("No", "Partial", "Regional") and "-" in stay_word:
        stay_ts = _day_month_to_timestamp(stay_word)
    else:
        stay_ts = 0

    gathering_word = _lockdown_words(state, 'Gatherings banned')[1]
    travel_word = _lockdown_words(state, 'Out-of-state Travel Restrictions')[1]
    return {
        'emergency_ts': emergency_ts,
        'stay_ts': stay_ts,
        'gathering': GATHERING_BAN_CODES.get(gathering_word),
        'travel': TRAVEL_RESTRICTION_CODES.get(travel_word, travel_word),
        # 0 when the cell's first word is "Yes", 1 otherwise
        'closures': [0 if _lockdown_words(state, column)[1] == "Yes" else 1
                     for column in CLOSURE_COLUMNS],
    }


policy_by_state = {}   # each state's policy is looked up once, not once per row
records = []           # rows are collected here and turned into a frame once
prev_state = ""
first_day_ts = _to_timestamp(df_US.iloc[0]['Date'].replace("-", "/"))
for state, date, confirmed, fatalities in zip(
        df_US['Province_State'], df_US['Date'], df_US['ConfirmedCases'], df_US['Fatalities']):
    # Restart the running totals whenever a new state's block of rows begins.
    if state != prev_state:
        prev_confirmed = 0
        prev_deaths = 0
    prev_state = state

    day = str(date).replace("-", "/")
    day_ts = _to_timestamp(day)

    if state not in policy_by_state:
        policy_by_state[state] = _state_policy(state)
    policy = policy_by_state[state]

    if day_ts < policy['emergency_ts']:
        emergency_flag = 0
        gathering_ban = 0
        travel_restriction = 0
    else:
        if policy['gathering'] is None:
            # Unrecognised gatherings value: the row is left out, and the
            # running totals are not advanced (same as before).
            continue
        emergency_flag = 1
        gathering_ban = policy['gathering']
        travel_restriction = policy['travel']

    if day_ts < policy['stay_ts']:
        stay_flag = 0
        closures = [1, 1, 1, 1]
    else:
        stay_flag = 1
        closures = policy['closures']

    # 1 for the first date of df_US, then +1 per day; round() absorbs the
    # one-hour shifts that local-time timestamps have around DST changes.
    date_index = round((day_ts - first_day_ts - 1) / 86400) + 1

    records.append([day.replace("/", "-"), date_index, state,
                    confirmed - prev_confirmed, fatalities - prev_deaths,
                    confirmed, fatalities,
                    emergency_flag, stay_flag, gathering_ban, travel_restriction,
                    *closures])
    prev_confirmed = confirmed
    prev_deaths = fatalities
#
df_merged = pd.DataFrame(records, columns=df_merged.columns)
df_merged=df_merged.drop(df_merged[df_merged['State']=='Virgin Islands'].index)
# graph thingy practice below
# x = df_merged['Gatherings banned'].values.reshape(-1,1) #returns a numpy array
# min_max_scaler = preprocessing.MinMaxScaler()
# x_scaled = min_max_scaler.fit_transform(x)
# df_merged['Gatherings banned'] = pd.DataFrame(x_scaled)
# #
# x = df_merged['Out-of-state Travel Restrictions'].values.reshape(-1,1) #returns a numpy array
# min_max_scaler = preprocessing.MinMaxScaler()
# x_scaled = min_max_scaler.fit_transform(x)
# df_merged['Out-of-state Travel Restrictions'] = pd.DataFrame(x_scaled)

In [0]:
df_merged

Unnamed: 0,Date,Date_index,State,Confirmed,Deaths,Cumulative Confirmed,Cumulative Deaths,State of Emergency Declared,Stay at home ordered,Gatherings banned,Out-of-state Travel Restrictions,Schools closed,Daycares Closed,Bars and Restaurants Closed,Non-essential retails closed
0,2020-01-22,1,Alabama,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0
1,2020-01-23,2,Alabama,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0
2,2020-01-24,3,Alabama,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0
3,2020-01-25,4,Alabama,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0
4,2020-01-26,5,Alabama,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4667,2020-04-13,83,Wyoming,5.0,1.0,275.0,1.0,1,1,3,20,0,0,0,1
4668,2020-04-14,84,Wyoming,7.0,0.0,282.0,1.0,1,1,3,20,0,0,0,1
4669,2020-04-15,85,Wyoming,5.0,0.0,287.0,1.0,1,1,3,20,0,0,0,1
4670,2020-04-16,86,Wyoming,9.0,1.0,296.0,2.0,1,1,3,20,0,0,0,1


In [0]:
df_merged.to_csv('datasets/df_merged.csv', sep='\t', index=False)

## US state demographics DataFrame (`df_states`)

In [0]:
df_all_demo=pd.read_csv('datasets/us-states-demographic.csv')
df_states=df_all_demo[df_all_demo['Country_Region']=='US']

In [0]:
df_states=df_states.drop(columns=['Country_Region','continent'])

In [0]:
df_states=df_states.rename(columns={'Province_State':'State', 'lat':'Lat', 'lon':'Long', 'population':'Population', 'area':'Area', 'density':'Density'})

In [0]:
df_states.reset_index(drop=True, inplace=True)

## Build the final dataset by merging `df_merged` with `df_states` (the merge with the enriched DataFrame is kept below as commented-out code because it is redundant)

In [0]:
# --merging with enriched df..only for getting cumulative confirmed and deaths. Not necessary now. Redundant.
# df_merged=pd.read_csv('datasets/df_merged.csv', delimiter='\t')
# df_enriched=pd.read_csv('datasets/enriched_covid_19_week_418.csv')
# df_merged.set_index(['State','Date'],inplace=True)
# df_enriched = df_enriched.rename(columns = {'Province_State':'State', 'ConfirmedCases':'Cumulative Confirmed', 'Fatalities':'Cumulative Deaths'})
# df_enriched.set_index(['State','Date'],inplace=True)
# merged_dataset = pd.merge(df_merged, df_enriched, how='right', left_index=True, right_index=True)
# merged_dataset=merged_dataset.reset_index()
# merged_dataset=merged_dataset.drop(merged_dataset[merged_dataset['State']=='Virgin Islands'].index)
# merged_dataset=merged_dataset.drop(columns=['age_0-4', 'age_5-9',
#        'age_10-14', 'age_15-19', 'age_20-24', 'age_25-29', 'age_30-34',
#        'age_35-39', 'age_40-44', 'age_45-49', 'age_50-54', 'age_55-59',
#        'age_60-64', 'age_65-69', 'age_70-74', 'age_75-79', 'age_80-84',
#        'age_85-89', 'age_90-94', 'age_95-99', 'age_100+', 'total_pop',
#        'smokers_perc', 'density', 'urbanpop', 'hospibed', 'lung', 'femalelung',
#        'malelung', 'quarantine', 'country', 'schools', 'Id', 'Country_Region'])
# merged_dataset.columns


Index(['State', 'Date', 'Confirmed', 'Deaths', 'State of Emergency Declared',
       'Stay at home ordered', 'Gatherings banned',
       'Out-of-state Travel Restrictions', 'Schools closed', 'Daycares Closed',
       'Bars and Restaurants Closed', 'Non-essential retails closed',
       'Cumulative Confirmed', 'Cumulative Deaths'],
      dtype='object')

In [0]:
# Reload the tab-separated per-state table and attach the state demographics.
df_merged = pd.read_csv('datasets/df_merged.csv', sep='\t')
merged_dataset = df_merged.merge(df_states, on='State', how='left')
# Drop any Virgin Islands rows; the remaining rows keep their index labels.
virgin_islands_rows = merged_dataset.index[merged_dataset['State'] == 'Virgin Islands']
merged_dataset = merged_dataset.drop(index=virgin_islands_rows)

In [0]:
merged_dataset.to_csv('datasets/final_dataset.csv', sep='\t', index=False)

In [0]:
merged_dataset

Unnamed: 0,Date,Date_index,State,Confirmed,Deaths,Cumulative Confirmed,Cumulative Deaths,State of Emergency Declared,Stay at home ordered,Gatherings banned,Out-of-state Travel Restrictions,Schools closed,Daycares Closed,Bars and Restaurants Closed,Non-essential retails closed,Lat,Long,Population,Area,Density
0,2020-01-22,1,Alabama,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,32.3182,-86.9023,4903185,135767,36.11
1,2020-01-23,2,Alabama,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,32.3182,-86.9023,4903185,135767,36.11
2,2020-01-24,3,Alabama,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,32.3182,-86.9023,4903185,135767,36.11
3,2020-01-25,4,Alabama,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,32.3182,-86.9023,4903185,135767,36.11
4,2020-01-26,5,Alabama,0.0,0.0,0.0,0.0,0,1,0,0,0,0,0,0,32.3182,-86.9023,4903185,135767,36.11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4606,2020-04-13,83,Wyoming,5.0,1.0,275.0,1.0,1,1,3,20,0,0,0,1,42.7560,-107.3025,578759,253335,2.28
4607,2020-04-14,84,Wyoming,7.0,0.0,282.0,1.0,1,1,3,20,0,0,0,1,42.7560,-107.3025,578759,253335,2.28
4608,2020-04-15,85,Wyoming,5.0,0.0,287.0,1.0,1,1,3,20,0,0,0,1,42.7560,-107.3025,578759,253335,2.28
4609,2020-04-16,86,Wyoming,9.0,1.0,296.0,2.0,1,1,3,20,0,0,0,1,42.7560,-107.3025,578759,253335,2.28


## Visualization experiments (scratch work from learning the plotting APIs)

In [0]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15,6))

# Both panels use the California rows of df_merged. The right-hand panel
# previously read `all_data` and `country_dict`, which this notebook never
# defines, and both titles said "Spain" although California was plotted.
california = df_merged[df_merged['State'] == 'California']

# Left: daily new confirmed cases
y1 = california[['Confirmed']]
x1 = range(0, len(y1))
ax1.plot(x1, y1, 'bo--')
ax1.set_title("California daily ConfirmedCases")
ax1.set_xlabel("Days")
ax1.set_ylabel("ConfirmedCases")

# Right: log of the cumulative confirmed cases. Days with a zero count are
# left out because log(0) is -inf.
y2 = california.loc[california['Cumulative Confirmed'] > 0, ['Cumulative Confirmed']].apply(lambda x: np.log(x))
x2 = range(0, len(y2))
ax2.plot(x2, y2, 'bo--')
ax2.set_title("California Log cumulative ConfirmedCases")
ax2.set_xlabel("Days with at least one confirmed case")
ax2.set_ylabel("Log ConfirmedCases")

In [0]:
data1.shape

In [0]:
data1.head

In [0]:
data1.describe()

In [0]:
data1.columns

In [0]:
# TODO: the original line was `traindata=` with no right-hand side, which is a
# SyntaxError. Placeholder until the intended value is known.
traindata = None

In [0]:
print("Number of Country_Region : ", data1['Country/Region'])

In [0]:
data2=data1[data1['Country/Region']=='US']

In [0]:
data2

In [0]:
set(data2['Deaths'])

In [0]:
deaths_total = data2.groupby(['Date']).agg(['sum'])

In [0]:
deaths=data2[['Date', 'Deaths']]

In [0]:
confirmed_total=data2.groupby(['Date']).agg({'Confirmed':['sum']})
confirmed=data2[['Date','Confirmed']]

In [0]:
confirmed_total

In [0]:
total_date = deaths.join(confirmed)

In [0]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(17,7))

In [0]:
deaths.plot(ax=ax1)
ax1.set_title("Global death cases")
ax1.set_ylabel("Number of cases")
ax1.set_xlabel("Date")
confirmed.plot(ax=ax2, color='orange')
ax2.set_title("Global confirmed cases", size=13)
ax2.set_ylabel("Number of cases", size=13)
ax2.set_xlabel("Date", size=13)

In [0]:
fig

In [0]:
dataset.shape

In [0]:
dataset.head()

In [0]:
dataset.describe()

In [0]:
# set(dataset.Country_Region)
US_dataset = dataset[dataset['Country_Region']=='US']

In [0]:
US_dataset.head()

In [0]:
# set(US_dataset.Date)
set(US_dataset.ConfirmedCases)

In [0]:
US_dataset.shape

In [0]:
# Daily new confirmed cases: difference of the cumulative count to the previous
# row of the same state, with 0 for each state's first row. US_dataset stacks
# all states, so differencing consecutive rows of the whole frame (as before)
# subtracted one state's last total from the next state's first.
confirmed_cases = (
    US_dataset.groupby('Province_State')['ConfirmedCases']
    .diff()
    .fillna(0)
    .tolist()
)
len(confirmed_cases)

In [0]:
set(US_dataset['Province_State'])

In [0]:
dates= sorted(set(US_dataset['Date']))

In [0]:
import numpy as np

# Y: total confirmed cases over all US rows for each date (in date order),
# keeping only the dates whose total is non-zero.
daily_totals = [sum(US_dataset[US_dataset['Date'] == day]['ConfirmedCases']) for day in dates]
Y = np.array([total for total in daily_totals if total != 0])

# X: 0, 1, 2, ... with one entry per element of Y. The length was previously
# hard-coded to 19, which only matches Y for one particular snapshot of the data.
X = np.arange(len(Y))




In [0]:
from sklearn.linear_model import LinearRegression
regressor = LinearRegression()
regressor.fit(X.reshape(-1,1),Y)
# regressor.predict()

In [0]:
from sklearn.preprocessing import PolynomialFeatures
X_poly = PolynomialFeatures(2).fit_transform(X.reshape(-1,1))

In [0]:
X_poly[:,2]

In [0]:
regressor.predict(X_poly[:2].reshape(-1,1))

In [0]:
import seaborn as sea

# Red: the linear model's fitted values at the same X it was trained on.
# Blue: the observed totals.
# x and y are passed by keyword (current seaborn no longer accepts them
# positionally) and both series have len(X) points. The previous call plotted
# the first two rows of X_poly (a 2x3 array) against six predictions.
fitted = regressor.predict(X.reshape(-1, 1))
sea.scatterplot(x=X, y=fitted, color='red')
sea.scatterplot(x=X, y=Y, color='blue')