#CMPT353 summer 2020
#SFU
#Course Project
#Author: Wei Yao (yaoweiy@sfu.ca) Yiran Zhang (yza363@sfu.ca)
#Dataset 1: Covid-19
#Resource Ref: https://www.kaggle.com/kimjihoo/coronavirusdataset
#Dataset 2: COVID19 Global Forecasting
#Resource Ref: https://www.kaggle.com/c/covid19-global-forecasting-week-5

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
import plotly.express as px
from sklearn.pipeline import make_pipeline
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import chi2
from sklearn import preprocessing
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics

# 1. Cases vs Time

In [None]:
time_count = pd.read_csv('Time.csv')

In [None]:
time_count.head()

In [None]:
time_count.shape


In [None]:
# Drop rows with missing values and remove the unneeded 'time' column.
# dropna() returns a new frame, so the result has to be assigned back;
# the index is reset so later positional-style lookups (0..n-1) stay valid.
time_count = time_count.dropna().reset_index(drop=True)
# errors='ignore' keeps the cell re-runnable once 'time' is already gone
time_count = time_count.drop(columns=['time'], errors='ignore')
time_count

## 1.1. Total Cases vs Confirmed

In [None]:
print(time_count.head())
plt.figure(figsize=(12, 4))
plt.xticks(rotation=25)
plt.title('COVID-19 Confirmed Cases  VS time in 2020 ')
plt.xlabel('Time')
plt.ylabel('Number of Confirmed Cases')
plt.plot(time_count['date'],time_count['confirmed'])
plt.show()

In [None]:
# The x-axis looks weird with string dates, so convert the 'date' column to datetime and plot again for a clearer view
seaborn.set()
time_count['date']=pd.to_datetime(time_count['date'],format = '%Y-%m-%d')

plt.figure(figsize=(12, 4))
plt.xticks(rotation=25)
plt.title('COVID-19 Confirmed Cases  VS Time in 2020 ')
plt.xlabel('Time')
plt.ylabel('Number of Confirmed Cases')
plt.plot(time_count['date'],time_count['confirmed'])
plt.show()

In [None]:
# Add more details into the cases ( negative and total tested cases)
time_count['percentage'] =time_count['confirmed'] / time_count['test']
plt.figure(figsize=(12, 4))
plt.xticks(rotation=25)
plt.title('COVID-19 Confirmed Cases  VS time in 2020 ')
plt.xlabel('Time')
plt.ylabel('Number of Confirmed Cases')
plt.plot(time_count['date'],time_count['test']/100,'r.')
plt.plot(time_count['date'],time_count['negative']/100,'g.')
plt.plot(time_count['date'],time_count['confirmed'],'b.')
plt.legend(['Total Tested','Negative','Confirmed(Positive)'])
plt.show()

## 1.2. Percentage Ratio of Confirmed Cases Trends

In [None]:
#print(time_count)
plt.figure(figsize=(12, 4))
plt.title('COVID-19 Confirmed Cases in Percentage  VS time in 2020 ')
plt.xlabel('Time')
plt.ylabel('Percentage of Positive/Total Tested')
plt.plot(time_count['date'],time_count['percentage'])
plt.show()

## 1.3. Comparison of New Cases

In [None]:
color_list = ['#8DD3C7', '#FEFFB3', '#BFBBD9'
              , '#FA8174', '#81B1D2', '#FDB462'
              , '#B3DE69', '#BC82BD', '#CCEBC4']

def plot_lines(data, column_list, column_max, title):
    """
    Draw one line chart per start index in ``column_list``.

    For each start index ``i`` a new figure is created and the columns
    ``data.columns[i:column_max]`` are plotted against ``data.date``.
    Colors are taken from the first five entries of ``color_list``; because
    the columns are zipped with the colors, at most five lines are drawn
    per figure.

    PARAMETERS
        data        : DataFrame with a ``date`` column and the value columns
        column_list : iterable of integer start positions into ``data.columns``
        column_max  : integer end position (exclusive) into ``data.columns``
        title       : figure title

    RETURN
        None
    """
    for i in column_list:
        fig, ax = plt.subplots(figsize=(13, 7))
        plt.title(f'{title}', fontsize=17)
        # take the trailing (column_max - i) colors so a column keeps the same
        # color when the function is called with different start positions
        color_group = color_list[:-4][-(column_max-i):]
        for test_each, color_each in zip(data.columns[i:column_max], color_group):
            plt.plot(data.date, data[test_each]
                     , label=test_each, color=color_each
                    )
        # keep roughly one tick out of every len/8; the step must be at least 1,
        # otherwise slicing with a step of 0 raises ValueError for short frames
        tick_step = max(1, len(data.date) // 8)
        ax.set_xticks(ax.get_xticks()[::tick_step])
        plt.xlabel('Date', size=13)
        plt.ylabel('Number of cases', size=13)
        ax.legend(loc='upper left')
        plt.show()


# Build a day-over-day difference column ("new_<col>") for columns 2..6.
# The previous version seeded the first row with `1 if col in [2, 4] else 0`,
# but `col` is a column *name*, so that test was never true and the seed was
# always 0; that behavior is kept here. diff() works positionally, so it does
# not depend on the frame having a 0..n-1 index.
for col in time_count.columns[2:7]:
    daily_change = time_count[col].diff().fillna(0)
    # cast back so integer count columns stay integer
    time_count[f'new_{col}'] = daily_change.astype(time_count[col].dtype)

# NOTE(review): columns 7..11 are the new_* columns created above, so the
# 'Cumulative Cases' title looks inaccurate — confirm the intended title.
plot_lines(time_count, [7],12, 'Cumulative Cases')

# 2. Case vs Age

In [None]:
data = pd.read_csv('PatientInfo.csv')

In [None]:
data.head()

In [None]:
data.shape

In [None]:
# clean data: drop null if any and select necessary column ['sex','age']
data = data[data['country'] == 'Korea']
data = data[['sex', 'age']]
data = data.dropna().reset_index(drop=True)
data

## 2.1. Male and Female Case vs Age

In [None]:
age_count = pd.DataFrame(data['age'].value_counts())
age_count.reset_index(level=0, inplace = True)
age_count.columns = ['age', 'counts']
age_count

In [None]:
def str_to_int(age):
    """Convert labels such as '20s' to integers, modifying ``age`` in place.

    Each element has its last character removed and the remainder is parsed
    with ``int``. Elements are accessed by position, so the Series does not
    need a 0..n-1 index.

    NOTE(review): callers pass a DataFrame column; with pandas copy-on-write
    enabled the change may not propagate to the parent frame — confirm.

    Returns None.
    """
    for pos in range(age.shape[0]):
        age.iloc[pos] = int(age.iloc[pos][:-1])

In [None]:
# set age as int
str_to_int(age_count['age'])

In [None]:
# sort age from 0 to 100
age_count = age_count.sort_values('age')
age_count

In [None]:
plt.figure(figsize=(12, 4))
plt.xticks(rotation=25)
plt.title('COVID-19 Confirmed Cases VS Age in 2020')
plt.xlabel('Age')
plt.ylabel('Number of Confirmed Cases')
plt.plot(age_count['age'],age_count['counts'])
plt.show()

## 2.2. Male Case vs Age

In [None]:
male_data = data[data['sex'] == 'male']
male_age_count = pd.DataFrame(male_data['age'].value_counts())
male_age_count.reset_index(level=0, inplace = True)
male_age_count.columns = ['age', 'counts']
male_age_count

In [None]:
# set age as int
str_to_int(male_age_count['age'])
# sort age from 0 to 90
male_age_count = male_age_count.sort_values('age')
male_age_count

In [None]:
plt.figure(figsize=(12, 4))
plt.xticks(rotation=25)
plt.title('COVID-19 Confirmed Male Cases VS Age in 2020')
plt.xlabel('Age')
plt.ylabel('Number of Confirmed Cases')
plt.plot(male_age_count['age'],male_age_count['counts'])
plt.show()

## 2.3. Female Case vs Age

In [None]:
female_data = data[data['sex'] == 'female']
female_age_count = pd.DataFrame(female_data['age'].value_counts())
female_age_count.reset_index(level=0, inplace = True)
female_age_count.columns = ['age', 'counts']
female_age_count

In [None]:
# set age as int
str_to_int(female_age_count['age'])
# sort age from 0 to 100
female_age_count = female_age_count.sort_values('age')
female_age_count

In [None]:
plt.figure(figsize=(12, 4))
plt.xticks(rotation=25)
plt.title('COVID-19 Confirmed Female Cases VS Age in 2020')
plt.xlabel('Age')
plt.ylabel('Number of Confirmed Cases')
plt.plot(female_age_count['age'],female_age_count['counts'])
plt.show()

## 2.4. Merge Into One Diagram

In [None]:
plt.figure(figsize=(12, 4))
plt.xticks(rotation=25)
plt.title('COVID-19 Confirmed Cases VS Age in 2020 ')
plt.xlabel('Age')
plt.ylabel('Number of Confirmed Cases')
plt.plot(age_count['age'],age_count['counts'],'r')
plt.plot(male_age_count['age'],male_age_count['counts'],'g')
plt.plot(female_age_count['age'],female_age_count['counts'],'b')
plt.legend(['Male & Female','Male','Female'])
plt.show()

## 2.5. The Proportion of Patients of All Ages was Compared

In [None]:
# sum the number of cases for three diff ways
mf_case = age_count['counts'].sum()
m_case = male_age_count['counts'].sum()
f_case = female_age_count['counts'].sum()

In [None]:
# Percentage of patients in each age group, rounded to two decimals.
# Each frame uses its own counts and its own total; previously the male and
# female columns were copied from age_count, whose row labels do not line up
# with the per-sex frames.
# both male and female
age_count['proportion'] = ((age_count['counts'] / mf_case) * 100).round(decimals=2)
# male: share of each age group among male patients
male_age_count['proportion'] = ((male_age_count['counts'] / m_case) * 100).round(decimals=2)
# female: share of each age group among female patients
female_age_count['proportion'] = ((female_age_count['counts'] / f_case) * 100).round(decimals=2)

In [None]:
plt.figure(figsize=(12, 4))
plt.xticks(rotation=25)
plt.title('COVID-19 Proportion of Patients VS Age in 2020 (Line)')
plt.xlabel('Age')
plt.ylabel('The Proportion of Patients')
plt.plot(age_count['age'],age_count['proportion'],'r')
plt.plot(male_age_count['age'],male_age_count['proportion'],'g--')
plt.plot(female_age_count['age'],female_age_count['proportion'],'b-.')
plt.legend(['Male & Female','Male','Female'])
plt.show()

In [None]:
plt.figure(figsize=(12, 4))
plt.xticks(rotation=25)
plt.title('COVID-19 Proportion of Patients VS Age in 2020 (Dot)')
plt.xlabel('Age')
plt.ylabel('The Proportion of Patients')
plt.plot(age_count['age'],age_count['proportion'],'r.')
plt.plot(male_age_count['age'],male_age_count['proportion'],'g.')
plt.plot(female_age_count['age'],female_age_count['proportion'],'b.')
plt.legend(['Male & Female','Male','Female'])
plt.show()

# 3. Case vs City

In [None]:
data = pd.read_csv('PatientInfo.csv')
data.head()

In [None]:
data.shape

In [None]:
# clean data: keep Korean cases, select the necessary columns ['province','city'] and drop rows with nulls
data = data[data['country'] == 'Korea']
data = data[['province','city']]
data = data.dropna().reset_index(drop=True)
data

## 3.1. Cases vs Province

In [None]:
province_count = pd.DataFrame(data['province'].value_counts())
province_count.reset_index(level=0, inplace=True)
province_count.columns = ['province', 'counts']
province_count

In [None]:
fig = px.pie(province_count, values='counts', names='province')
fig.update_traces(textposition='inside')
fig.update_layout(uniformtext_minsize=12, uniformtext_mode='hide')
fig.show()

## 3.2 Cases vs City

In [None]:
city_count = pd.DataFrame(data['city'].value_counts())
city_count.reset_index(level=0, inplace = True)
city_count.columns = ['city', 'counts']
city_count

In [None]:
fig = px.pie(city_count, values='counts', names='city')
fig.update_traces(textposition='inside')
fig.update_layout(uniformtext_minsize=12, uniformtext_mode='hide')
fig.show()

## 3.3 Merge City and Province

In [None]:
city_province = data.drop_duplicates(subset='city',keep='first', inplace=False).reset_index(drop=True)
city_province = city_province[['province','city']]
city_province

In [None]:
# Count cases per (province, city) pair.
# Counting by city name alone and joining on 'city' lumps together cities
# that share a name across provinces and attributes them all to whichever
# province appears first, so the grouping is done on both columns.
result = (
    data.groupby(['province', 'city'])
        .size()
        .reset_index(name='counts')
)
result

In [None]:
fig = px.treemap(result, path=['province','city'], values='counts',
                  color='counts', hover_data=['city'],
                  color_continuous_scale='matter', title='Current COVID19 Confirmed Cases In South Korea')
fig.show()


# 4. Prediction Model

In [None]:
#load data
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

In [None]:
# list all country names to find how South Korea is spelled in the data
country =train_data.Country_Region.unique()
print(country)

In [None]:
# keep only the South Korean rows
train_data=train_data[train_data['Country_Region']=='Korea, South']

# drop the columns that are not used as features
train=train_data.drop(['Id','County','Province_State','Country_Region'],axis=1)

# encode the target label as an integer: ConfirmedCases -> 1, Fatalities -> 0.
# The result is assigned back because an in-place replace on a selected
# column may operate on a temporary copy and leave `train` unchanged.
train['Target'] = train['Target'].replace({'ConfirmedCases':1, 'Fatalities':0}).astype(int)

train.head()

In [None]:
# keep only the South Korean rows
test_data=test_data[test_data['Country_Region']=='Korea, South']

# drop the columns that are not used as features
test=test_data.drop(['ForecastId','County','Province_State','Country_Region'],axis=1)

# encode the target label as an integer: ConfirmedCases -> 1, Fatalities -> 0.
# The result is assigned back because an in-place replace on a selected
# column may operate on a temporary copy and leave `test` unchanged.
test['Target'] = test['Target'].replace({'ConfirmedCases':1, 'Fatalities':0}).astype(int)

test

In [None]:
# Derive numeric calendar features from 'Date' so the models can use them
# (day, month, quarter, day of week, day of year, ISO week number).
train['Date']=pd.to_datetime(train['Date'],format = '%Y-%m-%d')
train['day']=train['Date'].dt.day
train['month'] = train['Date'].dt.month
train['quarter'] = train['Date'].dt.quarter
train['dayofweek'] = train['Date'].dt.dayofweek
train['dayofyear'] = train['Date'].dt.dayofyear
# Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week is its
# replacement. It returns an unsigned nullable dtype, hence the cast to int.
train['weekofyear'] = train['Date'].dt.isocalendar().week.astype(int)
train=train.drop(['Date'],axis=1)
# features / label split
X=train.drop(['TargetValue'],axis=1)
y=train['TargetValue']
X.head()

In [None]:
# Apply the same calendar-feature transformation to the test data so its
# columns match the ones derived for the training data.
test['Date']=pd.to_datetime(test['Date'],format = '%Y-%m-%d')
test['day']=test['Date'].dt.day
test['month'] = test['Date'].dt.month
test['quarter'] = test['Date'].dt.quarter
test['dayofweek'] = test['Date'].dt.dayofweek
test['dayofyear'] = test['Date'].dt.dayofyear
# Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week is its
# replacement. It returns an unsigned nullable dtype, hence the cast to int.
test['weekofyear'] = test['Date'].dt.isocalendar().week.astype(int)
test=test.drop(['Date'],axis=1)
test.head()

## 4.1. The Model is Established and the Predicted Value is Output

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.23)

In [None]:
# gradient_model = make_pipeline(
#     GradientBoostingClassifier(n_estimators =70,max_depth=3, min_samples_leaf =1)
# )
# random_f_model = make_pipeline(
#     RandomForestClassifier(n_estimators =200,max_depth =9, min_samples_leaf=40)
# )
# neural_net_model = make_pipeline(
#     MLPClassifier(solver ='lbfgs',hidden_layer_sizes=(11,11),activation ='logistic')
# )
#
# knn_model = make_pipeline(
#         KNeighborsClassifier(n_neighbors=10)
# )
#
# models = [gradient_model,random_f_model,neural_net_model,knn_model]
#
# for i, m in enumerate(models):
#     m.fit(X_train, y_train)
#     print(metrics.accuracy_score(y_test, m.predict(X_test)))

In [None]:
# knn_model = make_pipeline(
#         KNeighborsClassifier(n_neighbors=10)
# )
# knn_model.fit(X_train, y_train)
# predictions = knn_model.predict(test)
# pd.Series(predictions).to_csv('output.txt', index=False, header=False)

## 4.2. Conduct k-fold Validation and Report Precision, Recall and Accuracy for Each Validation

In [None]:
# number of folds; also used to average the per-fold scores below
n_splits = 10
kf = KFold(n_splits=n_splits)

# pipeline holding a single decision-tree classifier
dt_model = make_pipeline(
    tree.DecisionTreeClassifier()
)

# running sums of the per-fold scores
macro_precision = 0
macro_recall = 0
total_accuracy = 0
for iteration, (train_index, test_index) in enumerate(kf.split(X), start=1):
    # rows of this fold
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    # fit on the training rows, predict the held-out rows
    dt_model.fit(X_train,y_train)
    y_predicted = dt_model.predict(X_test)
    print('Iteration #',iteration,':')
    print(classification_report(y_test,y_predicted))
    # same report as a dict so the macro averages can be accumulated
    report = classification_report(y_test,y_predicted,output_dict=True)
    macro_precision += report['macro avg']['precision']
    macro_recall += report['macro avg']['recall']
    total_accuracy += metrics.accuracy_score(y_test, y_predicted)

    print('*******************')

# The sums above were previously computed but never shown; report the
# averages over all folds.
print('Average macro precision:', macro_precision / n_splits)
print('Average macro recall:', macro_recall / n_splits)
print('Average accuracy:', total_accuracy / n_splits)

# 5. Insights from Policy

## 5.1. Distribution of Policy Types

In [None]:
policy_df = pd.read_csv('Policy.csv')
df_type=policy_df['type'].tolist()
len(df_type)

In [None]:
policy_df['type'].value_counts(dropna=True)

In [None]:
# Share of each policy type, computed from the data rather than typed in by
# hand, so the chart stays correct if Policy.csv changes.
# value_counts(normalize=True) ignores missing values and returns fractions.
type_df = policy_df['type'].value_counts(normalize=True).to_frame(name='Policy')
plot=type_df.plot.pie(y='Policy',figsize=(10,18))

## 5.2. Detailed Policy Application Dates

In [None]:
# pre-process
policy_df.head()


In [None]:
df_gov_policy=policy_df['gov_policy'].tolist()
policy_df['gov_policy'].value_counts(dropna=True)
# sort data by date

policy_sort = policy_df.sort_values('start_date')

In [None]:
# Replace long policy names with short labels so they fit on the plot axis.
# Keys are matched exactly against the values in the 'gov_policy' column.
policy_abbreviations = {
    'Special Immigration Procedure': 'SIP',
    'School Opening with Online Class': 'SOOC',
    # 'EUA' (was 'WUA', a typo for Emergency Use Authorization)
    'Emergency Use Authorization of Diagnostic Kit': 'EUA',
    'School Opening Delay': 'SOD',
    'Social Distancing Campaign': 'SDC',
    'Infectious Disease Alert Level': 'IDAL',
    'Mask Distribution': 'MD',
    'Wearing of masks': 'WM',
    'Drive-Through Screening Center': 'D-T',
    'Electronic Wristbands': 'EW',
    'Self-Quarantine Safety Protection App': 'S-Q',
    'Self-Diagnosis App': 'S-D',
    'Open Data ': 'OD',
    'Extends Tightened Quarantine Measures ': 'ET',
    'Close bars and clubs': 'CB',
    'School Closure': 'SC',
    'Mandatory 14-day Self-Quarantine': 'M14',
    'Open API': 'OA',
    'Close karaoke': 'CK',
    'Logistics center': 'LC',
    'local government Administrative orders': 'LG',
    'KI-Pass: Korea Internet - Pass': 'KI',
    'Thanks to Challenge korea': 'TC',
    'Mandatory Self-Quarantine & Diagonostic Tests': 'MS',
}
# replace() keeps any policy name that has no abbreviation; map() would turn
# such names into NaN, which cannot be plotted as a category.
policy_sort['gov_policy'] = policy_sort['gov_policy'].replace(policy_abbreviations)

In [None]:
pd.set_option('display.max_rows', None)
policy_sort['gov_policy']
#plt.plot(policy_df['gov_policy'],policy_df['start_date'])

In [None]:
plt.figure(figsize=(12, 17))
plt.title('Policy start date ')
plt.xlabel('Policy_name')
plt.ylabel('Time')
plt.scatter(policy_sort['gov_policy'],policy_sort['start_date'])
plt.plot(policy_sort['gov_policy'],policy_sort['start_date'])

# 6. Seoul Floating

In [None]:
df = pd.read_csv('SeoulFloating.csv')
df.head()

In [None]:
# dropna() returns a new frame; assign it back so the rows are really removed
df = df.dropna()
# parse the date strings so the x-axis is a real time axis
df['date']=pd.to_datetime(df['date'],format = '%Y-%m-%d')
# rows for the male population only
df_male = df[df['sex']=='male']
df_male.head()

In [None]:
plt.figure(figsize=(12, 4))
plt.xticks(rotation=25)
plt.title('Seoul_Floating Population ')
plt.xlabel('Time')
plt.ylabel('Population(millions)')
plt.plot(df['date'],df['fp_num'],'r.')
plt.plot(df_male['date'],df_male['fp_num'],'b.')
plt.legend(['Male and Female','Male'])
plt.show()

# 7. Tweet extracted with query COVID-19
#      Analysis Token and pattern

In [None]:
import collections
import nltk
from nltk.corpus import stopwords
import sys
print(sys.executable)
import re

In [None]:
!/opt/anaconda3/bin/python -m pip install wordcloud

In [None]:
from wordcloud import WordCloud

## 7.1. Cleaning +Processing Functions

In [None]:
# Define helper functions to process the extracted tweet file D2.txt.
# They remove URLs, punctuation and stop words, detect the tweet language,
# and split the text into a list of words.
def remove_url_punctuation(x):
    """Return ``str(x)`` lower-cased with URLs and punctuation removed.

    URLs (``http://``, ``https://`` or ``www.`` followed by non-space
    characters) are deleted first; then every character that is neither a
    word character nor whitespace is deleted. Whitespace is left untouched.
    """
    text = re.sub(r'https?://\S+|www\.\S+', r'', str(x))
    text = re.sub(r'[^\w\s]', r'', text)
    return text.lower()

def split(x):
    """Split the text ``x`` into a list of tokens on any run of whitespace.

    Splitting on whitespace runs (instead of a single space) avoids empty
    tokens where the text contains consecutive spaces, and also separates
    words divided by tabs or newlines. An empty string yields an empty list.
    """
    return x.split()

def remove_stopwords(x):
    """Return the words of ``x`` that are worth keeping, in original order.

    A word is kept when it is not in the module-level ``stop_words``
    collection, is longer than two characters, and is not the literal
    string 'nan'.
    """
    return [
        token for token in x
        if token not in stop_words and len(token) > 2 and token != 'nan'
    ]

def detect_lang(x):
    """Return the language code that ``langdetect.detect`` reports for ``x``.

    Returns the string "other" when detection raises an error.
    """
    # imported here because langdetect is installed by a later notebook cell
    from langdetect import detect
    try:
        return detect(x)
    except Exception:
        # `except Exception` instead of a bare `except`, so KeyboardInterrupt
        # and SystemExit are not swallowed while a long apply() is running
        return "other"


## 7.2. Load tweets file

In [None]:
# You may need to rename the extracted D2 file to D2.txt, since the file
# written by tweet_extracter.py may not include the .txt extension.
df2 = pd.read_csv('D2.txt', sep='\t', names=['id','text'])

In [None]:
df2

## 7.3. Apply cleaning functions to get tidy tweets

In [None]:
df2['tidy_tweet'] = df2['text'].apply(remove_url_punctuation)

print(df2['text'].head())
print("**********************")
print(df2['tidy_tweet'].head())


## 7.4. Filter English-only tweets


In [None]:
!/opt/anaconda3/bin/python -m pip install langdetect
df2['en']=df2['text'].apply(detect_lang)
print(df2['tidy_tweet'].head(10))
print("**********************")
df2 = df2[df2['en']=='en']
print(df2['tidy_tweet'].head(10))

## 7.5. Tokenize words

In [None]:
df2['word_list'] = df2['tidy_tweet'].apply(split)
print(df2['word_list'].head(10))

## 7.6. Remove Stop Words

In [None]:
nltk.download('stopwords')
global stop_words
stop_words =set(stopwords.words('english'))
df2['nlp_tweet'] =df2['word_list'].apply(remove_stopwords)
print(df2['word_list'].head(10))
print("**********************")
print(df2['nlp_tweet'].head(10))

## 7.7. Analysis Tokens (frequency)

In [None]:
# Flatten the per-tweet token lists into one Series of tokens.
# A tweet whose token list is empty explodes to NaN, so those are dropped;
# otherwise NaN would be counted as a token and passed on to the word cloud.
all_tokens2 = df2['nlp_tweet'].explode().dropna()
# unique tokens
all_words_unique_list2 = all_tokens2.unique()
# number of unique tokens
print (len(all_words_unique_list2))
word_list2 = list(all_tokens2)
# use NLTK to build a token -> frequency distribution
nltk_count2 = nltk.FreqDist(word_list2)
print(nltk_count2.most_common(100))

## 7.8. View as WordCloud

In [None]:
wordcloud = WordCloud().generate_from_frequencies(nltk_count2)
plt.figure(figsize=(10,7))
plt.imshow(wordcloud, interpolation  ='bilinear')
plt.axis('off')
plt.show()

# 8. A Glance at Search Trends in South Korea

In [None]:
st_df = pd.read_csv('SearchTrend.csv')

In [None]:
st_df

In [None]:
def range_of_date(df, date_col):
    """Print the earliest and latest calendar date found in ``df[date_col]``.

    The column is parsed with ``pd.to_datetime`` and reduced to dates (time
    of day is discarded). Output format: ``# <first> to <last>``.
    Returns None.
    """
    dates = pd.to_datetime(df[date_col]).dt.date
    # compute each bound once; the unused day-count local was removed
    first, last = min(dates), max(dates)
    print(f'# {first} to {last}')

range_of_date(st_df,'date')

In [None]:
import seaborn
seaborn.set()
# parse the date strings so the x-axis is a real time axis
st_df['date']=pd.to_datetime(st_df['date'],format = '%Y-%m-%d')

# NOTE(review): a height of 70 inches looks like a typo for 7 — confirm
plt.figure(figsize=(12, 70))
plt.xticks(rotation=25)
plt.title('Search Trends of Cold/flu/pneumonia/coronavirus ')
plt.xlabel('Time')
plt.ylabel('Ratio')

# One dotted series per search keyword. The legend label is taken from the
# column name, which also fixes the misspelled 'coronavious' legend entry.
for keyword, marker in [('cold', 'r.'), ('flu', 'b.'), ('pneumonia', 'g.'), ('coronavirus', 'y.')]:
    plt.plot(st_df['date'], st_df[keyword], marker, label=keyword)

plt.legend()
plt.show()

In [None]:
#As expected, there were only the spikes of seasonal flu and cold before COVID-19 outbreak

#Increase of searching pneumonia was prior to that of coronavirus(COVID-19)
    #- because it's called Wuhan pneumonia at first in S.Korea
      #  - Wuhan is the assumed place where COVID-19 pandemic started

In [None]:
# After the first confirmed case in Korea (2020-01-20)

In [None]:
after_first_case_df = st_df[st_df.date >= '2020-01-20']

In [None]:
# NOTE(review): a height of 70 inches looks like a typo for 7 — confirm
plt.figure(figsize=(12, 70))
plt.xticks(rotation=25)
plt.title('Search Trends of Cold/flu/pneumonia/coronavirus ')
plt.xlabel('Time')
plt.ylabel('Ratio')

# One dotted series per search keyword. The legend label is taken from the
# column name, which also fixes the misspelled 'coronavious' legend entry.
for keyword, marker in [('cold', 'r.'), ('flu', 'b.'), ('pneumonia', 'g.'), ('coronavirus', 'y.')]:
    plt.plot(after_first_case_df['date'], after_first_case_df[keyword], marker, label=keyword)

plt.legend()
plt.show()

In [None]:
dd= pd.read_csv('train.csv')
dd.head()