# Importing data

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, files_in_folder in os.walk('/kaggle/input'):
    for file_name in files_in_folder:
        print(os.path.join(folder, file_name))

# Any results you write to the current directory are saved as output.

/kaggle/input/covid19-local-us-ca-forecasting-week-1/ca_submission.csv
/kaggle/input/covid19-local-us-ca-forecasting-week-1/ca_train.csv
/kaggle/input/covid19-local-us-ca-forecasting-week-1/ca_test.csv


In [2]:
import pandas as pd
# Load the three competition files: training rows, test rows and the
# sample submission template.
train = pd.read_csv("/kaggle/input/covid19-local-us-ca-forecasting-week-1/ca_train.csv")
test = pd.read_csv("/kaggle/input/covid19-local-us-ca-forecasting-week-1/ca_test.csv")
sample_submission = pd.read_csv("/kaggle/input/covid19-local-us-ca-forecasting-week-1/ca_submission.csv")

# Check data

In [3]:
len(train)


63

In [4]:
sample_submission.head()

Unnamed: 0,ForecastId,ConfirmedCases,Fatalities
0,1,1,1
1,2,1,1
2,3,1,1
3,4,1,1
4,5,1,1


In [5]:
test.head()

Unnamed: 0,ForecastId,Province/State,Country/Region,Lat,Long,Date
0,1,California,US,36.1162,-119.6816,2020-03-12
1,2,California,US,36.1162,-119.6816,2020-03-13
2,3,California,US,36.1162,-119.6816,2020-03-14
3,4,California,US,36.1162,-119.6816,2020-03-15
4,5,California,US,36.1162,-119.6816,2020-03-16


In [6]:
train.tail()

Unnamed: 0,Id,Province/State,Country/Region,Lat,Long,Date,ConfirmedCases,Fatalities
58,59,California,US,36.1162,-119.6816,2020-03-20,1177.0,23.0
59,60,California,US,36.1162,-119.6816,2020-03-21,1364.0,24.0
60,61,California,US,36.1162,-119.6816,2020-03-22,1642.0,30.0
61,62,California,US,36.1162,-119.6816,2020-03-23,2108.0,39.0
62,63,California,US,36.1162,-119.6816,2020-03-24,2538.0,50.0


### Heatmap over California, which will give a clearer picture as more weeks of data come in

In [7]:
#make a heatmap

import folium
from folium import Choropleth, Marker
from folium.plugins import HeatMap, MarkerCluster
# Base map that the heat layer is added to further down in this cell.
map_center = [37, -115]
m = folium.Map(location=map_center, zoom_start=6)
def embed_map(m, file_name):
    """Save a map to an HTML file and return an IFrame that displays it.

    Parameters
    ----------
    m : object exposing ``save(file_name)``
        The map to write out (a ``folium.Map`` in this notebook).
    file_name : str
        Path of the HTML file to write; also used as the IFrame source.

    Returns
    -------
    IPython.display.IFrame
        Frame pointing at ``file_name``, 100% wide and 750px high.
    """
    from IPython.display import IFrame

    m.save(file_name)
    frame = IFrame(file_name, width='100%', height='750px')
    return frame

# Join the test and training rows that share the same location and date.
Full_data = test.merge(train, on=['Lat', 'Long', 'Date'])

# Add a heat layer built from the joined coordinates to the base map.
heat_points = Full_data[['Lat', 'Long']]
HeatMap(data=heat_points, radius=11).add_to(m)

# Save the map as HTML and display it inline.
embed_map(m, "q_1.html")

# Data cleaning

In [8]:
# Give the training columns shorter names.
train.rename(
    columns={
        'Province/State': 'Province',
        'Country/Region': 'Country',
        'ConfirmedCases': 'Confirmed',
    },
    inplace=True,
)

In [9]:
# Apply the matching short names to the test set.
test.rename(
    columns={
        'Province/State': 'Province',
        'Country/Region': 'Country',
    },
    inplace=True,
)

## Label encoding

In [10]:
from sklearn.preprocessing import LabelEncoder
# Encode the country names as integer codes in a new 'Countries' column.
#
# One encoder is fitted on the country values of BOTH frames so that a given
# country always maps to the same code in train and test. Calling
# fit_transform separately on each frame can assign different codes to the
# same country whenever the two frames do not hold the same set of values.
labelencoder = LabelEncoder()
labelencoder.fit(pd.concat([train['Country'], test['Country']]))

train['Countries'] = labelencoder.transform(train['Country'])
test['Countries'] = labelencoder.transform(test['Country'])

#check label encoding
train['Countries'].head()


0    0
1    0
2    0
3    0
4    0
Name: Countries, dtype: int64

## Handling dates

In [11]:
# Convert the Date column of both frames with pd.to_datetime.
for frame in (train, test):
    frame['Date'] = pd.to_datetime(frame['Date'])

In [12]:
# Use the Date column as the index of both frames.
train, test = (frame.set_index(['Date']) for frame in (train, test))

In [13]:
def create_time_features(df):
    """
    Creates time series features from datetime index

    The index of ``df`` is copied into a 'date' column and the calendar
    features are derived from it. All new columns ('date', 'hour',
    'dayofweek', 'quarter', 'month', 'year', 'dayofyear', 'dayofmonth',
    'weekofyear') are added to ``df`` itself, so the caller's frame is
    modified in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds datetime values.

    Returns
    -------
    pandas.DataFrame
        The eight feature columns (everything listed above except 'date')
        selected from ``df``.
    """
    df['date'] = df.index
    stamps = df['date']

    df['hour'] = stamps.dt.hour
    df['dayofweek'] = stamps.dt.dayofweek
    df['quarter'] = stamps.dt.quarter
    df['month'] = stamps.dt.month
    df['year'] = stamps.dt.year
    df['dayofyear'] = stamps.dt.dayofyear
    df['dayofmonth'] = stamps.dt.day

    # Series.dt.weekofyear was deprecated in pandas 1.1 and removed in 2.0.
    # isocalendar().week is its replacement; it yields a UInt32 column, so it
    # is cast to int64 to keep the dtype the old accessor produced.
    if hasattr(stamps.dt, 'isocalendar'):
        df['weekofyear'] = stamps.dt.isocalendar().week.astype('int64')
    else:
        # Older pandas without isocalendar(): fall back to the legacy accessor.
        df['weekofyear'] = stamps.dt.weekofyear

    X = df[['hour','dayofweek','quarter','month','year',
           'dayofyear','dayofmonth','weekofyear']]
    return X

In [14]:
# Add the calendar features to both frames (the function mutates its input);
# only the test-set preview is displayed as the cell output.
create_time_features(train)
create_time_features(test).head()

Unnamed: 0_level_0,hour,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-03-12,0,3,1,3,2020,72,12,11
2020-03-13,0,4,1,3,2020,73,13,11
2020-03-14,0,5,1,3,2020,74,14,11
2020-03-15,0,6,1,3,2020,75,15,11
2020-03-16,0,0,1,3,2020,76,16,12


In [15]:
train.head()

Unnamed: 0_level_0,Id,Province,Country,Lat,Long,Confirmed,Fatalities,Countries,date,hour,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2020-01-22,1,California,US,36.1162,-119.6816,0.0,0.0,0,2020-01-22,0,2,1,1,2020,22,22,4
2020-01-23,2,California,US,36.1162,-119.6816,0.0,0.0,0,2020-01-23,0,3,1,1,2020,23,23,4
2020-01-24,3,California,US,36.1162,-119.6816,0.0,0.0,0,2020-01-24,0,4,1,1,2020,24,24,4
2020-01-25,4,California,US,36.1162,-119.6816,0.0,0.0,0,2020-01-25,0,5,1,1,2020,25,25,4
2020-01-26,5,California,US,36.1162,-119.6816,0.0,0.0,0,2020-01-26,0,6,1,1,2020,26,26,4


## Dropping useless features

In [16]:
# Remove the helper 'date' column from both frames.
for frame in (train, test):
    frame.drop(columns="date", inplace=True)

In [17]:
# train.isnull().sum()

In [18]:
# Drop the Country and Province text columns from the training set.
train.drop(columns=['Country', 'Province'], inplace=True)

In [19]:
# Drop the same two text columns from the test set.
test.drop(columns=['Country', 'Province'], inplace=True)

# Model 

In [20]:
from sklearn.tree import DecisionTreeRegressor  
# Single decision-tree regressor with a fixed random_state; it is re-fitted
# once per target further down.
regressor = DecisionTreeRegressor(random_state=0)

In [21]:
# import xgboost as xgb
# from xgboost import plot_importance, plot_tree
from sklearn.metrics import mean_squared_error, mean_absolute_error

# reg= xgb.XGBRegressor(n_estimators=1000)

In [22]:
train.head()

Unnamed: 0_level_0,Id,Lat,Long,Confirmed,Fatalities,Countries,hour,dayofweek,quarter,month,year,dayofyear,dayofmonth,weekofyear
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2020-01-22,1,36.1162,-119.6816,0.0,0.0,0,0,2,1,1,2020,22,22,4
2020-01-23,2,36.1162,-119.6816,0.0,0.0,0,0,3,1,1,2020,23,23,4
2020-01-24,3,36.1162,-119.6816,0.0,0.0,0,0,4,1,1,2020,24,24,4
2020-01-25,4,36.1162,-119.6816,0.0,0.0,0,0,5,1,1,2020,25,25,4
2020-01-26,5,36.1162,-119.6816,0.0,0.0,0,0,6,1,1,2020,26,26,4


In [23]:
# Columns fed to the model; the same list is used for train and test.
feature_columns = ['Lat', 'Long','Countries','dayofweek','month','dayofyear','weekofyear']

x = train[feature_columns]
x_test = test[feature_columns]

# One single-column target frame per quantity to predict.
y1 = train[['Confirmed']]
y2 = train[['Fatalities']]

In [24]:
x.head()

Unnamed: 0_level_0,Lat,Long,Countries,dayofweek,month,dayofyear,weekofyear
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2020-01-22,36.1162,-119.6816,0,2,1,22,4
2020-01-23,36.1162,-119.6816,0,3,1,23,4
2020-01-24,36.1162,-119.6816,0,4,1,24,4
2020-01-25,36.1162,-119.6816,0,5,1,25,4
2020-01-26,36.1162,-119.6816,0,6,1,26,4


In [25]:
# Fit on the confirmed-case target, then predict for the test features.
regressor.fit(x, y1)
predict_1 = pd.DataFrame(regressor.predict(x_test), columns=["Confirmed_predict"])

In [26]:
predict_1.head()

Unnamed: 0,Confirmed_predict
0,221.0
1,282.0
2,340.0
3,426.0
4,557.0


In [27]:
# Re-fit the same regressor on the fatalities target, then predict for the
# test features.
regressor.fit(x, y2)
predict_2 = pd.DataFrame(regressor.predict(x_test), columns=["Death_prediction"])
predict_2.head()

Unnamed: 0,Death_prediction
0,4.0
1,4.0
2,5.0
3,6.0
4,7.0


In [28]:
# plot = plot_importance(regressor, height=0.9, max_num_features=20)

# Submission

In [29]:
# Read the sample submission file again and keep only its ForecastId column.
Samle_submission = pd.read_csv("/kaggle/input/covid19-local-us-ca-forecasting-week-1/ca_submission.csv")
submission = Samle_submission.loc[:, ["ForecastId"]]

In [30]:
# Place both prediction frames and the ForecastId column side by side.
submission_parts = [predict_1, predict_2, submission]
Final_submission = pd.concat(submission_parts, axis='columns')
Final_submission.head()

Unnamed: 0,Confirmed_predict,Death_prediction,ForecastId
0,221.0,4.0,1
1,282.0,4.0,2
2,340.0,5.0,3
3,426.0,6.0,4
4,557.0,7.0,5


In [31]:
# Rename the columns to the names used by the submission file, put
# ForecastId first, and cast both prediction columns to int.
Final_submission.columns = ['ConfirmedCases', 'Fatalities', 'ForecastId']
Final_submission = Final_submission[['ForecastId','ConfirmedCases', 'Fatalities']].astype(
    {"ConfirmedCases": int, "Fatalities": int}
)

In [32]:
Final_submission.head()

Unnamed: 0,ForecastId,ConfirmedCases,Fatalities
0,1,221,4
1,2,282,4
2,3,340,5
3,4,426,6
4,5,557,7


In [33]:
# Write the submission file without the index column, then report completion.
Final_submission.to_csv(path_or_buf="submission.csv", index=False)
print('Model ready for submission!')

Model ready for submission!
