In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file under the Kaggle input directory,
# then print the paths one per line.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_file in input_files:
    print(input_file)

# Any results you write to the current directory are saved as output.
from sklearn.model_selection import train_test_split

from xgboost import XGBRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.impute import SimpleImputer

from shapely.geometry import Point,Polygon
import requests 

/kaggle/input/covid19-global-forecasting-week-4/submission.csv
/kaggle/input/covid19-global-forecasting-week-4/test.csv
/kaggle/input/covid19-global-forecasting-week-4/train.csv


**Loading Training and Testing Data**

In [2]:
# Read the three competition CSVs (train, test, sample submission) in one pass.
train_data, test_data, submission_csv = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/covid19-global-forecasting-week-4/train.csv',
        '/kaggle/input/covid19-global-forecasting-week-4/test.csv',
        '/kaggle/input/covid19-global-forecasting-week-4/submission.csv',
    )
)

**Cast column dtypes and convert string dates to datetime**

In [3]:
# Text columns shared by both frames are cast to str; the later Country_Region
# merge compares Province_State against the literal string 'nan'.
convert_dict_test = {column: str for column in ('Province_State', 'Country_Region')}
# The training frame additionally carries the two integer target columns.
convert_dict = {**convert_dict_test, 'ConfirmedCases': int, 'Fatalities': int}

train_data = train_data.astype(convert_dict)
test_data = test_data.astype(convert_dict_test)

In [4]:
# Parse the 'Date' strings into pandas datetime values.
# `infer_datetime_format` is deprecated since pandas 2.0 (strict format inference
# is now the default behaviour), so it is omitted to avoid the deprecation warning.
train_data['Date'] = pd.to_datetime(train_data['Date'])
test_data['Date'] = pd.to_datetime(test_data['Date'])

In [5]:
# Encode each date as the integer MMDD (e.g. April 2nd -> 402).
# The column is replaced via plain `df['Date'] = ...` rather than `df.loc[:, 'Date'] = ...`:
# with pandas >= 2.0 the `.loc[:, col]` form writes into the existing column in place,
# which would leave 'Date' as an object column holding Python ints instead of an
# integer-typed feature.
# NOTE: this encoding drops the year and is not contiguous across month boundaries
# (0131 -> 131 is followed by 0201 -> 201).
train_data['Date'] = train_data['Date'].dt.strftime('%m%d').astype(int)

test_data['Date'] = test_data['Date'].dt.strftime('%m%d').astype(int)

In [6]:
# Fold the province into the country label: rows whose province is the string 'nan'
# keep the bare country name, all others become "<province> <country>".
for frame in (train_data, test_data):
    province = frame['Province_State']
    country = frame['Country_Region']
    frame['Country_Region'] = np.where(province == 'nan', country, province + ' ' + country)

#train_data['Province_State'] = np.where(train_data['Province_State'] == 'nan',train_data['Country_Region'],train_data['Province_State']+train_data['Country_Region'])
#test_data['Province_State'] = np.where(test_data['Province_State'] == 'nan',test_data['Country_Region'],test_data['Province_State']+test_data['Country_Region'])




In [7]:
# Province_State has been merged into Country_Region, so remove it from both frames.
train_data, test_data = (
    frame.drop('Province_State', axis=1) for frame in (train_data, test_data)
)

In [8]:
test_data.head(2)

Unnamed: 0,ForecastId,Country_Region,Date
0,1,Afghanistan,402
1,2,Afghanistan,403


**Label Encoding Country**

In [9]:
#get list of categorical variables
# Boolean mask over the columns: True where the dtype is 'object'.
s = train_data.dtypes == 'object'
# Names of the columns flagged by the mask.
object_cols = s.index[s.to_numpy()].tolist()

In [10]:
from sklearn.preprocessing import LabelEncoder

**Try using Label Encoder**

In [11]:
# label_encoder1 was intended for Province_State (that column is dropped earlier,
# so it is left unfitted); label_encoder2 encodes the merged Country_Region label.
label_encoder1 = LabelEncoder()
label_encoder2 = LabelEncoder()

# Learn the label -> integer mapping from the training labels only,
# then apply that same mapping to both frames.
label_encoder2.fit(train_data['Country_Region'])
train_data['Country_Region'] = label_encoder2.transform(train_data['Country_Region'])
test_data['Country_Region'] = label_encoder2.transform(test_data['Country_Region'])

    

In [12]:
train_data.head(2)

Unnamed: 0,Id,Country_Region,Date,ConfirmedCases,Fatalities
0,1,0,122,0,0
1,2,0,123,0,0


In [13]:
test_data.head(2)

Unnamed: 0,ForecastId,Country_Region,Date
0,1,0,402
1,2,0,403


In [14]:
# Keep the ForecastId column aside; it is needed again when building the submission.
Test_id = test_data['ForecastId']

In [15]:
# Remove the row-identifier columns in place so only features/targets remain.
train_data.drop(columns=['Id'], inplace=True)
test_data.drop(columns=['ForecastId'], inplace=True)

**Check for missing values**

In [16]:
# Count nulls per training column and print only the columns that have any.
missing_val_count_by_column = train_data.isna().sum()
has_missing = missing_val_count_by_column > 0
print(missing_val_count_by_column.loc[has_missing])

Series([], dtype: int64)


**Build the XGBRegressor model**

In [17]:
from xgboost import XGBRegressor

In [18]:
train_data.head(1)

Unnamed: 0,Country_Region,Date,ConfirmedCases,Fatalities
0,0,122,0,0


In [19]:
# Two input features and two regression targets, selected by name.
feature_cols = ['Country_Region', 'Date']
target_cols = ['ConfirmedCases', 'Fatalities']

X_train = train_data[feature_cols]
y_train = train_data[target_cols]

In [20]:
# Convert the feature frames to plain NumPy matrices for the model.
x_train = X_train.to_numpy()
# Select the test columns explicitly, in the same order as the training features,
# instead of taking "whatever columns are left" in test_data; this guarantees the
# model sees identical feature positions at fit and predict time.
x_test = test_data[X_train.columns].to_numpy()

**Splitting data into train/test sets (currently disabled — the split below is commented out)**

In [21]:
#from sklearn.metrics import mean_squared_error

In [22]:
#X_train,X_test,Y_train,Y_test = train_test_split(X_train, y_train, test_size=0.2, random_state=42, shuffle=True)

In [23]:
# Placeholder for per-run error values; nothing appends to it while the
# error computation is disabled.
error_list = []


def return_error(estimator, x_train, x_test, y_train, max_depth=20, random_state=0):
    """Fit a multi-output XGBoost regressor and predict for ``x_test``.

    Despite its name, this function does not compute an error metric; it
    returns predictions.

    Parameters
    ----------
    estimator : int
        Number of boosting rounds, passed to ``XGBRegressor`` as ``n_estimators``.
    x_train : array-like
        Training feature matrix.
    x_test : array-like
        Feature matrix to predict for; must use the same column layout as ``x_train``.
    y_train : array-like
        Training targets, one column per output.
    max_depth : int, default 20
        Maximum tree depth passed to ``XGBRegressor``.
    random_state : int, default 0
        Seed passed to ``XGBRegressor``.

    Returns
    -------
    MultiOutputRegressor
        An *unfitted* wrapper used purely as a container: its ``estimator``
        attribute holds the prediction array returned by ``model.predict``
        (one column per target).
    """
    model = MultiOutputRegressor(
        XGBRegressor(n_estimators=estimator, random_state=random_state, max_depth=max_depth)
    )
    model.fit(x_train, y_train)

    # NOTE(review): wrapping the predictions in MultiOutputRegressor is only a way to
    # expose them as `.estimator`; it is kept because the submission cell reads
    # `predict.estimator[:, i]`. Returning the array directly would be cleaner but
    # requires changing that caller at the same time.
    predict = MultiOutputRegressor(model.predict(x_test))

    return predict

In [24]:
#from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [25]:
#num_estimators = [1000,1100,1200,1250,1300]
#learn_rates = [0.02,0.05,0.06,0.07]

#param_grid = {'n_estimators':num_estimators,
 #             'learning_rate':learn_rates
#            }

In [26]:
#random_search = GridSearchCV(XGBRegressor(loss='huber'), param_grid,cv=3,return_train_score=True, n_jobs=1)

In [27]:
#random_search.fit(x_train, y_train.Fatalities)

In [28]:
#random_search.best_params_

In [29]:
#estimator_list = [1200,1250,1300,1350]
#for value in estimator_list:
#    error_ = return_error(value, X_train,X_test,Y_train,Y_test)

In [30]:
# Train with 1400 boosting rounds and predict for the test matrix.
predict = return_error(
    estimator=1400,
    x_train=x_train,
    x_test=x_test,
    y_train=y_train,
)

**Submission**

In [31]:
# The prediction matrix lives on the wrapper's `estimator` attribute:
# column 0 -> ConfirmedCases, column 1 -> Fatalities (same order as y_train).
predicted = predict.estimator

# Assemble the submission in one step, rounding predictions to whole numbers.
df_sub = pd.DataFrame({
    'ForecastId': Test_id,
    'ConfirmedCases': np.round(predicted[:, 0], 0),
    'Fatalities': np.round(predicted[:, 1], 0),
})

df_sub.to_csv('submission.csv', index=False)

In [32]:
df_sub

Unnamed: 0,ForecastId,ConfirmedCases,Fatalities
0,1,273.0,6.0
1,2,281.0,6.0
2,3,299.0,7.0
3,4,349.0,7.0
4,5,367.0,11.0
...,...,...,...
13454,13455,17.0,3.0
13455,13456,17.0,3.0
13456,13457,17.0,3.0
13457,13458,17.0,3.0
