In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.preprocessing import LabelEncoder
from xgboost import XGBRegressor
from tqdm.notebook import tqdm

from warnings import filterwarnings
filterwarnings('ignore')

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        fullPath = os.path.join(root, fname)
        print(fullPath)


/kaggle/input/covid19-global-forecasting-week-3/train.csv
/kaggle/input/covid19-global-forecasting-week-3/submission.csv
/kaggle/input/covid19-global-forecasting-week-3/test.csv


In [2]:
# Read the train and test tables with 'Date' parsed as datetimes, and replace
# missing 'Province_State' values with an empty string in both frames.
trainDf = pd.read_csv(
    "/kaggle/input/covid19-global-forecasting-week-3/train.csv",
    parse_dates=['Date'],
).fillna({'Province_State': ''})

testDf = pd.read_csv(
    "/kaggle/input/covid19-global-forecasting-week-3/test.csv",
    parse_dates=['Date'],
).fillna({'Province_State': ''})

In [3]:
# Map every country in the training data to the array of its distinct
# 'Province_State' values (in order of first appearance).
countries = {
    name: trainDf.loc[trainDf['Country_Region'] == name, 'Province_State'].unique()
    for name in trainDf['Country_Region'].unique()
}

In [4]:
# Add zero-filled placeholder columns for the two quantities to be predicted.
testDf = testDf.assign(ConfirmedCases=0, Fatalities=0)

In [5]:
# Number of boosting rounds used for every per-region model.
N_ESTIMATORS = 2000
# Columns that are both the training targets and the prediction outputs.
TARGET_COLUMNS = ['ConfirmedCases', 'Fatalities']


def fitAndPredict(trainDates, trainTargets, testDates):
    """Fit one XGBRegressor on (date -> target) and predict for the test dates.

    Parameters
    ----------
    trainDates : 2-D array of shape (n_train, 1) holding dates as int64.
    trainTargets : 1-D array of length n_train with the target values.
    testDates : 2-D array of shape (n_test, 1) holding dates as int64.

    Returns
    -------
    1-D array of length n_test with the model's predictions.
    """
    model = XGBRegressor(n_estimators=N_ESTIMATORS)
    model.fit(trainDates, trainTargets)
    # ravel() guarantees a flat vector regardless of the shape predict returns
    return np.ravel(model.predict(testDates))


saveDf = testDf.copy()
# Predictions are floats: make the destination columns float up front so the
# writes below never have to change an integer column's dtype in place.
saveDf[TARGET_COLUMNS] = saveDf[TARGET_COLUMNS].astype(np.float64)

for country in tqdm(countries):
    for state in countries[country]:
        # Build each row selector once per region instead of once per column.
        trainMask = (trainDf['Province_State'] == state) & (trainDf['Country_Region'] == country)
        testMask = (saveDf['Province_State'] == state) & (saveDf['Country_Region'] == country)

        # Nothing to forecast for a region that is absent from the test set.
        if not testMask.any():
            continue

        regionTrain = trainDf.loc[trainMask]

        # Dates become int64 values; reshape to a single-feature 2-D matrix
        # for xgboost.
        trainDate = regionTrain['Date'].astype(np.int64).values.reshape(-1, 1)
        testDate = saveDf.loc[testMask, 'Date'].astype(np.int64).values.reshape(-1, 1)

        # One independent model per target (confirmed cases, fatalities).
        for target in TARGET_COLUMNS:
            trainTarget = regionTrain[target].astype(np.int64).values
            # Write straight into saveDf with .loc; assigning through a sliced
            # copy is the chained-assignment pattern pandas warns about.
            saveDf.loc[testMask, target] = fitAndPredict(trainDate, trainTarget, testDate)

HBox(children=(FloatProgress(value=0.0, max=180.0), HTML(value='')))




In [6]:
# Build the submission table: the three required columns cast to 32-bit
# integers, written to 'submission.csv' without the index, then displayed.
submissionColumns = ['ForecastId', 'ConfirmedCases', 'Fatalities']
output = saveDf[submissionColumns].astype(np.int32)
output.to_csv('submission.csv', index=False)
output

Unnamed: 0,ForecastId,ConfirmedCases,Fatalities
0,1,94,3
1,2,109,3
2,3,109,3
3,4,119,3
4,5,170,3
...,...,...,...
13153,13154,8,0
13154,13155,8,0
13155,13156,8,0
13156,13157,8,0
