In [1]:
import numpy as np 
import pandas as pd 

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor, XGBRFRegressor

In [2]:
# Directory holding the three competition CSV files; edit this one line if the data lives elsewhere.
INPUT_DIR = '../input/covid19-global-forecasting-week-5'

train = pd.read_csv(INPUT_DIR + '/train.csv')            # rows with the TargetValue label
test = pd.read_csv(INPUT_DIR + '/test.csv')              # rows to forecast
submission = pd.read_csv(INPUT_DIR + '/submission.csv')  # loaded for reference; not read by the cells shown here

In [3]:
# Seed passed to both train_test_split and XGBRegressor in the cells below.
RANDOM_STATE = 0

In [4]:
def FeatureEngineering(df):
    """Rewrite the ``Date`` and ``Country_Region`` columns of ``df`` in place.

    * ``Date`` becomes an integer of the form YYYYMMDD (e.g. 20200123).
    * ``Country_Region`` is replaced by ``Province_State`` wherever the latter
      is present, and keeps its own value otherwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``Date``, ``Country_Region`` and ``Province_State`` columns.

    Returns
    -------
    pandas.DataFrame
        The very same object that was passed in (mutated, not copied), so
        callers holding another reference to it see the new columns too.

    Notes
    -----
    Calling the function again on its own output leaves the frame unchanged,
    which keeps a notebook cell safe to re-run.
    """
    # Parse from the string form: an already-converted integer such as 20200123
    # would otherwise be read as nanoseconds since the epoch and turn into 19700101.
    df['Date'] = (
        pd.to_datetime(df['Date'].astype(str))
        .dt.strftime('%Y%m%d')
        .astype(int)
    )

    # Vectorized form of "use Province_State when it is present": keep
    # Country_Region only on the rows where Province_State is missing.
    df['Country_Region'] = df['Country_Region'].where(
        df['Province_State'].isna(), df['Province_State']
    )

    return df

# FeatureEngineering assigns to the columns of the frame it receives and returns that
# same object, so `df` is an alias of `train`; later cells that read `train` see the
# engineered Date / Country_Region columns.
df = FeatureEngineering(train)

df.head()

Unnamed: 0,Id,County,Province_State,Country_Region,Population,Weight,Date,Target,TargetValue
0,1,,,Afghanistan,27657145,0.058359,20200123,ConfirmedCases,0.0
1,2,,,Afghanistan,27657145,0.583587,20200123,Fatalities,0.0
2,3,,,Afghanistan,27657145,0.058359,20200124,ConfirmedCases,0.0
3,4,,,Afghanistan,27657145,0.583587,20200124,Fatalities,0.0
4,5,,,Afghanistan,27657145,0.058359,20200125,ConfirmedCases,0.0


In [5]:
# Name the model inputs explicitly instead of slicing by position, so a reordered or
# extended CSV cannot silently change which columns are used as features.
FEATURE_COLUMNS = ['Country_Region', 'Population', 'Weight', 'Date', 'Target']
TARGET_COLUMN = 'TargetValue'

# NOTE(review): this is a random row-wise split of time-series data, so validation rows
# sit between training dates; the resulting score may be optimistic for true
# forecasting. Consider a date-based hold-out.
X_train, X_val, y_train, y_val = train_test_split(
    train[FEATURE_COLUMNS],
    train[TARGET_COLUMN],
    test_size=0.2,
    random_state=RANDOM_STATE,
)

X_train.head()

Unnamed: 0,Country_Region,Population,Weight,Date,Target
192508,Illinois,108669,0.086236,20200324,ConfirmedCases
320827,Michigan,28405,0.975195,20200212,Fatalities
429229,Nebraska,5246,1.167486,20200411,Fatalities
348155,Minnesota,23222,0.994738,20200427,Fatalities
28924,Guatemala,16176133,0.060244,20200209,ConfirmedCases


In [6]:
# Route every feature column to exactly one sub-pipeline. ColumnTransformer drops
# columns that are not listed, so selecting by the generic 'number' kind (rather than
# the exact 'int64'/'float64'/'object' dtypes) avoids silently losing a column whose
# dtype differs by platform or pandas version (e.g. an int32 Date).
numeric_columns = X_train.select_dtypes(include='number').columns.tolist()
categorical_columns = [col for col in X_train.columns if col not in numeric_columns]

# Numeric features: fill gaps with the column median, then standardize.
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical features: fill gaps with the literal 'missing', then one-hot encode.
# handle_unknown='ignore' lets prediction proceed on categories unseen during fit.
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

Transformer = ColumnTransformer(
    n_jobs=-1,
    transformers=[
        ('num', num_pipe, numeric_columns),
        ('cat', cat_pipe, categorical_columns),
    ],
)

# Preprocessing and regressor live in one Pipeline so the same fitted transforms are
# applied at prediction time.
Model = Pipeline([
    ('preprocessor', Transformer),
    ('predictor', XGBRegressor(n_jobs=-1, random_state=RANDOM_STATE)),
])

Model.fit(X_train, y_train)

# Score of the fitted pipeline on the held-out split (R^2 for scikit-learn style regressors).
Model.score(X_val, y_val)

0.9485563645292638

In [7]:
# Same in-place transformation as for the training frame: `df` now refers to the
# very same object as `test`, with Date and Country_Region rewritten.
df = FeatureEngineering(test)

df.head()

Unnamed: 0,ForecastId,County,Province_State,Country_Region,Population,Weight,Date,Target
0,1,,,Afghanistan,27657145,0.058359,20200427,ConfirmedCases
1,2,,,Afghanistan,27657145,0.583587,20200427,Fatalities
2,3,,,Afghanistan,27657145,0.058359,20200428,ConfirmedCases
3,4,,,Afghanistan,27657145,0.583587,20200428,Fatalities
4,5,,,Afghanistan,27657145,0.058359,20200429,ConfirmedCases


In [8]:
# Read the features from `test` rather than `df`: FeatureEngineering modified `test`
# in place, so it is the same data, and this cell stays re-runnable even though `df`
# is rebound to the prediction table just below.
pred = Model.predict(test[X_train.columns.tolist()])

# 'Id' is the row label of `test` for now; the next cell adds 1 to it, presumably to
# match the 1-based ForecastId (it does for the rows shown by test.head()).
df = pd.DataFrame({'Id': test.index, 'TargetValue': pred.tolist()})

In [9]:
# Quantile levels, kept as strings because they double as column names here and as
# the suffix of the submission key in the next cell.
q = ['0.05', '0.5', '0.95']

# NOTE(review): `df` carries one prediction per Id, so each group contains a single
# value and the three quantiles come out identical (see the displayed head). Real
# prediction intervals would need a quantile-aware model.
predictions_by_id = df.groupby(['Id'])['TargetValue']

# One Series per level (indexed by Id, named after the level), joined side by side in
# a single concat. `axis` is passed by keyword: the positional form used previously
# is rejected by pandas 2.x.
dfq = pd.concat(
    [predictions_by_id.quantile(q=float(level)).rename(level) for level in q],
    axis=1,
).reset_index()

# Shift the 0-based row label to a 1-based identifier.
dfq['Id'] = dfq['Id'] + 1

dfq.head()

Unnamed: 0,Id,0.05,0.5,0.95
0,1,54.975441,54.975441,54.975441
1,2,7.828991,7.828991,7.828991
2,3,54.975441,54.975441,54.975441
3,4,7.828991,7.828991,7.828991
4,5,54.975441,54.975441,54.975441


In [10]:
# Wide -> long: one row per (Id, quantile level); the level name lands in 'variable'
# and the predicted value in 'TargetValue'.
long_form = (
    dfq
    .melt(id_vars=['Id'], value_vars=q)
    .rename(columns={'value': 'TargetValue'})
)

# Submission key has the form "<Id>_<quantile>", e.g. "1_0.05".
submission_keys = long_form['Id'].astype(str) + '_' + long_form['variable']
df = long_form.assign(ForecastId_Quantile=submission_keys)

# Only the key and the value are written out; the frame's index is left out of the file.
df.loc[:, ['ForecastId_Quantile', 'TargetValue']].to_csv('submission.csv', index=False)