# COVID19 (prediction)

In [1]:
# импорт библиотек

import pandas as pd
import datetime as dt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31926 entries, 0 to 31925
Data columns (total 6 columns):
Id                31926 non-null int64
Province_State    13566 non-null object
Country_Region    31926 non-null object
Date              31926 non-null object
ConfirmedCases    31926 non-null float64
Fatalities        31926 non-null float64
dtypes: float64(2), int64(1), object(3)
memory usage: 1.5+ MB


## Подготовка данных 

In [4]:
empty = 'empty_val'

def preparation(df): # функция переименовывает столбцы,  заполняет пустые значения, изменяет тип данных в датасете
    df.rename(columns={'Province_State': 'State', 'Country_Region': 'Country'}, inplace=True)
    df.fillna(empty, inplace=True)
    df['Date'] = df['Date'].astype('datetime64[ns]')
    return df

train = preparation(train)
test = preparation(test)

In [5]:
# замена строковых значений Country и State с помощью LabelEncoder
le_country = preprocessing.LabelEncoder()

train['Country'] = le_country.fit_transform(train['Country'])
test['Country'] = le_country.fit_transform(test['Country'])

le_state = preprocessing.LabelEncoder()

train['State'] = le_country.fit_transform(train['State'])
test['State'] = le_country.fit_transform(test['State'])

In [6]:
def create_features(df): # функция создает новые фичи: день, месяц, день недели, день года, квартал, номер недели
    df['Day'] = df['Date'].dt.day
    df['Month'] = df['Date'].dt.month
    df['Day_of_week'] = df['Date'].dt.dayofweek
    df['Day_of_year'] = df['Date'].dt.dayofyear
    df['Quarter'] = df['Date'].dt.quarter
    df['Week_of_year'] = df['Date'].dt.weekofyear
    return df

train = create_features(train)
test = create_features(test)

In [7]:
columns = ['State', 'Country', 'Day', 'Month', 'Day_of_week', 'Day_of_year', 'Quarter', 'Week_of_year']
X = train[columns]

y_confirmed_cases = train[['ConfirmedCases']]
y_fatalities = train[['Fatalities']]

In [8]:
test = test.drop(['Date', 'ForecastId'], axis=1)

## Обучение модели

In [9]:
clf = XGBRegressor()

parameters = {'nthread':[1, 2, 3, 4], 
              'learning_rate': [.03, 0.05, .07], 
              'max_depth': [3, 5, 7],
              'min_child_weight': [3, 4, 5],
              'silent': [1],
              'subsample': [0.7],
              'colsample_bytree': [0.7],
              'n_estimators': [100, 250, 500]}

In [10]:
grid_search_cv = GridSearchCV(clf, parameters, cv=5, n_jobs=-1)

In [11]:
grid_search_cv.fit(X, y_confirmed_cases)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constrain...
                                    subsample=None, tree_method=None,
                                    validate_parameters=False, verbosity=None),
             iid='warn', n_jobs=-1,
             param_grid={'colsample_bytree': [0.7],
                         'learning_rate': [0.03, 0.05, 0.07],
                         'max

In [12]:
best_clf_cc = grid_search_cv.best_estimator_

In [13]:
grid_search_cv.best_params_

{'colsample_bytree': 0.7,
 'learning_rate': 0.03,
 'max_depth': 3,
 'min_child_weight': 5,
 'n_estimators': 100,
 'nthread': 1,
 'silent': 1,
 'subsample': 0.7}

In [14]:
best_clf_cc.fit(X, y_confirmed_cases)

XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.03, max_delta_step=0, max_depth=3,
             min_child_weight=5, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=1, nthread=1, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, silent=1, subsample=0.7,
             tree_method=None, validate_parameters=False, verbosity=None)

In [20]:
y_pred_cc = best_clf_cc.predict(test)

In [21]:
best_clf_cc.fit(X, y_fatalities)

XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.03, max_delta_step=0, max_depth=3,
             min_child_weight=5, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=1, nthread=1, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, silent=1, subsample=0.7,
             tree_method=None, validate_parameters=False, verbosity=None)

In [22]:
y_pred_f = best_clf_cc.predict(test)

In [27]:
forecastId = pd.read_csv('test.csv')
forecastId = forecastId['ForecastId']

In [28]:
final = pd.DataFrame({'ForecastId': forecastId, 'ConfirmedCases': y_pred_cc, 'Fatalities': y_pred_f})

In [29]:
final.to_csv('submission.csv', index=False)