In [None]:
import numpy as np
import pandas as pd

In [None]:
import matplotlib.pyplot as plt

In [None]:
%matplotlib inline

In [None]:
# Read the training and test tables from the working directory.
df, df_test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Replace every missing value in both tables with the literal string 'NaN'.
df, df_test = df.fillna('NaN'), df_test.fillna('NaN')

In [None]:
# Sum confirmed cases and fatalities per (Date, Country_Region) pair.
dates_countries = (
    df.groupby(['Date', 'Country_Region'], as_index=False)
      .agg({'ConfirmedCases': 'sum', 'Fatalities': 'sum'})
)
dates_countries['Date'] = pd.to_datetime(dates_countries['Date'])

# Keep only dates strictly inside the open window (2020-01-19, 2020-03-19).
training_mask = (
    (pd.to_datetime('2020-01-19') < dates_countries['Date'])
    & (dates_countries['Date'] < pd.to_datetime('2020-03-19'))
)
data = dates_countries[training_mask].copy()

# Move both targets to log1p scale. The columns are replaced outright (rather
# than written through `.loc[:, ...]`) so the result is stored as float even
# if the summed counts were integer-typed.
data[['ConfirmedCases', 'Fatalities']] = np.log1p(data[['ConfirmedCases', 'Fatalities']])

# log1p yields -inf for an input of exactly -1; map any infinities to 0.
data = data.replace([np.inf, -np.inf], 0)

In [None]:
# NOTE: this cell used to repeat the log1p transform from the previous cell
# verbatim, so a top-to-bottom run stored log1p(log1p(x)) in both target
# columns. The targets are already on log1p scale at this point; the cell now
# only summarises them and no longer modifies `data`.
data[['ConfirmedCases', 'Fatalities']].describe()

In [None]:
def lagging(df, lags=10):
    """Build lagged copies of every column of ``df``.

    For each ``i`` in ``1 .. lags - 1`` the frame is shifted down by ``i`` rows
    (the vacated leading rows are filled with 0) and each column is renamed to
    ``<name>_<i>``. The shifted frames are concatenated side by side.

    The upper bound is exclusive: the default ``lags=10`` produces lags
    1 through 9.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be lagged; rows are assumed to already be
        in chronological order.
    lags : int, default 10
        Exclusive upper bound on the lag distance.

    Returns
    -------
    pandas.DataFrame
        Frame with the same index as ``df`` and ``(lags - 1) * df.shape[1]``
        columns. When ``lags <= 1`` an empty frame (no columns) carrying
        ``df``'s index is returned.
    """
    # add_suffix also copes with non-string column labels, unlike `x + '_i'`.
    lagged_frames = [
        df.shift(i, fill_value=0).add_suffix(f'_{i}')
        for i in range(1, lags)
    ]

    # pd.concat raises on an empty list, so handle "no lags requested" here.
    if not lagged_frames:
        return pd.DataFrame(index=df.index)

    return pd.concat(lagged_frames, axis=1)

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
# Derive calendar features from the Date column: an integer code per distinct
# date, plus the day-of-month and month numbers.
le = LabelEncoder()
data['Day_num'] = le.fit_transform(data['Date'])
data['Day'], data['Month'] = data['Date'].dt.day, data['Date'].dt.month

In [None]:
# Build one feature frame per country: the country's rows plus lagged copies
# of both targets. Only Italy is processed for now; iterate over
# dates_countries['Country_Region'].unique() to cover every country.
country_dfs = []
for country in ['Italy']:
    country_mask = data['Country_Region'] == country
    country_df = data.loc[country_mask]
    lags_df = lagging(country_df[['ConfirmedCases', 'Fatalities']])
    # lagging() fills the vacated rows with 0, so dropna() only removes rows
    # that already had missing values in the original columns.
    country_df = pd.concat([country_df, lags_df], axis=1).dropna()
    # Collect the frame; previously the list was created but never filled.
    country_dfs.append(country_df)

# `country_df` still refers to the last country processed; later cells use it.

## ML Brute-force

In [None]:
def timeseries_train_test_split(X, y, test_size):
    """
        Chronological train/test split: the first ``1 - test_size`` share of
        the rows becomes the training set and the remaining rows the test set.
        No shuffling is done. ``X`` and ``y`` must support ``.iloc``.

        Returns (X_train, X_test, y_train, y_test).
    """
    # position of the first test row
    split_at = int(len(X) * (1 - test_size))
    head, tail = slice(None, split_at), slice(split_at, None)

    return X.iloc[head], X.iloc[tail], y.iloc[head], y.iloc[tail]

In [None]:
from sklearn.model_selection import TimeSeriesSplit

def timeseriesCVscore(model, X, y, n_splits, loss_function):
    """
        Returns the mean error over time-series cross-validation folds.

        model - estimator exposing fit(X, y) and predict(X); it is refitted
                on every fold
        X - feature matrix (pandas object or numpy array)
        y - target values aligned row-by-row with X
        n_splits - number of folds handed to TimeSeriesSplit
        loss_function - callable(y_true, y_pred) returning a scalar error
    """
    def take_rows(obj, positions):
        # TimeSeriesSplit yields positional row indices. Pandas objects need
        # .iloc for that; plain `obj[positions]` would select columns/labels.
        return obj.iloc[positions] if hasattr(obj, 'iloc') else obj[positions]

    errors = []

    # set the number of folds for cross-validation
    tscv = TimeSeriesSplit(n_splits=n_splits)

    # iterating over folds, train model on each, forecast and calculate error
    for train, test in tscv.split(X, y):
        model = model.fit(take_rows(X, train), take_rows(y, train))
        y_pred = model.predict(take_rows(X, test))
        y_true = take_rows(y, test)
        # ground truth first, prediction second (scikit-learn metric order)
        errors.append(loss_function(y_true, y_pred))

    return np.mean(np.array(errors))

In [None]:
import seaborn as sns
import math

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss, hinge_loss, f1_score, precision_score
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from sklearn import model_selection
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.model_selection import train_test_split

In [None]:
import warnings
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import tree
import warnings

# Suppress every warning for the rest of the session. This also hides warnings
# emitted by library code called in later cells.
warnings.filterwarnings("ignore")

In [None]:
from tqdm import tqdm_notebook as tqdm

In [None]:
# data_pred[['Predicted_ConfirmedCases', 'Predicted_Fatalities']] = data_pred[['Predicted_ConfirmedCases', 'Predicted_Fatalities']].apply(lambda x: np.expm1(x))
# data_pred.replace([np.inf, -np.inf], 0, inplace=True) 

In [None]:
# Feature matrix: everything except the identifiers and the two targets.
X_train = country_df.drop(['Date', 'Country_Region', 'ConfirmedCases', 'Fatalities'], axis=1)
# Targets as plain numpy arrays, one per quantity.
y_confirmed = country_df.loc[:, 'ConfirmedCases'].to_numpy()
y_fatalities = country_df.loc[:, 'Fatalities'].to_numpy()

In [None]:
# Sanity check: the feature matrix and the target should have the same number of rows.
print(X_train.shape, y_confirmed.shape)

In [None]:
# Candidate regressors, all with their default hyper-parameters. Each class
# appears once so that every model owns exactly one row in ML_compare and one
# entry in ML_predict.
ML_methods = [
    ensemble.AdaBoostRegressor(),
    ensemble.BaggingRegressor(),
    ensemble.ExtraTreesRegressor(),
    ensemble.RandomForestRegressor(),
    ensemble.GradientBoostingRegressor(),
    linear_model.PassiveAggressiveRegressor(),
    linear_model.Ridge(),
    neighbors.KNeighborsRegressor(),
    svm.SVR(),
    svm.NuSVR(),
    svm.LinearSVR(),
]

ML_columns = ['ML Name', 'ML Parameters','ML Train Error Mean', 'ML Test Error Mean', 'ML Test Error 3*STD' ,'ML Time']
ML_predict = {}
# Expanding-window folds that respect chronological order.
tscv = TimeSeriesSplit(n_splits=3)

# Collect one record per model and build the comparison table once at the end;
# this keeps the numeric columns numeric instead of object-typed.
ML_rows = []
# Wrapping the list (not the enumerate object) lets tqdm know the total.
for clf in tqdm(ML_methods):
    ML_name = clf.__class__.__name__
    print(f'Training: {ML_name}')
    cv_results = model_selection.cross_validate(
        clf, X_train, y_confirmed, cv=tscv,
        scoring='neg_mean_squared_error',
        return_train_score=True,  # needed to fill 'ML Train Error Mean'
    )

    # The scorer returns negated MSE, so flip the sign to report an error.
    ML_rows.append({
        'ML Name': ML_name,
        'ML Parameters': str(clf.get_params()),
        'ML Train Error Mean': -1 * cv_results['train_score'].mean(),
        'ML Test Error Mean': -1 * cv_results['test_score'].mean(),
        'ML Test Error 3*STD': 3 * cv_results['test_score'].std(),
        'ML Time': cv_results['fit_time'].mean(),
    })

    # Refit on all rows and keep the in-sample predictions for later inspection.
    clf.fit(X_train, y_confirmed)
    ML_predict[ML_name] = clf.predict(X_train)

ML_compare = pd.DataFrame(ML_rows, columns=ML_columns)

In [None]:
# Rank the models from lowest to highest mean CV test error and show the table.
ML_compare = ML_compare.sort_values(by=['ML Test Error Mean'], ascending=True)
ML_compare

In [None]:
# Horizontal bar chart of the mean cross-validated test error per model.
plt.figure(figsize=(15, 7))
sns.barplot(x='ML Test Error Mean', y = 'ML Name', data = ML_compare, color = 'm')
plt.title('Machine Learning Algorithms Score \n')
# The plotted values come from 'neg_mean_squared_error' on log-transformed
# targets with no square root taken, so they are not RMSLE.
plt.xlabel('Mean CV test MSE (log-transformed target)')
plt.ylabel('Algorithm');