In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
df = pd.read_csv('aggregate_covid_country.csv')

In [3]:
df.head()

Unnamed: 0,Date,Country,Confirmed,Recovered,Deaths
0,2020-01-22,Afghanistan,0,0,0
1,2020-01-22,Albania,0,0,0
2,2020-01-22,Algeria,0,0,0
3,2020-01-22,Andorra,0,0,0
4,2020-01-22,Angola,0,0,0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23688 entries, 0 to 23687
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Date       23688 non-null  object
 1   Country    23688 non-null  object
 2   Confirmed  23688 non-null  int64 
 3   Recovered  23688 non-null  int64 
 4   Deaths     23688 non-null  int64 
dtypes: int64(3), object(2)
memory usage: 925.4+ KB


In [5]:
le = LabelEncoder()

df['Country'] = le.fit_transform(df['Country'])

In [6]:
df

Unnamed: 0,Date,Country,Confirmed,Recovered,Deaths
0,2020-01-22,0,0,0,0
1,2020-01-22,1,0,0,0
2,2020-01-22,2,0,0,0
3,2020-01-22,3,0,0,0
4,2020-01-22,4,0,0,0
...,...,...,...,...,...
23683,2020-05-26,183,429,365,3
23684,2020-05-26,184,9,6,1
23685,2020-05-26,185,249,10,49
23686,2020-05-26,186,920,336,7


In [7]:
df['Date'] = le.fit_transform(df['Date'])

In [8]:
df

Unnamed: 0,Date,Country,Confirmed,Recovered,Deaths
0,0,0,0,0,0
1,0,1,0,0,0
2,0,2,0,0,0
3,0,3,0,0,0
4,0,4,0,0,0
...,...,...,...,...,...
23683,125,183,429,365,3
23684,125,184,9,6,1
23685,125,185,249,10,49
23686,125,186,920,336,7


In [9]:
from scipy.stats import zscore
import numpy as np
z = np.abs(zscore(df))
np.where(z>3)

(array([ 7932,  8120,  8308,  8496,  8684,  8872,  9060,  9248,  9436,
         9624,  9812, 10000, 10188, 10376, 10564, 10752, 10940, 11128,
        11316, 11504, 11692, 11880, 12068, 12256, 12444, 12632, 12820,
        13008, 13146, 13196, 13245, 13334, 13384, 13433, 13522, 13572,
        13621, 13710, 13760, 13809, 13898, 13948, 13997, 14070, 14086,
        14136, 14185, 14258, 14274, 14274, 14324, 14373, 14446, 14462,
        14462, 14512, 14561, 14634, 14634, 14650, 14650, 14700, 14730,
        14749, 14822, 14822, 14838, 14838, 14888, 14914, 14918, 14937,
        15010, 15010, 15026, 15026, 15076, 15102, 15106, 15125, 15198,
        15198, 15214, 15214, 15264, 15290, 15294, 15313, 15386, 15386,
        15402, 15402, 15452, 15478, 15482, 15501, 15574, 15574, 15590,
        15590, 15594, 15640, 15666, 15670, 15685, 15689, 15762, 15762,
        15778, 15778, 15782, 15828, 15854, 15858, 15873, 15877, 15950,
        15950, 15966, 15966, 15966, 15970, 16016, 16042, 16046, 16061,
      

In [10]:
z[7932][3]

3.13147210893684

In [11]:
z[23678][4]

9.167424195549778

In [12]:
df_new = df[(z<=3).all(axis=1)]

In [13]:
df.shape

(23688, 5)

In [14]:
df_new.shape

(23177, 5)

In [15]:
df.skew()


Date          0.000000
Country       0.000000
Confirmed    18.674502
Recovered    10.662286
Deaths       14.218167
dtype: float64

In [16]:
from sklearn.preprocessing import power_transform

In [17]:
X = df.drop(['Deaths'], axis=1)
y = df['Deaths']

In [18]:
X = pd.DataFrame(power_transform(X), columns=X.columns)

In [19]:
sc = StandardScaler()
X = pd.DataFrame(sc.fit_transform(X), columns=X.columns)

In [20]:
X.head()

Unnamed: 0,Date,Country,Confirmed,Recovered
0,-2.08504,-2.109559,-1.125252,-0.931988
1,-2.08504,-2.052431,-1.125252,-0.931988
2,-2.08504,-2.003294,-1.125252,-0.931988
3,-2.08504,-1.958719,-1.125252,-0.931988
4,-2.08504,-1.917256,-1.125252,-0.931988


In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3 ,random_state=42)

In [22]:
X_train.shape

(16581, 4)

In [23]:
y_train.shape

(16581,)

In [24]:
lm = LinearRegression()

In [25]:
lm.fit(X_train, y_train)

LinearRegression()

In [26]:
lm.coef_

array([-519.60348942,  245.97666398, 1196.64167178,  151.18655951])

In [27]:
lm.intercept_

511.96693486239457

In [28]:
lm.score(X_train, y_train)

0.06866235853631264

In [29]:
pred = lm.predict(X_test)
print('Predicted result price: ', pred)
print('actual price: ', y_test)

Predicted result price:  [-423.02624992  -65.17158219 -410.28158296 ... -605.16053325  935.32173121
 -669.57800774]
actual price:  2300        0
14284       0
7298        0
7733        0
19527    1784
         ... 
1330        0
14878      24
9431        0
16168      30
4744        0
Name: Deaths, Length: 7107, dtype: int64


In [30]:
print('Error: ')

print('Mean absolute error: ', mean_absolute_error(y_test,pred))
print('Mean squared error: ', mean_squared_error(y_test,pred))


print('Root mean square error: ', np.sqrt(mean_squared_error(y_test, pred)))

Error: 
Mean absolute error:  1197.8302703836282
Mean squared error:  17154142.498526435
Root mean square error:  4141.755968007584


In [31]:
from sklearn.metrics import r2_score

print(r2_score(y_test, pred))

0.06831716067385551


In [32]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.datasets import make_regression

In [33]:
X, y = make_regression(n_features=4, n_informative=2, random_state=83, shuffle=False)

In [38]:
regr = AdaBoostRegressor(random_state=56, n_estimators=100)

In [39]:
regr.fit(X, y)

AdaBoostRegressor(n_estimators=100, random_state=56)

In [40]:
regr.score(X_train, y_train)

-0.015310209016405674

In [42]:
#Defining function for best random_state
def get_best_rstate(r,model,x,y,test_size=0.25):
    best_rState = 0
    best_r2Score = 0
    for i in r:
        x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=test_size,random_state=i)
        
        model.fit(x_train,y_train)
        predict_y = model.predict(x_test)
        
        temp_r2Score = r2_score(y_test,predict_y)
        if temp_r2Score>best_r2Score:
            best_r2Score = temp_r2Score
            best_rState = i
            
    return best_rState,best_r2Score


#Defining function for best CV
def get_best_cv(model,parameters,x_train,y_train,r=range(2,20)):
    best_cv = 0
    best_cvScore = 0
    for i in r:
        gscv = GridSearchCV(model,parameters)
        gscv.fit(x_train,y_train)
        
        temp_cvScore = cross_val_score(gscv.best_estimator_,x_train,y_train,cv=i).mean()
        if temp_cvScore>best_cvScore:
            best_cvScore = temp_cvScore
            best_cv = i
            
    return best_cv,best_cvScore

#Defining function for building models
def build_model(models,x,y,r_range=range(100),t_size=0.25,cv_range=range(2,20)):
    for i in models:
        #Finding the best random_state for train test split
        best_rState, best_r2Score = get_best_rstate(r_range,models[i]["name"],x,y)
        
        #Splitting train test data with best random_state
        x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=t_size,random_state=best_rState)
        
        #Hypertuning Parameters
        #Finding best CV
        best_cv, best_cvScore = get_best_cv(models[i]["name"],models[i]["parameters"],x_train,y_train,cv_range)
        
        #Building final model with hypertuned parameters
        gscv = GridSearchCV(models[i]["name"],models[i]["parameters"],cv=best_cv)
        gscv.fit(x_train,y_train)
        
        #Checking Final Performance of the model
        predict_y = gscv.best_estimator_.predict(x_test)
        r2Score = r2_score(y_test,predict_y)
        mse = mean_squared_error(y_test,predict_y)
        mae = mean_absolute_error(y_test,predict_y)
        
        #Storing model specs.
        models[i]["random_state"] = best_rState
        models[i]["initial_r2_score"] = best_r2Score
        models[i]["x_train"] = x_train
        models[i]["x_test"] = x_test
        models[i]["y_train"] = y_train
        models[i]["y_test"] = y_test
        models[i]["cv"] = best_cv
        models[i]["cross_val_score"] = best_cvScore
        models[i]["gscv"] = gscv
        models[i]["predict_y"] = predict_y
        models[i]["r2_score"] = r2Score
        models[i]["mse"] = mse
        models[i]["rmse"] = np.sqrt(mse)
        models[i]["mae"] = mae
        
    return models;

In [44]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

In [45]:
import warnings
warnings.simplefilter('ignore')
from sklearn.linear_model import LinearRegression, Lasso, Ridge, SGDRegressor
from sklearn.ensemble import AdaBoostRegressor
#Preparing List of Models with parameters
models = {
    "LinearRegression":{
        "name": LinearRegression(),
        "parameters":{
            "fit_intercept":[True,False],
            "normalize":[True,False]
        }
    },
    "Lasso":{
        "name": Lasso(),
        "parameters":{
            "alpha":[0.1,1.0],
            "fit_intercept":[True,False],
            "normalize":[True,False],
            "selection":['cyclic','random']
        }
    },
    "Ridge":{
        "name": Ridge(),
        "parameters":{
            "alpha":[0.1,1.0],
            "fit_intercept":[True,False],
            "normalize":[True,False],
            "solver":['auto','svd','cholesky','lsqr','sparse_cg','sag','saga']
        }
    },
    "SGDRegressor":{
        "name": SGDRegressor(),
        "parameters":{
            "loss":['squared_loss','huber','epsilon_insensitive','squared_epsilon_insensitive'],
            "alpha":[0.00001,0.0001],
            "shuffle":[True,False]
        }
    },
    "AdaBoostRegressor":{
        "name": AdaBoostRegressor(),
        "parameters": {
            "loss": ['linear','square','exponential']
        }
    }
}

#Building models
build_models = build_model(models,X,y)

In [46]:
#Dispalying model performance
for i in build_models:
    model = build_models[i]
    print(f"START: {i}===================\n")
    print(f"Best random_state: {model['random_state']} with best r2_score: {model['initial_r2_score']}\n")
    print(f"Best CV: {model['cv']} with best cross_value_score: {model['cross_val_score']}\n")
    print(f"Best params: {model['gscv'].best_params_}\n")
    print(f"Final Performance:")
    print(f"R2_SCORE: {round(model['r2_score']*100,2)}%\t MSE: {model['mse']}\t RMSE: {model['rmse']}\t MAE: {model['mae']}\n")
    print(f"END: {i}=====================\n\n\n")


Best random_state: 0 with best r2_score: 1.0

Best CV: 2 with best cross_value_score: 1.0

Best params: {'fit_intercept': True, 'normalize': True}

Final Performance:
R2_SCORE: 100.0%	 MSE: 1.4193500951098533e-27	 RMSE: 3.767426303340058e-14	 MAE: 2.7462476737127874e-14





Best random_state: 68 with best r2_score: 0.9998687947497263

Best CV: 3 with best cross_value_score: 0.9999976067082401

Best params: {'alpha': 0.1, 'fit_intercept': False, 'normalize': True, 'selection': 'cyclic'}

Final Performance:
R2_SCORE: 100.0%	 MSE: 0.010201366662749034	 RMSE: 0.10100181514581327	 MAE: 0.08166938672089813





Best random_state: 35 with best r2_score: 0.9998900355757279

Best CV: 7 with best cross_value_score: 0.9999977487970041

Best params: {'alpha': 0.1, 'fit_intercept': False, 'normalize': True, 'solver': 'lsqr'}

Final Performance:
R2_SCORE: 100.0%	 MSE: 0.006008718772691862	 RMSE: 0.07751592592939764	 MAE: 0.06309896796796301





Best random_state: 60 with best r2_score: 0.99999944