**One Step Ahead Prediction**

In [0]:
# Lib
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV

# Finding RMSE
def ErrorCalc(mdl, ref, tag):
    relError = np.abs(mdl - ref)/ np.abs(ref+1)
    MeanErrorV = np.mean(relError)
    print(tag + ': Mean Rel Error in %: ', MeanErrorV * 100)
    return MeanErrorV

# Since cumulative prediction
def AdjustingErrorsOutliers(tempPred, df) :
    tempPred = np.round(tempPred)
    tempPrev = df['day5'].to_numpy() # Next cumulative prediction must be more than or equal to previous
    for i in range(len(tempPred)):
        if tempPred[i] < tempPrev[i] : # Since cumulative prediction
            tempPred[i] = tempPrev[i]
    return tempPred

# Train model
def TrainMdl (trainIpData, trainOpData, PredictionData) :
    testSize = 0.1 # 90:10 ratio >> for final testing
    #randomState = 42 # For train test split

    print('Training starts ...')

    totalIte = 2

    for iLoop in range(totalIte):

        if iLoop == 0 :
            randomState = 42
        else :
            randomState=None


        # Final validation
        X_train, X_test, y_train, y_test = train_test_split(trainIpData, trainOpData, test_size=testSize, random_state=randomState)

        # Extrating features
        TrainIP = X_train[['diff1', 'diff2', 'diff3', 'diff4', 'tempVal', 'ageVal']]
        TrainOP = X_train['gammaFun']
        TestIP = X_test[['diff1', 'diff2', 'diff3', 'diff4', 'tempVal', 'ageVal']]
        TestOP = X_test['gammaFun']


        # Adaboost Regressor >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
        treeDepth = 10 # Fixed
        mdl = DecisionTreeRegressor(max_depth=treeDepth) # This is fixed
        param_grid = {
        'n_estimators': [10, 50, 100, 250, 500],
        'learning_rate': [0.3, 0.2, 0.1, 0.01, 0.001]
                    }
        regrMdl = AdaBoostRegressor(base_estimator=mdl)
        clf = RandomizedSearchCV(estimator = regrMdl, param_distributions = param_grid,
                                         n_iter = 100, cv = 3, verbose=0, random_state=42, n_jobs = -1)
        clf.fit(TrainIP, TrainOP)


        # Calculating Error >> X_train is a superset of TrainIP
        y_predictedTrain = clf.predict(TrainIP) # Predicting the gamma function
        y_predictedTrain = AdjustingErrorsOutliers(y_predictedTrain * X_train['day5'].to_numpy(), X_train)
        if iLoop == totalIte-1 :
          ErrorCalc(y_predictedTrain, y_train.to_numpy(), 'Train Data-set') # y_predictedTrain converted to numbers

        y_predictedTest = clf.predict(TestIP) # Predicting the gamma function
        y_predictedTest = AdjustingErrorsOutliers(y_predictedTest * X_test['day5'].to_numpy(), X_test)
        if iLoop == totalIte-1 :
          ErrorCalc(y_predictedTest, y_test.to_numpy(), 'Validation Data-set ') # y_predictedTest converted to numbers

        # print('-----------------------------------------------------------')

        # Extrating primary features
        PredictionDataF = PredictionData[['diff1', 'diff2', 'diff3', 'diff4', 'tempVal', 'ageVal']]

        # Prediction
        if iLoop == 0 :
            finalPrediction = clf.predict(PredictionDataF)  # Predicting the gamma function
            tempPred = finalPrediction * PredictionData['day5'].to_numpy()
            y_predictedFinal0 = AdjustingErrorsOutliers(tempPred, PredictionData)
        else :
            finalPrediction = clf.predict(PredictionDataF)  # Predicting the gamma function
            tempPred = finalPrediction * PredictionData['day5'].to_numpy()
            y_predictedFinal = AdjustingErrorsOutliers(tempPred, PredictionData)
            y_predictedFinal0 = y_predictedFinal0 + y_predictedFinal

    y_predictedFinal0 = np.round(y_predictedFinal0 / totalIte)
    return y_predictedFinal0


# Main code starts
# Settings
displayAll = 1
if displayAll == 1 :
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)

# Adjust John Hopkins Dataset
worldCorona = pd.read_csv('https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv')
worldCorona = worldCorona.fillna(0)
worldCorona = worldCorona.drop(['Province/State'], axis=1)
worldCorona['Country'] = worldCorona['Country/Region']
worldCorona = worldCorona.drop(['Country/Region', 'Lat', 'Long'], axis=1)
worldCorona = worldCorona.groupby(['Country']).sum()
worldCorona.to_csv('Hellos.csv')
worldCorona = pd.read_csv('Hellos.csv')

# Total countries in the world
totalCountries = pd.read_csv('https://raw.githubusercontent.com/neilay-khasnabish/COVID-19/master/WorldCountryNames.csv')
print('Countries infected: ', np.shape(worldCorona)[0], '/', np.shape(totalCountries)[0])

# Countrywise temperature
countryTemp = pd.read_csv('https://raw.githubusercontent.com/neilay-khasnabish/COVID-19/master/Temp.csv')

# Countrywise age
countryAge = pd.read_csv('https://raw.githubusercontent.com/neilay-khasnabish/COVID-19/master/MedAge.csv')

# Merging all three
result1 = pd.merge(worldCorona, countryTemp, on='Country').reset_index(drop=True)
result = pd.merge(result1, countryAge, on='Country').reset_index(drop=True)
#print('Final size of merged data (rows equal to number of countries to be processed): ', np.shape(result))
result.to_csv('hellow.csv')

# Creating dataframe for training
[rf, cf] = np.shape(result)
#print('Row', rf, '| Col: ', cf)
df=[]
for i in range(rf): # It scans through the entire row
    iCol = 6 # Start index
    while iCol <= cf-4 :
        dayPredict = result.iloc[i, iCol+1]
        day5 = result.iloc[i, iCol]
        day4 = result.iloc[i, iCol-1]
        day3 = result.iloc[i, iCol-2]
        day2 = result.iloc[i, iCol-3]
        day1 = result.iloc[i, iCol-4]
        diff1 = day5 - day4
        diff2 = day4 - day3
        diff3 = day3 - day2
        diff4  = day2 - day1
        iCol = iCol + 1
        ageVal = result.iloc[i, cf - 1]
        tempVal = result.iloc[i, cf - 2]
        dividen = day5 + 1
        gammaFun = float(dayPredict) / dividen
        data = {'day1': [day1], 'day2': [day2], 'day3': [day3], 'day4': [day4], 'day5': [day5], 'tempVal': [tempVal], 'ageVal': [ageVal],
                'dayPredict': [dayPredict], 'gammaFun': [gammaFun], 'diff1': [diff1], 'diff2': [diff2], 'diff3': [diff3], 'diff4': [diff4]}
        df2 = pd.DataFrame(data)
        df.append(df2)

df = pd.concat(df).reset_index(drop=True)
df = df.fillna(0)
df.to_csv('TrainTest.csv')

# Preparing real-time prediction data
dfP=[]
for i in range(rf): # It scans through the entire row
    day5 = result.iloc[i, cf - 3]
    day4 = result.iloc[i, cf - 4]
    day3 = result.iloc[i, cf - 5]
    day2 = result.iloc[i, cf - 6]
    day1 = result.iloc[i, cf - 7]
    diff1 = day5 - day4
    diff2 = day4 - day3
    diff3 = day3 - day2
    diff4 = day2 - day1
    ageVal = result.iloc[i, cf - 1]
    tempVal = result.iloc[i, cf - 2]
    countryName = result.iloc[i, 0]
    data = {'day1': [day1], 'day2': [day2], 'day3': [day3], 'day4': [day4], 'day5': [day5], 'tempVal': [tempVal], 'ageVal': [ageVal], 'Country': [countryName],
            'diff1': [diff1], 'diff2': [diff2], 'diff3': [diff3], 'diff4': [diff4]}
    df2 = pd.DataFrame(data)
    dfP.append(df2)

dfP = pd.concat(dfP).reset_index(drop=True)
dfP.to_csv('Predict.csv')

df = pd.read_csv('TrainTest.csv')
dfP = pd.read_csv('Predict.csv')
trainIpData = df[['day1', 'day2', 'day3', 'day4', 'day5', 'tempVal', 'ageVal', 'gammaFun', 'diff1', 'diff2', 'diff3', 'diff4']]
trainOpData = df['dayPredict']
PredictionData = dfP[['day1', 'day2', 'day3', 'day4', 'day5', 'tempVal', 'ageVal', 'diff1', 'diff2', 'diff3', 'diff4']]
predictions = TrainMdl (trainIpData, trainOpData, PredictionData)
dfP['NextPredictions'] = predictions
dfP['LatestNumberCases'] = dfP['day5']
dfP[['Country', 'LatestNumberCases', 'NextPredictions']].to_csv('CountryWisePredictions.csv')
pdPredicts = pd.read_csv('CountryWisePredictions.csv').reset_index(drop=True).drop(['Unnamed: 0'], axis = 1)
[rP,cP] = np.shape(pdPredicts)
print(pdPredicts.head(rP))

('Countries infected: ', 185, '/', 195)
Training starts ...




('Train Data-set: Mean Rel Error in %: ', 4.528894470699138)
('Validation Data-set : Mean Rel Error in %: ', 5.714245567770806)
           Country  LatestNumberCases  NextPredictions
0      Afghanistan                906            997.0
1          Albania                539            619.0
2          Algeria               2418           2681.0
3          Andorra                696            829.0
4           Angola                 19             19.0
5        Argentina               2669           2932.0
6          Armenia               1201           1320.0
7        Australia               6522           7158.0
8          Austria              14595          15749.0
9       Azerbaijan               1340           1487.0
10         Bahrain               1740           1882.0
11      Bangladesh               1838           2024.0
12        Barbados                 75             82.0
13         Belarus               4779           5260.0
14         Belgium              36138          

**Playground**

In [0]:
#To be made