#### Imporing Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn import linear_model
from sklearn.impute import SimpleImputer
from sklearn.ensemble import IsolationForest
from sklearn.metrics import r2_score
from sklearn.metrics import max_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_squared_log_error
from datetime import datetime

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
def generate_model_report(y_actual, y_predicted):
    print("r2 score = ", r2_score(y_actual, y_predicted))
    print("Max Error = ", max_error(y_actual, y_predicted))
    print("Mean Absolute Error = ", mean_absolute_error(y_actual, y_predicted))
    print("Mean Squared Error = ", mean_squared_error(y_actual, y_predicted))

In [4]:
train =pd.read_excel("Train_dataset.xlsx")
test = pd.read_excel("Test_dataset.xlsx")
time_series = pd.read_excel("Test_dataset.xlsx",sheet_name="Foreign_Visitors_TS")

In [5]:
train.head()

Unnamed: 0,City,State,Type,Population [2011],Popuation [2001],Sex Ratio,Median Age,Avg Temp,SWM,Toilets Avl,Water Purity,H Index,Female Population,# of hospitals,Foreign Visitors,Covid Cases
0,Mumbai,Maharashtra,M.C,12442373.0,11978450.0,878.0,23.0,32.0,MEDIUM,219.0,150.0,0.70044,10924403.0,159.0,4408916.0,163115
1,Delhi,Delhi,M.C,11007835.0,9879172.0,858.0,27.0,30.0,MEDIUM,215.0,196.0,0.920018,9444722.0,148.0,2379169.0,80188
2,Bangalore,Karnataka,MPUA,8436675.0,4301326.0,936.0,28.0,37.0,HIGH,212.0,102.0,0.097085,7896728.0,123.0,636502.0,141000
3,Hyderabad,Telangana,MPUA,6809970.0,3637483.0,930.0,23.0,31.0,MEDIUM,217.0,118.0,0.827744,6333272.0,110.0,126078.0,55123
4,Ahmedabad,Gujarat,MPUA,5570585.0,3520085.0,852.0,29.0,25.0,LOW,227.0,109.0,0.847941,4746138.0,73.0,284973.0,33204


In [6]:
train_city = train['City']
del train['City']

y = train['Covid Cases']
del train['Covid Cases']

In [7]:
train['Type'] = train.apply(lambda x: x['Type'].lower().replace('.','').replace(' ',''),axis=1)
train['Type'].unique()

array(['mc', 'mpua', 't', 'm', 'c-1t', 'mcl', 'mcorp', 'np', 'ua', 'mb',
       'npd', 'npp', 'cmc', 'ct', 'nagarparishad', 'nac', 'tc', 'tmc',
       'nt', 'cb', 'tp', 'na'], dtype=object)

In [8]:
def mi_table(df):
        zero_val = (df == 0.00).astype(int).sum(axis=0)
        mis_val = df.isnull().sum()
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        mz_table = pd.concat([zero_val, mis_val, mis_val_percent], axis=1)
        mz_table = mz_table.rename(
        columns = {0 : 'Zero Values', 1 : 'Missing Values', 2 : '% of Total Values'})
        mz_table['Total Zero Missing Values'] = mz_table['Zero Values'] + mz_table['Missing Values']
        mz_table['% Total Zero Missing Values'] = 100 * mz_table['Total Zero Missing Values'] / len(df)
        mz_table['Data Type'] = df.dtypes
        mz_table = mz_table[
            mz_table.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns and " + str(df.shape[0]) + " Rows.\n"      
            "There are " + str(mz_table.shape[0]) +
              " columns that have missing values.")
#         mz_table.to_excel('D:/sampledata/missing_and_zero_values.xlsx', freeze_panes=(1,0), index = False)
        return mz_table

mi_table(train)

Your selected dataframe has 14 columns and 787 Rows.
There are 12 columns that have missing values.


Unnamed: 0,Zero Values,Missing Values,% of Total Values,Total Zero Missing Values,% Total Zero Missing Values,Data Type
Popuation [2001],0,492,62.5,492,62.5,float64
Water Purity,0,158,20.1,158,20.1,float64
Female Population,0,141,17.9,141,17.9,float64
H Index,0,140,17.8,140,17.8,float64
Foreign Visitors,0,90,11.4,90,11.4,float64
Population [2011],0,48,6.1,48,6.1,float64
Toilets Avl,0,26,3.3,26,3.3,float64
Median Age,0,18,2.3,18,2.3,float64
Avg Temp,0,17,2.2,17,2.2,float64
# of hospitals,0,15,1.9,15,1.9,float64


In [9]:
train['SWM'].replace(np.nan,'Unknown', inplace=True)
missing_columns = ['Sex Ratio', '# of hospitals', 'Avg Temp','Median Age', 'Toilets Avl']
for i in missing_columns:
    train[i] = train[i].fillna((train[i].mean()))

#### Seperating Numerical and Object type Data

In [10]:
train_num = train.select_dtypes(include=["float64", "int64"]).copy()
train_obj = train.select_dtypes(include ="object").copy()

In [11]:
for i in train_obj.columns:
    train_obj[i]=train_obj[i].astype('category')

In [12]:
train_obj_onehot = train_obj.copy()
for i in train_obj.columns:
    train_obj_onehot = pd.get_dummies(train_obj_onehot, columns=[i], prefix_sep="_|_")

#### Feature Scaling and Imputation of missing value

In [13]:
scaler = MinMaxScaler()
train_num = pd.DataFrame(scaler.fit_transform(train_num), columns = train_num.columns)

In [14]:
pop = train_num['Popuation [2001]']
del train_num['Popuation [2001]']

In [15]:
pop = pop.fillna(-999.0)
pop =pd.DataFrame(pop)

In [16]:
imp = KNNImputer(n_neighbors=6)
train_num = pd.DataFrame(imp.fit_transform(train_num), columns = train_num.columns)

In [17]:
train_final = pd.concat([train_num,train_obj_onehot], axis=1, sort=False)

In [18]:
mi_table(train_final)

Your selected dataframe has 69 columns and 787 Rows.
There are 0 columns that have missing values.


Unnamed: 0,Zero Values,Missing Values,% of Total Values,Total Zero Missing Values,% Total Zero Missing Values,Data Type


#### Imputing Population[2001] feature by Regression model

In [19]:
Train_X = pd.DataFrame()
Test_X = pd.DataFrame()
Train_Y = pd.DataFrame()
Test_Y = pd.DataFrame()
for i in range(787):
    if(pop['Popuation [2001]'][i]==-999.0):
        Test_Y = Test_Y.append(pop.iloc[[i]])
        Test_X = Test_X.append(train_final.iloc[[i]])
    else:
        Train_Y = Train_Y.append(pop.iloc[[i]])
        Train_X= Train_X.append(train_final.iloc[[i]])
# Train_Y = pd.DataFrame(Train_Y)
# Train_Y.columns = ['Popuation [2001]']
X_train, X_test, Y_train, Y_test = train_test_split(Train_X, Train_Y, test_size =0.2, random_state=100)
rid = Ridge()
regp = rid.fit(X_train, Y_train)
Y_pred = regp.predict(X_test)
generate_model_report(Y_test, Y_pred)
regp = rid.fit(Train_X, Train_Y)
Y = regp.predict(Test_X)
Y = pd.DataFrame(Y)
# index = Test_X.index
Y.index = Test_X.index
Y.columns = ['Popuation [2001]']

r2 score =  0.9698889264455219
Max Error =  0.08120265379073421
Mean Absolute Error =  0.011133738283602083
Mean Squared Error =  0.0003625497069412506


In [20]:
Y_train.describe()

Unnamed: 0,Popuation [2001]
count,236.0
mean,0.041346
std,0.08346
min,0.0
25%,0.011817
50%,0.018816
75%,0.038821
max,1.0


In [21]:
X_train = X_train.append(X_test)
Y_train = Y_train.append(Y_test)
X_train = X_train.append(Test_X)
Y_train = Y_train.append(Y)
train_final = X_train
train_final['Popuation [2001]'] = Y_train

In [22]:
train_final = train_final.sort_index()
train_final

Unnamed: 0,Population [2011],Sex Ratio,Median Age,Avg Temp,Toilets Avl,Water Purity,H Index,Female Population,# of hospitals,Foreign Visitors,...,Type_|_t,Type_|_tc,Type_|_tmc,Type_|_tp,Type_|_ua,SWM_|_HIGH,SWM_|_LOW,SWM_|_MEDIUM,SWM_|_Unknown,Popuation [2001]
0,1.000000,0.267857,0.000000,0.771429,0.954802,0.50,0.699675,1.000000,1.000000,0.941119,...,0,0,0,0,0,0,0,1,0,1.000000
1,0.884364,0.178571,0.444444,0.714286,0.932203,0.96,0.919980,0.864168,0.926174,0.507775,...,0,0,0,0,0,0,0,1,0,0.824315
2,0.677106,0.526786,0.555556,0.914286,0.915254,0.02,0.094322,0.722066,0.758389,0.135721,...,0,0,0,0,0,1,0,0,0,0.357514
3,0.545979,0.500000,0.000000,0.742857,0.943503,0.18,0.827401,0.578544,0.671141,0.026747,...,0,0,0,0,0,0,0,1,0,0.301958
4,0.446074,0.151786,0.666667,0.571429,1.000000,0.09,0.847665,0.432848,0.422819,0.060670,...,0,0,0,0,0,0,1,0,0,0.292133
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
782,0.000014,0.125000,0.000000,0.657143,0.101695,0.03,0.032313,0.000032,0.060403,0.060670,...,0,0,0,0,0,0,0,1,0,0.105189
783,0.000014,0.566964,0.000000,0.885714,0.067797,0.36,0.053496,0.000367,0.080537,0.317833,...,0,0,0,0,0,0,0,1,0,0.113886
784,0.000004,0.897321,0.555556,0.657143,0.271186,0.38,0.063889,0.000607,0.114094,0.022493,...,0,0,0,0,0,1,0,0,0,0.118290
785,0.000002,0.366071,0.111111,0.028571,0.073446,0.45,0.626549,0.000203,0.033557,0.005107,...,0,0,0,0,0,0,0,1,0,0.113997


In [23]:
train_final.shape

(787, 70)

In [24]:
X_train, X_test, Y_train, Y_test = train_test_split(train_final, y, test_size =0.3, random_state=100)

In [25]:
rdm = RandomForestRegressor(random_state=10)
mod_r = rdm.fit(X_train, Y_train)
Y_pred = mod_r.predict(X_test)
generate_model_report(Y_test, Y_pred)

r2 score =  0.42163879034312624
Max Error =  41218.53
Mean Absolute Error =  1699.929789029536
Mean Squared Error =  23137616.660925742


In [26]:
mod_r = rdm.fit(train_final,y)

In [27]:
lr = LogisticRegression()
model = lr.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
generate_model_report(Y_test, Y_pred)

r2 score =  -7.910530421781447
Max Error =  131605
Mean Absolute Error =  4379.641350210971
Mean Squared Error =  356470028.9409283


In [28]:
las = Lasso()
model = las.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
generate_model_report(Y_test, Y_pred)

r2 score =  0.15133943340509126
Max Error =  42101.90613194872
Mean Absolute Error =  3476.9450692183964
Mean Squared Error =  33951071.64390666


In [29]:
ri = Ridge()
mod = ri.fit(X_train, Y_train)
Y_pred = mod.predict(X_test)
generate_model_report(Y_test, Y_pred)

r2 score =  0.30587101261269334
Max Error =  26610.07058616534
Mean Absolute Error =  3383.77098969499
Mean Squared Error =  27768961.948420294


#### Preprocessing of Test Data

In [30]:
test.head()

Unnamed: 0,City,State,Type,Population [2011],Popuation [2001],Sex Ratio,Median Age,Avg Temp,SWM,Toilets Avl,Water Purity,H Index,Female Population,# of hospitals,Foreign Visitors
0,Tuensang,Nagaland,T.C,36774.0,,931.0,23.0,10.0,MEDIUM,94.0,114.0,0.25339,34237.0,17.0,2769.0
1,Lakshmeshwar,Karnataka,T.M.C,36754.0,,934.0,25.0,38.0,HIGH,62.0,160.0,0.192555,34328.0,13.0,636502.0
2,Zira,Punjab,M.Cl.,36732.0,,883.0,29.0,35.0,HIGH,63.0,105.0,0.887882,32434.0,17.0,242367.0
3,Yawal,Maharashtra,M.Cl,36706.0,,887.0,26.0,31.0,HIGH,60.0,174.0,0.407838,32558.0,11.0,4408916.0
4,Thana Bhawan,Uttar Pradesh,N.P.,36669.0,,877.0,28.0,39.0,LOW,92.0,153.0,0.324456,32159.0,23.0,3104060.0


In [31]:
Test_city = test['City']
del test['City']

In [32]:
test['Type'] = train.apply(lambda x: x['Type'].lower().replace('.','').replace(' ',''),axis=1)
test['Type'].unique()

array(['mc', 'mpua', 't', 'm', 'c-1t', 'mcl', 'mcorp', 'np', 'ua', 'mb',
       'npd', 'npp', 'cmc', 'ct', 'nagarparishad', 'nac'], dtype=object)

#### Finding Missing Value

In [33]:
mi_table(test)

Your selected dataframe has 14 columns and 501 Rows.
There are 12 columns that have missing values.


Unnamed: 0,Zero Values,Missing Values,% of Total Values,Total Zero Missing Values,% Total Zero Missing Values,Data Type
Popuation [2001],0,501,100.0,501,100.0,float64
Toilets Avl,0,22,4.4,22,4.4,float64
Water Purity,0,19,3.8,19,3.8,float64
# of hospitals,0,17,3.4,17,3.4,float64
Foreign Visitors,0,17,3.4,17,3.4,float64
H Index,0,15,3.0,15,3.0,float64
Female Population,1,15,3.0,16,3.2,float64
Avg Temp,0,14,2.8,14,2.8,float64
Median Age,0,13,2.6,13,2.6,float64
SWM,0,9,1.8,9,1.8,object


In [34]:
test['SWM'].replace(np.nan,'Unknown', inplace=True)

In [35]:
test['Sex Ratio'] = test['Sex Ratio'].fillna((test['Sex Ratio'].mean()))
test['# of hospitals'] = test['# of hospitals'].fillna((test['# of hospitals'].mean()))
test['Avg Temp'] = test['Avg Temp'].fillna((test['Avg Temp'].mean()))
test['Median Age'] = test['Median Age'].fillna((test['Median Age'].mean()))
test['Toilets Avl'] = test['Toilets Avl'].fillna((test['Toilets Avl'].mean()))

#### Seperating Numerical and Object type Data

In [36]:
test_num = test.select_dtypes(include=["float64", "int64"]).copy()
test_obj = test.select_dtypes(include ="object").copy()

In [37]:
for i in test_obj.columns:
    test_obj[i]=test_obj[i].astype('category')

In [38]:
test_obj_onehot = test_obj.copy()
for i in train_obj.columns:
    test_obj_onehot = pd.get_dummies(test_obj_onehot, columns=[i], prefix_sep="_|_")

In [39]:
test_num.head()

Unnamed: 0,Population [2011],Popuation [2001],Sex Ratio,Median Age,Avg Temp,Toilets Avl,Water Purity,H Index,Female Population,# of hospitals,Foreign Visitors
0,36774.0,,931.0,23.0,10.0,94.0,114.0,0.25339,34237.0,17.0,2769.0
1,36754.0,,934.0,25.0,38.0,62.0,160.0,0.192555,34328.0,13.0,636502.0
2,36732.0,,883.0,29.0,35.0,63.0,105.0,0.887882,32434.0,17.0,242367.0
3,36706.0,,887.0,26.0,31.0,60.0,174.0,0.407838,32558.0,11.0,4408916.0
4,36669.0,,877.0,28.0,39.0,92.0,153.0,0.324456,32159.0,23.0,3104060.0


#### Feature Scaling and Imputation of Test Data

In [40]:
test_num = pd.DataFrame(scaler.transform(test_num), columns = test_num.columns)

In [41]:
del test_num['Popuation [2001]']

In [42]:
test_num = pd.DataFrame(imp.transform(test_num), columns = test_num.columns)
test_num

Unnamed: 0,Population [2011],Sex Ratio,Median Age,Avg Temp,Toilets Avl,Water Purity,H Index,Female Population,# of hospitals,Foreign Visitors
0,-1.612176e-07,0.504464,0.000000,0.142857,0.248588,0.14,0.251145,0.000305,0.046980,0.000421
1,-1.773393e-06,0.517857,0.222222,0.942857,0.067797,0.60,0.190108,0.000313,0.020134,0.135721
2,-3.546786e-06,0.290179,0.666667,0.857143,0.073446,0.05,0.887737,0.000140,0.046980,0.051574
3,-5.642614e-06,0.308036,0.333333,0.742857,0.056497,0.74,0.406104,0.000151,0.006711,0.941119
4,-8.625139e-06,0.263393,0.555556,0.971429,0.237288,0.53,0.322446,0.000114,0.087248,0.662537
...,...,...,...,...,...,...,...,...,...,...
496,-2.832189e-03,0.133929,0.444444,0.057143,-0.214689,0.81,0.314891,-0.002710,-0.040268,0.022435
497,-2.842185e-03,0.151786,0.555556,0.200000,-0.192090,0.98,0.334883,-0.002719,-0.026846,0.022435
498,-2.915136e-03,0.156250,0.111111,0.114286,-0.175141,0.89,0.722564,-0.002790,-0.026846,0.022435
499,-2.955601e-03,0.151786,0.444444,0.085714,-0.180791,0.70,0.419371,-0.002829,-0.013423,0.022435


In [43]:
test_final = pd.concat([test_num,test_obj_onehot], axis=1, sort=False)
test_final.shape

(501, 59)

In [44]:
for i in range(10):
    test_final["new"+ str(i)]=0

In [45]:
test_final.head()

Unnamed: 0,Population [2011],Sex Ratio,Median Age,Avg Temp,Toilets Avl,Water Purity,H Index,Female Population,# of hospitals,Foreign Visitors,...,new0,new1,new2,new3,new4,new5,new6,new7,new8,new9
0,-1.612176e-07,0.504464,0.0,0.142857,0.248588,0.14,0.251145,0.000305,0.04698,0.000421,...,0,0,0,0,0,0,0,0,0,0
1,-1.773393e-06,0.517857,0.222222,0.942857,0.067797,0.6,0.190108,0.000313,0.020134,0.135721,...,0,0,0,0,0,0,0,0,0,0
2,-3.546786e-06,0.290179,0.666667,0.857143,0.073446,0.05,0.887737,0.00014,0.04698,0.051574,...,0,0,0,0,0,0,0,0,0,0
3,-5.642614e-06,0.308036,0.333333,0.742857,0.056497,0.74,0.406104,0.000151,0.006711,0.941119,...,0,0,0,0,0,0,0,0,0,0
4,-8.625139e-06,0.263393,0.555556,0.971429,0.237288,0.53,0.322446,0.000114,0.087248,0.662537,...,0,0,0,0,0,0,0,0,0,0


#### Prediction of Population[2001] by Regression Model

In [46]:
test_p01 =Y = regp.predict(test_final)

In [47]:
test_p01 = pd.DataFrame(test_p01)
test_p01.columns = ['Popuation [2001]']
test_final['Popuation [2001]'] = test_p01
test_final.shape

(501, 70)

#### Final Prediction of Covid Cases on September 1

In [48]:
prediction =  mod_r.predict(test_final)
prediction = pd.DataFrame(prediction)
ans=pd.concat([Test_city,prediction],axis=1)
ans.columns=['City', 'Sept Cases']
ans.to_csv('september.csv', index=False)
ans.head(20)

Unnamed: 0,City,Sept Cases
0,Tuensang,2117.23
1,Lakshmeshwar,2324.22
2,Zira,2587.57
3,Yawal,2035.73
4,Thana Bhawan,4269.52
5,Ramdurg,2427.57
6,Pulgaon,1887.27
7,Sadasivpet,2478.63
8,Nargund,2160.85
9,Neem-Ka-Thana,1926.42


In [49]:
time_series.columns = ['City','April','May','June','July','August']
time_series = time_series.drop(time_series.index[0])
time_series.head()

Unnamed: 0,City,April,May,June,July,August
1,Tuensang,1614,1946,2372,2500,2769
2,Lakshmeshwar,369179,445559,541045,572860,636502
3,Zira,140581,169665,206030,218138,242367
4,Yawal,2557179,3086249,3747597,3968032,4408916
5,Thana Bhawan,1800363,2172850,2638469,2793662,3104060


In [50]:
time_series.shape

(501, 6)

#### Regression Analysis on Time Series Data

In [51]:
result=pd.DataFrame()
city=[]
ans=[]
for k in time_series.index:

    temp=pd.DataFrame()    
    li=[]
    date=[]
    for i in time_series.columns:
        if i!="City":
            s = "30 "+ i + ", 2020"
            date.append(datetime.strptime(s, '%d %B, %Y'))

            li.append(time_series[i][k])
    date.append(datetime.strptime('30 September, 2020', '%d %B, %Y'))
    li.append(-999)
    temp['feature']=date
    temp['Y']=li  
    Q = temp.iloc[[-1]]
    Q['feature']=Q['feature'].map(datetime.toordinal)
    temp = temp[:-1]

    temp['Y'] = temp['Y'].fillna((temp['Y'].mean()))
    
    if k==1:
        dx = temp
    X_train = pd.DataFrame()
    Y_train = pd.DataFrame()
    X_train['feature']=temp['feature'].map(datetime.toordinal)
    y_t = temp['Y']
    X_test = X_train.iloc[[-1]]
    X_train = X_train[:-1]
    Y_test = y_t.iloc[[-1]]
    Y_train = y_t[:-1]

    del Q['Y']

    las = LinearRegression()
    model = las.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    r = model.predict(X_test)

    Q_ans = model.predict(Q)
    city.append(time_series['City'][k])
    ans.append(Q_ans[0])
    
result['City']=city
result['Foreign_Visitors_Sept']=ans
result

Unnamed: 0,City,Foreign_Visitors_Sept
0,Tuensang,3.199424e+03
1,Lakshmeshwar,7.321842e+05
2,Zira,2.788088e+05
3,Yawal,5.071598e+06
4,Thana Bhawan,3.570620e+06
...,...,...
496,Nandaprayag,1.218105e+05
497,Kirtinagar,1.218105e+05
498,Kedarnath,1.218105e+05
499,Gangotri,1.218105e+05


In [52]:
dx

Unnamed: 0,feature,Y
0,2020-04-30,1614
1,2020-05-30,1946
2,2020-06-30,2372
3,2020-07-30,2500
4,2020-08-30,2769


#### Training Datasets

In [53]:
mf_X = pd.DataFrame()
mf_X['FV 30 Aug'] = train['Foreign Visitors']
mf_Y = y

In [54]:
imp1 = KNNImputer(n_neighbors=6)
mf_X = pd.DataFrame(imp1.fit_transform(mf_X), columns = mf_X.columns)

In [55]:
ss = MinMaxScaler()
mf_X = pd.DataFrame(ss.fit_transform(mf_X), columns = mf_X.columns)

In [56]:
mf_X

Unnamed: 0,FV 30 Aug
0,0.941119
1,0.507775
2,0.135721
3,0.026747
4,0.060670
...,...
782,0.060670
783,0.317833
784,0.022493
785,0.005107


#### Training Dataset

In [57]:
test_mf = pd.DataFrame()
test_mf['FV 30 Sep'] = result['Foreign_Visitors_Sept']
test_mf.shape

(501, 1)

In [58]:
test_mf

Unnamed: 0,FV 30 Sep
0,3.199424e+03
1,7.321842e+05
2,2.788088e+05
3,5.071598e+06
4,3.570620e+06
...,...
496,1.218105e+05
497,1.218105e+05
498,1.218105e+05
499,1.218105e+05


In [59]:
X_train, X_test, Y_train, Y_test = train_test_split(mf_X, mf_Y, test_size =0.3, random_state=10)

In [60]:
rf = RandomForestRegressor( random_state=10)
mr = rf.fit(X_train, Y_train)
Y_pred = mr.predict(X_test)
generate_model_report(Y_test, Y_pred)

r2 score =  0.038755492795142676
Max Error =  135801.20350721496
Mean Absolute Error =  5975.081542892845
Mean Squared Error =  217454138.8435911


In [61]:
mr = rf.fit(mf_X, mf_Y)

In [62]:
rdm = DecisionTreeRegressor()
model = rdm.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
generate_model_report(Y_test, Y_pred)

r2 score =  0.03582275514499389
Max Error =  136351.4358974359
Mean Absolute Error =  5967.3573333418735
Mean Squared Error =  218117587.04577821


In [63]:
las = Lasso()
model = las.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
generate_model_report(Y_test, Y_pred)

r2 score =  0.03857719630300205
Max Error =  139809.0409584776
Mean Absolute Error =  5952.743504716496
Mean Squared Error =  217494473.33691373


In [64]:
ri = Ridge()
l = ri.fit(X_train, Y_train)
Y_pred = l.predict(X_test)
generate_model_report(Y_test, Y_pred)

r2 score =  0.03835150412211119
Max Error =  139879.42472018144
Mean Absolute Error =  5948.108497897015
Mean Squared Error =  217545529.75229138


In [65]:
lr = LinearRegression()
model = lr.fit(X_train, Y_train)
Y_pred = model.predict(X_test)
generate_model_report(Y_test, Y_pred)

r2 score =  0.03859532915790764
Max Error =  139803.2075538695
Mean Absolute Error =  5953.127653978836
Mean Squared Error =  217490371.2959463


In [66]:
test_mf.head()

Unnamed: 0,FV 30 Sep
0,3199.424
1,732184.2
2,278808.8
3,5071598.0
4,3570620.0


In [67]:
test_mf = pd.DataFrame(imp1.transform(test_mf), columns = test_mf.columns)
test_mf = pd.DataFrame(ss.transform(test_mf), columns = test_mf.columns)

In [68]:
Oct = mr.predict(test_mf)
Oct = pd.DataFrame(Oct)
Oct.columns = ['Oct Cases']
Oct

Unnamed: 0,Oct Cases
0,2279.955667
1,7952.982505
2,4466.429249
3,10272.395999
4,5089.685910
...,...
496,4987.377225
497,4987.377225
498,4987.377225
499,4987.377225


In [69]:
sept_cases = pd.read_csv('september.csv')
sept_cases

Unnamed: 0,City,Sept Cases
0,Tuensang,2117.23
1,Lakshmeshwar,2324.22
2,Zira,2587.57
3,Yawal,2035.73
4,Thana Bhawan,4269.52
...,...,...
496,Nandaprayag,2486.42
497,Kirtinagar,3231.07
498,Kedarnath,2244.51
499,Gangotri,2403.85


In [70]:
res = sept_cases
res['Oct Cases'] = Oct['Oct Cases']
res

Unnamed: 0,City,Sept Cases,Oct Cases
0,Tuensang,2117.23,2279.955667
1,Lakshmeshwar,2324.22,7952.982505
2,Zira,2587.57,4466.429249
3,Yawal,2035.73,10272.395999
4,Thana Bhawan,4269.52,5089.685910
...,...,...,...
496,Nandaprayag,2486.42,4987.377225
497,Kirtinagar,3231.07,4987.377225
498,Kedarnath,2244.51,4987.377225
499,Gangotri,2403.85,4987.377225


In [71]:
res['new_oct'] = res.apply(lambda x: x['Oct Cases'] if (x['Oct Cases']-x['Sept Cases']>0) else x['Sept Cases']+abs(x['Oct Cases']-x['Sept Cases']),axis=1)


In [72]:
del res['Sept Cases']
del res['Oct Cases']

#### Final Prediction of Covid Cases on September 1

In [73]:
res.columns = ['City','Oct_Covid']
res.to_csv('october.csv', index=False)
res.head(20)

Unnamed: 0,City,Oct_Covid
0,Tuensang,2279.955667
1,Lakshmeshwar,7952.982505
2,Zira,4466.429249
3,Yawal,10272.395999
4,Thana Bhawan,5089.68591
5,Ramdurg,7952.982505
6,Pulgaon,15360.184526
7,Sadasivpet,4987.377225
8,Nargund,3083.293792
9,Neem-Ka-Thana,8213.07441
