In [197]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [198]:
# Read Ahmedabad_final.csv into a DataFrame. index_col=False tells pandas not
# to use the first CSV column as the index.
df = pd.read_csv(
    filepath_or_buffer='Ahmedabad_final.csv',
    index_col=False,
)

In [199]:
# Discard the 'Unnamed: 0' column; raises KeyError if the column is absent.
df = df.drop(labels=['Unnamed: 0'], axis='columns')

In [200]:
# Replace the index with a fresh 0..n-1 range, discarding the old index values.
df = df.reset_index(drop=True)

In [201]:
df.head()

Unnamed: 0,T,TM,Tm,H,PP,VV,V,VM,PM2.5
0,19.2,29.4,11.0,47,0.0,4.3,5.9,13.0,94.52
1,23.8,32.0,14.0,41,0.0,4.3,4.4,11.1,135.99
2,22.1,31.0,14.0,57,0.0,3.9,3.3,14.8,178.33
3,22.2,30.0,15.0,57,0.0,4.0,5.6,13.0,139.7
4,22.4,32.0,15.0,44,0.0,4.3,4.3,11.1,88.7


In [202]:
# Convert every column to a numeric dtype; values that cannot be parsed
# become NaN (errors='coerce') instead of raising.
df = df.apply(lambda column: pd.to_numeric(column, errors='coerce'))

In [203]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 725 entries, 0 to 724
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   T       725 non-null    float64
 1   TM      725 non-null    float64
 2   Tm      725 non-null    float64
 3   H       725 non-null    int64  
 4   PP      725 non-null    float64
 5   VV      725 non-null    float64
 6   V       725 non-null    float64
 7   VM      725 non-null    float64
 8   PM2.5   725 non-null    float64
dtypes: float64(8), int64(1)
memory usage: 51.1 KB


In [204]:
# Positional split: every column except the last is a feature, and the last
# column is the regression target.
x, y = df.iloc[:, :-1], df.iloc[:, -1]

In [205]:
from sklearn.linear_model import LinearRegression

In [206]:
# Ordinary least-squares model fitted on the full feature matrix and target.
# The fit call stays last so the cell still displays the fitted estimator.
linear_reg = LinearRegression()
linear_reg.fit(X=x, y=y.values)

LinearRegression()

In [207]:
y_pred = linear_reg.predict(x)

In [208]:
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [209]:
error = np.sqrt(mean_squared_error(y,y_pred))

In [210]:
error

30.746473603080155

In [211]:
from sklearn.tree import DecisionTreeRegressor

In [212]:
# Decision tree with a fixed seed, fitted on the full feature matrix and target
# (fit() returns the estimator itself, so construction and fitting can be chained).
dec_tree_reg = DecisionTreeRegressor(random_state=0).fit(x, y.values)
# Show the hyper-parameters the fitted tree is using.
dec_tree_reg.get_params()

{'ccp_alpha': 0.0,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'random_state': 0,
 'splitter': 'best'}

In [213]:
y_pred = dec_tree_reg.predict(x)

In [214]:
error = np.sqrt(mean_squared_error(y,y_pred))

In [215]:
error

6.978797939856464

In [122]:
dec_tree_reg

DecisionTreeRegressor(random_state=0)

In [216]:
# `pickle` is imported in this cell because the notebook's only other
# `import pickle` sits in a cell further down; without it, running the cells
# top to bottom on a fresh kernel raises NameError here.
import pickle

# Bundle the fitted decision tree together with the eight feature columns of
# `df` and write the bundle to ahmedabad.pkl.
data = {
    "model": dec_tree_reg,
    "T": df['T'],
    'TM': df['TM'],
    'Tm': df["Tm"],
    'H': df["H"],
    'PP': df["PP"],
    'VV': df["VV"],
    'V': df["V"],
    'VM': df["VM"],
}
with open('ahmedabad.pkl', 'wb') as file:
    pickle.dump(data, file)

In [217]:
# Read ahmedabad.pkl back into `data`.
# Unpickling executes code embedded in the file, so only load files you trust.
with open('ahmedabad.pkl', 'rb') as file:
    data = pickle.loads(file.read())

In [218]:
regressor_loaded = data['model']

In [219]:
regressor_loaded.predict(x)

array([ 94.52     , 135.99     , 178.33     , 139.7      ,  88.7      ,
        74.28     , 113.93     , 103.36     , 177.33     , 113.25     ,
       107.99     , 140.29     , 239.96     ,  52.99     ,  61.12     ,
        60.78     ,  91.83     , 105.45     ,  71.56     , 119.8      ,
        97.34     ,  83.92     ,  49.51     , 105.06     ,  94.1      ,
       111.89     , 165.07     , 166.85     , 147.38     , 133.3      ,
        96.04     ,  77.69     ,  53.07     , 138.63     , 144.36     ,
        85.03     , 111.86     ,  58.37     ,  95.37     ,  77.1653125,
        77.1653125,  77.1653125,  77.1653125,  77.1653125,  80.9      ,
       103.29     , 107.5      ,  77.1653125,  96.65     , 126.77     ,
       106.15     ,  51.03     ,  38.73     ,  61.12     , 100.92     ,
        97.48     , 129.27     ,  53.24     ,  42.       ,  50.1      ,
        70.96     ,  53.66     ,  45.49     ,  57.59     ,  77.22     ,
        45.29     , 103.78     ,  80.69     ,  44.75     ,  38.1

In [222]:
# One hand-written observation to sanity-check the reloaded model.
# It lives in its own variable: rebinding `x` here would replace the training
# matrix that later cells pass to fit()/predict() together with the full-length
# `y`, and those cells would then fail on mismatched sample counts.
# Reusing x.columns keeps the feature names identical to the ones the model
# was fitted with.
x_new = pd.DataFrame(
    [[99, 29.4343, 11.0, 47, 0.0, 4.3232, 5.923, 13.0]],
    columns=x.columns,
)
regressor_loaded.predict(x_new)

array([129.27])

In [123]:
from sklearn.ensemble import RandomForestRegressor

In [124]:
# Random forest with a fixed seed, fitted on the full feature matrix and target.
# The fit call stays last so the cell still displays the fitted estimator.
random_forest_reg = RandomForestRegressor(random_state=0)
random_forest_reg.fit(X=x, y=y.values)

RandomForestRegressor(random_state=0)

In [125]:
y_pred = random_forest_reg.predict(x)

In [126]:
error = np.sqrt(mean_squared_error(y,y_pred))

In [127]:
error

12.751625852984448

In [128]:
from sklearn.model_selection import GridSearchCV

In [129]:
# Candidate tree depths: unlimited (None) plus the even depths 2 through 12.
max_depth = [None, *range(2, 13, 2)]
parameters = dict(max_depth=max_depth)

# Cross-validated search over max_depth, scored by negative mean squared error.
regressor = DecisionTreeRegressor(random_state=0)
gs = GridSearchCV(
    estimator=regressor,
    param_grid=parameters,
    scoring='neg_mean_squared_error',
)
gs.fit(x, y.values)

GridSearchCV(estimator=DecisionTreeRegressor(random_state=0),
             param_grid={'max_depth': [None, 2, 4, 6, 8, 10, 12]},
             scoring='neg_mean_squared_error')

In [130]:
regressor = gs.best_estimator_

In [131]:
regressor.fit(x,y.values)

DecisionTreeRegressor(max_depth=2, random_state=0)

In [132]:
y_pred = regressor.predict(x)

In [133]:
error = np.sqrt(mean_squared_error(y,y_pred))
error

31.326441761739005

In [134]:
regressor

DecisionTreeRegressor(max_depth=2, random_state=0)

In [135]:
df.columns

Index(['T', 'TM', 'Tm', 'H', 'PP', 'VV', 'V', 'VM', 'PM2.5'], dtype='object')

In [136]:
import pickle

In [137]:
from sklearn.preprocessing import LabelEncoder

In [138]:
T = LabelEncoder()

In [139]:
# Learn the distinct T values, then overwrite the column with their integer codes.
# NOTE(review): scikit-learn documents LabelEncoder for target labels rather than
# input features; confirm that encoding a numeric feature this way is intended.
df['T'] = T.fit(df['T']).transform(df['T'])

In [140]:
df.head()

Unnamed: 0,T,TM,Tm,H,PP,VV,V,VM,PM2.5
0,25,29.4,11.0,47,0.0,4.3,5.9,13.0,94.52
1,61,32.0,14.0,41,0.0,4.3,4.4,11.1,135.99
2,48,31.0,14.0,57,0.0,3.9,3.3,14.8,178.33
3,49,30.0,15.0,57,0.0,4.0,5.6,13.0,139.7
4,50,32.0,15.0,44,0.0,4.3,4.3,11.1,88.7


In [141]:
df['T'].unique()

array([ 25,  61,  48,  49,  50,  39,  55,  69,  76,  73,  78,  80,  53,
        31,  28,  42,  91,  63,  72,  81,  95, 116, 120, 146, 155, 157,
       117, 126, 131, 130, 149,  92, 113, 121,   0, 142, 161, 172, 170,
       171, 162, 186, 147, 180, 178, 166, 176, 174, 167, 164, 154, 188,
       152, 141, 139, 145, 119, 132, 136, 127, 128, 140, 151, 133, 122,
       114, 102,  77,  68, 104, 111, 106, 101, 112, 125, 124, 129, 134,
       137,  88, 105, 135, 123,  70,  97, 100,  89,  74, 144, 110, 115,
       108,  94,  90,  87,  93,  86,  99, 107, 109,  98,  96,  71,  85,
        83,  62,  57,  67,  59,  46,  40,  56,  60,  44,  33,  36,  38,
        47,  43,  32,  37,  34,  30,  27,  22,  51,  23,  29,  58,  65,
        52,  41,  54, 143, 138, 150, 159, 165, 158, 173, 183, 182, 177,
       185, 179, 189, 190, 187, 175, 148,  84, 118, 103,  64,  18,  15,
        13,  11,  17,  21,   9,   5,  12,  19,  26,   4,  14,   3,  20,
         2,  79, 163, 156, 153, 160, 169, 181, 184,  75,  82,  3

In [142]:
df.sample()

Unnamed: 0,T,TM,Tm,H,PP,VV,V,VM,PM2.5
180,106,32.5,24.6,73,0.0,4.8,11.3,18.3,35.69


In [143]:
# Learn the distinct TM values, then overwrite the column with their integer codes.
# NOTE(review): scikit-learn documents LabelEncoder for target labels rather than
# input features; confirm that encoding a numeric feature this way is intended.
TM = LabelEncoder().fit(df['TM'])
df['TM'] = TM.transform(df['TM'])

In [144]:
# Learn the distinct Tm values, then overwrite the column with their integer codes.
# NOTE(review): scikit-learn documents LabelEncoder for target labels rather than
# input features; confirm that encoding a numeric feature this way is intended.
Tm = LabelEncoder().fit(df['Tm'])
df['Tm'] = Tm.transform(df['Tm'])

In [145]:
# Learn the distinct H values, then overwrite the column with their integer codes.
# NOTE(review): scikit-learn documents LabelEncoder for target labels rather than
# input features; confirm that encoding a numeric feature this way is intended.
H = LabelEncoder().fit(df['H'])
df['H'] = H.transform(df['H'])

In [146]:
# Learn the distinct PP values, then overwrite the column with their integer codes.
# NOTE(review): scikit-learn documents LabelEncoder for target labels rather than
# input features; confirm that encoding a numeric feature this way is intended.
PP = LabelEncoder().fit(df['PP'])
df['PP'] = PP.transform(df['PP'])

In [147]:
# Learn the distinct VV values, then overwrite the column with their integer codes.
# NOTE(review): scikit-learn documents LabelEncoder for target labels rather than
# input features; confirm that encoding a numeric feature this way is intended.
VV = LabelEncoder().fit(df['VV'])
df['VV'] = VV.transform(df['VV'])

In [148]:
# Learn the distinct V values, then overwrite the column with their integer codes.
# NOTE(review): scikit-learn documents LabelEncoder for target labels rather than
# input features; confirm that encoding a numeric feature this way is intended.
V = LabelEncoder().fit(df['V'])
df['V'] = V.transform(df['V'])

In [149]:
# Learn the distinct VM values, then overwrite the column with their integer codes.
# NOTE(review): scikit-learn documents LabelEncoder for target labels rather than
# input features; confirm that encoding a numeric feature this way is intended.
VM = LabelEncoder().fit(df['VM'])
df['VM'] = VM.transform(df['VM'])

In [150]:
# Bundle the decision tree with the eight per-column encoders and write the
# bundle to ahmedabad.pkl, overwriting any existing file of that name.
data = dict(
    model=dec_tree_reg,
    T=T,
    TM=TM,
    Tm=Tm,
    H=H,
    PP=PP,
    VV=VV,
    V=V,
    VM=VM,
)
with open('ahmedabad.pkl', mode='wb') as file:
    pickle.dump(data, file)

In [151]:
# Read ahmedabad.pkl back into `data`.
# Unpickling executes code embedded in the file, so only load files you trust.
with open('ahmedabad.pkl', 'rb') as file:
    data = pickle.loads(file.read())

In [152]:
df.sample()

Unnamed: 0,T,TM,Tm,H,PP,VV,V,VM,PM2.5
663,45,32,27,23,0,14,14,6,51.32


In [153]:
# Pull the model and the eight per-column entries out of the loaded bundle,
# rebinding the notebook-level names to the unpickled objects.
regressor_loaded = data['model']
T, TM, Tm, H, PP, VV, V, VM = (
    data[key] for key in ('T', 'TM', 'Tm', 'H', 'PP', 'VV', 'V', 'VM')
)

In [154]:
y_pred = regressor_loaded.predict(x)

In [155]:
y_pred

array([ 94.52     , 135.99     , 178.33     , 139.7      ,  88.7      ,
        74.28     , 113.93     , 103.36     , 177.33     , 113.25     ,
       107.99     , 140.29     , 239.96     ,  52.99     ,  61.12     ,
        60.78     ,  91.83     , 105.45     ,  71.56     , 119.8      ,
        97.34     ,  83.92     ,  49.51     , 105.06     ,  94.1      ,
       111.89     , 165.07     , 166.85     , 147.38     , 133.3      ,
        96.04     ,  77.69     ,  53.07     , 138.63     , 144.36     ,
        85.03     , 111.86     ,  58.37     ,  95.37     ,  77.1653125,
        77.1653125,  77.1653125,  77.1653125,  77.1653125,  80.9      ,
       103.29     , 107.5      ,  77.1653125,  96.65     , 126.77     ,
       106.15     ,  51.03     ,  38.73     ,  61.12     , 100.92     ,
        97.48     , 129.27     ,  53.24     ,  42.       ,  50.1      ,
        70.96     ,  53.66     ,  45.49     ,  57.59     ,  77.22     ,
        45.29     , 103.78     ,  80.69     ,  44.75     ,  38.1

In [156]:
error = np.sqrt(mean_squared_error(y,y_pred))

In [157]:
error

6.978797939856464

In [158]:
df.head()

Unnamed: 0,T,TM,Tm,H,PP,VV,V,VM,PM2.5
0,25,29,14,32,0,15,28,7,94.52
1,61,43,36,26,0,15,20,6,135.99
2,48,39,36,42,0,12,14,9,178.33
3,49,32,44,42,0,13,26,7,139.7
4,50,43,44,29,0,15,19,6,88.7


In [159]:
df.tail()

Unnamed: 0,T,TM,Tm,H,PP,VV,V,VM,PM2.5
720,131,67,152,54,0,16,70,12,24.28
721,137,75,153,56,6,16,56,10,32.56
722,150,75,162,48,0,12,50,11,36.48
723,149,75,164,49,0,15,62,13,62.12
724,145,71,158,53,0,14,69,13,37.63


In [160]:
df.shape

(725, 9)

In [194]:
# A single hand-written observation as a 1x8 float array.
# Presumably ordered like the feature columns (T, TM, Tm, H, PP, VV, V, VM);
# verify against df.columns.
x = np.array(
    [
        [19.232, 29.4343, 11.0, 47, 0.0, 4.3232, 5.923, 13.0],
    ]
)

In [195]:
x.dtype

dtype('float64')

In [196]:
# Replace each column of `x` with the code assigned by the encoder at the same
# position. LabelEncoder.transform only accepts values it saw during fit, so a
# value that was not present when the encoder was fitted raises ValueError.
# NOTE(review): the pickled model appears to have been fitted on the raw,
# un-encoded feature values; confirm whether this encoding step is needed
# before calling predict.
encoders = (T, TM, Tm, H, PP, VV, V, VM)
for col, enc in enumerate(encoders):
    x[:, col] = enc.transform(x[:, col])
x = x.astype(float)

ValueError: y contains previously unseen labels: [19.232]

In [193]:
x

array([[25., 29., 14., 32.,  0., 15., 28.,  7.]])