In [1]:
import pandas as pd  # pandas, referenced through its conventional alias `pd`

# Load the AQI dataset; the first row of the CSV supplies the column names
# (this is read_csv's default behaviour, so no explicit header argument is needed).
data = pd.read_csv(r'aqi_dataset.csv')
data.head()  # preview the first rows

Unnamed: 0,Date,Maximum Wind Speed,Maximum Temperature,Minimum Temperature,Average Temperature,Average Wind Speed,AQI
0,25-07-2023,21.7,21.24,15.87,20.87,20.67,27.0
1,08-03-2023,19.36,17.95,14.28,16.97,18.57,63.0
2,18-03-2023,24.84,23.59,20.56,21.25,22.78,76.0
3,17-07-2023,7.69,19.24,16.82,18.69,5.79,75.0
4,06-05-2023,21.81,32.87,27.8,30.01,20.67,39.0


In [2]:
data.info()  # per-column summary: non-null count and dtype, plus index range and memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11 entries, 0 to 10
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Date                 11 non-null     object 
 1   Maximum Wind Speed   11 non-null     float64
 2   Maximum Temperature  11 non-null     float64
 3   Minimum Temperature  11 non-null     float64
 4   Average Temperature  11 non-null     float64
 5   Average Wind Speed   11 non-null     float64
 6   AQI                  11 non-null     float64
dtypes: float64(6), object(1)
memory usage: 744.0+ bytes


In [3]:
# Drop the Date column, which is not used as a predictor.
# errors='ignore' keeps this cell re-runnable: `data` is rebound in place, so a
# second execution would otherwise raise KeyError for the already-removed column.
data = data.drop(columns='Date', errors='ignore')

In [4]:
data.head()  # preview the frame again to confirm the Date column is gone

Unnamed: 0,Maximum Wind Speed,Maximum Temperature,Minimum Temperature,Average Temperature,Average Wind Speed,AQI
0,21.7,21.24,15.87,20.87,20.67,27.0
1,19.36,17.95,14.28,16.97,18.57,63.0
2,24.84,23.59,20.56,21.25,22.78,76.0
3,7.69,19.24,16.82,18.69,5.79,75.0
4,21.81,32.87,27.8,30.01,20.67,39.0


In [5]:
data.shape  # (number of rows, number of columns)

(11, 6)

In [6]:
data.isna().sum()  # count of missing values per column; every count is zero for this dataset

Maximum Wind Speed     0
Maximum Temperature    0
Minimum Temperature    0
Average Temperature    0
Average Wind Speed     0
AQI                    0
dtype: int64

In [7]:
data.dtypes  # dtype of each remaining column

Maximum Wind Speed     float64
Maximum Temperature    float64
Minimum Temperature    float64
Average Temperature    float64
Average Wind Speed     float64
AQI                    float64
dtype: object

In [8]:
# Features are every column except the last; the target is the last column (AQI).
X, Y = data.iloc[:, :-1], data.iloc[:, -1]

In [9]:
X  # display the feature columns

Unnamed: 0,Maximum Wind Speed,Maximum Temperature,Minimum Temperature,Average Temperature,Average Wind Speed
0,21.7,21.24,15.87,20.87,20.67
1,19.36,17.95,14.28,16.97,18.57
2,24.84,23.59,20.56,21.25,22.78
3,7.69,19.24,16.82,18.69,5.79
4,21.81,32.87,27.8,30.01,20.67
5,21.72,21.44,15.27,20.82,20.47
6,19.56,17.25,14.48,16.27,18.37
7,24.14,23.39,20.86,21.75,22.38
8,7.59,19.14,16.02,18.39,5.19
9,21.11,32.67,27.18,30.81,20.17


In [10]:
Y  # display the target (AQI) values

0      27.0
1      63.0
2      76.0
3      75.0
4      39.0
5      37.0
6      53.0
7      96.0
8      85.0
9      49.0
10    107.4
Name: AQI, dtype: float64

In [11]:
from sklearn.model_selection import train_test_split  # splits arrays into train and test subsets

# Hold out 20% of the rows for testing. random_state=10 seeds the shuffle so the
# same split is produced on every run -- it is a seed, not a number of rows.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=10,
)

In [12]:
# Report the dimensions of each split, one per line.
print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape, sep='\n')

(8, 5)
(3, 5)
(8,)
(3,)


In [13]:
from sklearn.ensemble import RandomForestRegressor

# Ensemble of 100 decision trees; the fixed seed makes training repeatable.
model_RandomForest = RandomForestRegressor(
    random_state=10,
    n_estimators=100,
)
model_RandomForest.fit(X_train, Y_train)  # fit on the training split only

RandomForestRegressor(random_state=10)

In [14]:
# score() returns the coefficient of determination (R^2). Measured on the rows
# the forest was trained on it is optimistic, so also report R^2 on the
# held-out split, which is the figure that reflects generalization.
print('Test R^2:', model_RandomForest.score(X_test, Y_test))
model_RandomForest.score(X_train, Y_train)  # training R^2 (shown as the cell output)

0.895830916289411

In [15]:
# Predict AQI for the held-out test rows and print the raw predictions.
Y_pred = model_RandomForest.predict(X_test)  # one predicted AQI value per row of X_test
print(Y_pred)

[79.958 61.558 68.914]


In [16]:
import pickle

# Persist the trained forest to disk. The context manager guarantees the file
# handle is flushed and closed, even if serialization raises.
with open('randomForestRegressor.pkl', 'wb') as model_file:
    pickle.dump(model_RandomForest, model_file)

In [17]:
# Reload the persisted model; the context manager closes the file handle promptly.
# NOTE: pickle.load can execute arbitrary code, so only unpickle files that were
# produced by this notebook or come from a trusted source.
with open('randomForestRegressor.pkl', 'rb') as model_file:
    model = pickle.load(model_file)

In [18]:
model  # display the reloaded estimator

RandomForestRegressor(random_state=10)

In [19]:
# Predict the AQI for a single observation. The values follow the training
# column order: Maximum Wind Speed, Maximum Temperature, Minimum Temperature,
# Average Temperature, Average Wind Speed.
# The model was fitted on a DataFrame, so pass a DataFrame with the same column
# names; a bare nested list relies on positional order alone and makes recent
# scikit-learn versions warn about missing feature names.
sample = pd.DataFrame([[19, 17, 14, 16, 18]], columns=X.columns)
print(model.predict(sample))

[65.47]


