In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score
import joblib

# Data collecting

In [2]:
df = pd.read_csv("weather-data-18-24.csv")

In [3]:
df.head()

Unnamed: 0,day,month,year,humidity,wspd,pressure,dew_point,city,tmin,tmax,tavg
0,1,1,2018,56,13.2,1014.3,2.9,Islamabad,6.8,16.2,11.4
1,2,1,2018,58,10.1,1015.1,4.6,Islamabad,9.3,16.2,12.7
2,3,1,2018,62,9.4,1015.7,4.6,Islamabad,7.6,15.5,11.8
3,4,1,2018,72,10.8,1015.8,5.3,Islamabad,6.8,14.1,10.2
4,5,1,2018,77,8.0,1016.9,6.2,Islamabad,7.2,14.1,10.2


In [4]:
df.tail()

Unnamed: 0,day,month,year,humidity,wspd,pressure,dew_point,city,tmin,tmax,tavg
7666,27,12,2024,46,9.7,1025.5,-8.2,Quetta,-1.1,8.9,2.8
7667,28,12,2024,42,5.1,1024.7,-9.2,Quetta,-1.9,10.0,3.3
7668,29,12,2024,33,9.7,1021.6,-11.0,Quetta,1.1,14.8,6.0
7669,30,12,2024,34,6.8,1020.9,-8.5,Quetta,1.8,14.2,7.2
7670,31,12,2024,41,4.8,1020.2,-3.8,Quetta,4.2,16.5,9.3


In [5]:
print('rows: ', df.shape[0])
print('columns: ', df.shape[1])

rows:  7671
columns:  11


In [6]:
df.isnull().sum()

day          0
month        0
year         0
humidity     0
wspd         0
pressure     0
dew_point    0
city         0
tmin         0
tmax         0
tavg         0
dtype: int64

In [7]:
df = df.fillna(df.mean)

In [8]:
df.dtypes

day            int64
month          int64
year           int64
humidity       int64
wspd         float64
pressure     float64
dew_point    float64
city          object
tmin         float64
tmax         float64
tavg         float64
dtype: object

In [9]:
np.sum(pd.isnull(df))

  return reduction(axis=axis, out=out, **passkwargs)


day          0
month        0
year         0
humidity     0
wspd         0
pressure     0
dew_point    0
city         0
tmin         0
tmax         0
tavg         0
dtype: int64

In [10]:
df["city"].unique()

array(['Islamabad', 'Lahore', 'Quetta'], dtype=object)

# Data Preprocessing

In [11]:
df["city"] = df["city"].fillna(df["city"].mode()[0])

In [12]:
df = pd.get_dummies(df, columns=["city"])

In [13]:
df

Unnamed: 0,day,month,year,humidity,wspd,pressure,dew_point,tmin,tmax,tavg,city_Islamabad,city_Lahore,city_Quetta
0,1,1,2018,56,13.2,1014.3,2.9,6.8,16.2,11.4,True,False,False
1,2,1,2018,58,10.1,1015.1,4.6,9.3,16.2,12.7,True,False,False
2,3,1,2018,62,9.4,1015.7,4.6,7.6,15.5,11.8,True,False,False
3,4,1,2018,72,10.8,1015.8,5.3,6.8,14.1,10.2,True,False,False
4,5,1,2018,77,8.0,1016.9,6.2,7.2,14.1,10.2,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7666,27,12,2024,46,9.7,1025.5,-8.2,-1.1,8.9,2.8,False,False,True
7667,28,12,2024,42,5.1,1024.7,-9.2,-1.9,10.0,3.3,False,False,True
7668,29,12,2024,33,9.7,1021.6,-11.0,1.1,14.8,6.0,False,False,True
7669,30,12,2024,34,6.8,1020.9,-8.5,1.8,14.2,7.2,False,False,True


In [14]:
df.dtypes

day                 int64
month               int64
year                int64
humidity            int64
wspd              float64
pressure          float64
dew_point         float64
tmin              float64
tmax              float64
tavg              float64
city_Islamabad       bool
city_Lahore          bool
city_Quetta          bool
dtype: object

In [15]:
x = df.iloc[:, 0:-1]
print(x.shape)
y = df.iloc[:,-1]
print(y.shape)

(7671, 12)
(7671,)


# Data splitting

In [16]:
features= ['day',	'month','year',	'humidity',	'wspd',	'pressure' ,'dew_point','city_Islamabad','city_Lahore',	'city_Quetta']
targets = ['tmin',	'tmax','tavg']
y = df[['tmin', 'tmax', 'tavg']]
X = df[features]
X_train, X_test, y_train, y_test = train_test_split( X,y, test_size=0.2, random_state=42)


# Model Training

In [17]:
models = {
    "RandomForest": MultiOutputRegressor(RandomForestRegressor()),
    "GradientBoosting": MultiOutputRegressor(GradientBoostingRegressor()),
    "SVR": MultiOutputRegressor(SVR())
}


best_score = 999
best_model = None
scores = {}

for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_test)

    score = r2_score(y_test, preds)
    scores[name] = score

    if score > best_score:
        best_score = score
        best_model = model

print("Model Scores:", scores)
print("Best Model:", max(scores, key=scores.get))

print("Model Scores:", scores)
print("Best Model:", max(scores, key=scores.get))

Model Scores: {'RandomForest': 0.9789913732518873, 'GradientBoosting': 0.9663418780369569, 'SVR': 0.04062122667631348}
Best Model: RandomForest
Model Scores: {'RandomForest': 0.9789913732518873, 'GradientBoosting': 0.9663418780369569, 'SVR': 0.04062122667631348}
Best Model: RandomForest


# Saving the best Model 

In [18]:

joblib.dump(best_model, "temperature_predictor_v.pkl")
print("Best model saved as best_weather_model.pkl")

Best model saved as best_weather_model.pkl
