**import Libraries**

In [22]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import resample
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pickle

**Data Collection and Processing**

In [39]:
data = pd.read_csv("weather_data.csv")
# remove extra spaces in each columnabs
data.columns.str.strip()
print("Data Info:")
data.info()

Data Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 311 entries, 0 to 310
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   date             311 non-null    object 
 1   avg_temperature  296 non-null    float64
 2   humidity         296 non-null    float64
 3   avg_wind_speed   296 non-null    float64
 4   rain_or_not      311 non-null    object 
 5   cloud_cover      296 non-null    float64
 6   pressure         311 non-null    float64
dtypes: float64(5), object(2)
memory usage: 17.1+ KB


In [40]:
# drop the unnessasary column date
data = data.drop(columns=['date'])
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 311 entries, 0 to 310
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   avg_temperature  296 non-null    float64
 1   humidity         296 non-null    float64
 2   avg_wind_speed   296 non-null    float64
 3   rain_or_not      311 non-null    object 
 4   cloud_cover      296 non-null    float64
 5   pressure         311 non-null    float64
dtypes: float64(5), object(1)
memory usage: 14.7+ KB


Unnamed: 0,avg_temperature,humidity,avg_wind_speed,rain_or_not,cloud_cover,pressure
0,23.745401,46.140905,7.845981,Rain,20.851051,992.965681
1,30.030503,59.876587,5.382457,Rain,93.059521,1037.273025
2,28.365224,51.464618,13.158008,Rain,11.63664,1034.193357
3,27.550929,53.103799,5.886677,Rain,81.744971,968.610142
4,23.639303,57.826186,12.248992,Rain,38.062329,1030.264331


In [42]:
# check for number of missing values
data.isnull().sum()

avg_temperature    15
humidity           15
avg_wind_speed     15
rain_or_not         0
cloud_cover        15
pressure            0
dtype: int64

In [43]:
def handle_missing_values(data):
    for col in ['avg_temperature', 'humidity', 'avg_wind_speed', 'cloud_cover', 'pressure']:
        data[col] = pd.to_numeric(data[col], errors='coerce')
        if data[col].isnull().sum() > 0:
            data[col].fillna(data[col].median(), inplace=True)
    return data

handle_missing_values(data)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data[col].fillna(data[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are settin

Unnamed: 0,avg_temperature,humidity,avg_wind_speed,rain_or_not,cloud_cover,pressure
0,23.745401,46.140905,7.845981,Rain,20.851051,992.965681
1,30.030503,59.876587,5.382457,Rain,93.059521,1037.273025
2,28.365224,51.464618,13.158008,Rain,11.636640,1034.193357
3,27.550929,53.103799,5.886677,Rain,81.744971,968.610142
4,23.639303,57.826186,12.248992,Rain,38.062329,1030.264331
...,...,...,...,...,...,...
306,22.072834,37.756910,4.729355,No Rain,27.307081,954.564875
307,19.645885,42.190604,8.092369,No Rain,51.807877,964.002410
308,23.884496,36.205508,11.860847,Rain,29.872557,977.681431
309,18.168418,34.785000,4.781288,No Rain,94.067925,1047.153269


In [48]:
# map Rain to 1 and No Rain to 0
data['rain_or_not'] = data['rain_or_not'].map({"Rain" : 1, "No Rain" : 0})
data['rain_or_not'].unique()

array([1, 0])

**Exploratory Data Analysis (EDA)**

In [50]:
data.shape

(311, 6)

In [51]:
sns.set(style="whitegrid")
