In [89]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay

In [90]:
# Dataset source: https://www.kaggle.com/datasets/jsphyg/weather-dataset-rattle-package
# To download the latest version with kagglehub, uncomment the lines below:
# import kagglehub
# path = kagglehub.dataset_download("jsphyg/weather-dataset-rattle-package")
# print("Path to dataset files:", path)

In [91]:
# Read weatherAUS.csv from the working directory into `df`, then print the
# first five rows as plain text for a quick look at the columns.
df = pd.read_csv(filepath_or_buffer='weatherAUS.csv')
print(df.head(n=5))


         Date Location  MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  \
0  2008-12-01   Albury     13.4     22.9       0.6          NaN       NaN   
1  2008-12-02   Albury      7.4     25.1       0.0          NaN       NaN   
2  2008-12-03   Albury     12.9     25.7       0.0          NaN       NaN   
3  2008-12-04   Albury      9.2     28.0       0.0          NaN       NaN   
4  2008-12-05   Albury     17.5     32.3       1.0          NaN       NaN   

  WindGustDir  WindGustSpeed WindDir9am  ... Humidity9am  Humidity3pm  \
0           W           44.0          W  ...        71.0         22.0   
1         WNW           44.0        NNW  ...        44.0         25.0   
2         WSW           46.0          W  ...        38.0         30.0   
3          NE           24.0         SE  ...        45.0         16.0   
4           W           41.0        ENE  ...        82.0         33.0   

   Pressure9am  Pressure3pm  Cloud9am  Cloud3pm  Temp9am  Temp3pm  RainToday  \
0       1007.7    

In [92]:
# Number of non-missing entries in each column (equivalent to df.count())
df.notna().sum()

Date             145460
Location         145460
MinTemp          143975
MaxTemp          144199
Rainfall         142199
Evaporation       82670
Sunshine          75625
WindGustDir      135134
WindGustSpeed    135197
WindDir9am       134894
WindDir3pm       141232
WindSpeed9am     143693
WindSpeed3pm     142398
Humidity9am      142806
Humidity3pm      140953
Pressure9am      130395
Pressure3pm      130432
Cloud9am          89572
Cloud3pm          86102
Temp9am          143693
Temp3pm          141851
RainToday        142199
RainTomorrow     142193
dtype: int64

In [None]:
# Evaporation and Sunshine are missing in more than 40% of the rows
# (82,670 and 75,625 non-null values out of 145,460), so drop both columns,
# then drop every row that still has a missing value in any remaining column.
#
# `df` is rebound from a chained expression instead of being mutated with
# inplace=True, and errors='ignore' tolerates the columns already being gone,
# so re-running this cell no longer raises KeyError.
#
# NOTE(review): Cloud9am and Cloud3pm are also roughly 40% missing, so this
# dropna() discards about half of the rows (145,460 -> 71,045 in the recorded
# output) - confirm that trade-off is intended.
df = df.drop(columns=['Evaporation', 'Sunshine'], errors='ignore').dropna()

# Confirm that no missing values remain in any column
print('\nMissing values in the dataset:')
print(df.isnull().sum())

# Report how many rows and columns are left after cleaning
print('\nShape of dataset: ', df.shape)


Missing values in the dataset:
Date             0
Location         0
MinTemp          0
MaxTemp          0
Rainfall         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
RainToday        0
RainTomorrow     0
dtype: int64

Shape of dataset:  (71045, 21)


In [94]:
# Print the dtype pandas assigned to every column, under a heading line
print('\nData types:', df.dtypes, sep='\n')



Data types:
Date              object
Location          object
MinTemp          float64
MaxTemp          float64
Rainfall         float64
WindGustDir       object
WindGustSpeed    float64
WindDir9am        object
WindDir3pm        object
WindSpeed9am     float64
WindSpeed3pm     float64
Humidity9am      float64
Humidity3pm      float64
Pressure9am      float64
Pressure3pm      float64
Cloud9am         float64
Cloud3pm         float64
Temp9am          float64
Temp3pm          float64
RainToday         object
RainTomorrow      object
dtype: object


In [None]:
# List the distinct labels present in the RainTomorrow column
print(
    '\nUnique values in the target variable:',
    df['RainTomorrow'].unique(),
    sep='\n',
)

# Show each label's share of the rows (normalize=True yields proportions, not counts)
print(
    '\nDistribution of the target variable:',
    df['RainTomorrow'].value_counts(normalize=True),
    sep='\n',
)


Unique values in the target variable:
['No' 'Yes']

Distribution of the target variable:
RainTomorrow
No     0.76422
Yes    0.23578
Name: proportion, dtype: float64


In [96]:
# Column to predict
target = 'RainTomorrow'

# Predictor columns used by the model; RainToday is the only non-numeric one
features = [
    'MinTemp',
    'MaxTemp',
    'Humidity3pm',
    'WindSpeed9am',
    'Rainfall',
    'RainToday',
]

# Encode the Yes/No label as 1/0 so the task is a binary classification
y = df[target].map({'No': 0, 'Yes': 1})

# Feature matrix restricted to the selected predictor columns
X = df[features]

In [97]:
# One-hot encode the non-numeric feature columns; drop_first=True drops the
# first level of each encoded column so only k-1 indicator columns are kept.
X = pd.get_dummies(data=X, drop_first=True)

In [98]:
# Absolute number of rows per encoded class (0 = 'No', 1 = 'Yes'),
# most frequent class first, to judge how imbalanced the target is.
y.value_counts(sort=True)

RainTomorrow
0    54294
1    16751
Name: count, dtype: int64

In [99]:
# Hold out 20% of the rows for testing. The target is imbalanced (about 76%
# class 0 vs 24% class 1), so stratify on y to keep the same class ratio in
# the training and test sets. random_state fixes the shuffle so the split is
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
