In [1]:
import warnings
warnings.simplefilter('ignore')

import dill
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, Lasso, Ridge
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score, confusion_matrix

In [2]:
# Import the csv Data as Pnadas dataframe
df = pd.read_csv("data/weatherAUS.csv")
df['Date'] = df['Date'].astype('datetime64[ns]')
df.sample(5)

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
72426,2014-10-24,Mildura,14.9,35.3,0.0,8.0,9.1,WNW,33.0,W,...,60.0,19.0,1014.2,1013.1,8.0,6.0,19.5,32.5,No,No
33979,2010-04-05,SydneyAirport,15.1,20.9,3.4,4.4,2.5,SW,22.0,SW,...,79.0,65.0,1021.4,1019.1,5.0,6.0,18.9,20.5,Yes,Yes
128299,2013-04-09,Walpole,18.2,23.7,0.0,,,ESE,35.0,ESE,...,85.0,74.0,1018.6,1017.3,,,20.6,21.1,No,No
144147,2013-11-21,Uluru,18.3,29.3,1.0,,,S,41.0,S,...,84.0,37.0,1011.4,1007.5,,,19.8,27.5,No,No
27037,2017-02-16,Penrith,18.1,38.3,0.0,,,ENE,39.0,NNE,...,69.0,27.0,,,,,23.2,37.0,No,No


In [3]:
# Prepare the X and Y variables
x = df.drop(columns=['Date','RainToday', 'RainTomorrow'])
y=df['RainToday']
print('x shape : ', x.shape)
print('y shape : ',y.shape)

x shape :  (145460, 20)
y shape :  (145460,)


In [4]:
# Create group of the Numerical Features and Categorical Feature 
num_features = x.select_dtypes(exclude="O").columns
cat_features = x.select_dtypes(include="O").columns

In [5]:
print('*'*50)
print('Numeric Features : ', num_features)
print('*'*50)
print('Categorical Features : ', cat_features)
print('*'*50)

**************************************************
Numeric Features :  Index(['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
       'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am',
       'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm',
       'Temp9am', 'Temp3pm'],
      dtype='object')
**************************************************
Categorical Features :  Index(['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm'], dtype='object')
**************************************************


In [6]:
from sklearn.impute import SimpleImputer  # handling missing values
from sklearn.preprocessing import StandardScaler  # handling featuer scaling
from sklearn.preprocessing import OneHotEncoder, LabelEncoder   # ordinal encoding
from sklearn.pipeline import Pipeline    # pipeline
from sklearn.compose import ColumnTransformer
le = LabelEncoder()
y = le.fit_transform(y)

In [7]:
for i in cat_features:
    print(f"Categories in '{i} variable : ", end=" ")
    print(df[i].unique())
    print("======================================================")

Categories in 'Location variable :  ['Albury' 'BadgerysCreek' 'Cobar' 'CoffsHarbour' 'Moree' 'Newcastle'
 'NorahHead' 'NorfolkIsland' 'Penrith' 'Richmond' 'Sydney' 'SydneyAirport'
 'WaggaWagga' 'Williamtown' 'Wollongong' 'Canberra' 'Tuggeranong'
 'MountGinini' 'Ballarat' 'Bendigo' 'Sale' 'MelbourneAirport' 'Melbourne'
 'Mildura' 'Nhil' 'Portland' 'Watsonia' 'Dartmoor' 'Brisbane' 'Cairns'
 'GoldCoast' 'Townsville' 'Adelaide' 'MountGambier' 'Nuriootpa' 'Woomera'
 'Albany' 'Witchcliffe' 'PearceRAAF' 'PerthAirport' 'Perth' 'SalmonGums'
 'Walpole' 'Hobart' 'Launceston' 'AliceSprings' 'Darwin' 'Katherine'
 'Uluru']
Categories in 'WindGustDir variable :  ['W' 'WNW' 'WSW' 'NE' 'NNW' 'N' 'NNE' 'SW' nan 'ENE' 'SSE' 'S' 'NW' 'SE'
 'ESE' 'E' 'SSW']
Categories in 'WindDir9am variable :  ['W' 'NNW' 'SE' 'ENE' 'SW' 'SSE' 'S' 'NE' nan 'SSW' 'N' 'WSW' 'ESE' 'E'
 'NW' 'WNW' 'NNE']
Categories in 'WindDir3pm variable :  ['WNW' 'WSW' 'E' 'NW' 'W' 'SSE' 'ESE' 'ENE' 'NNW' 'SSW' 'SW' 'SE' 'N' 'S'
 'NNE' nan '

In [8]:
# Numerical Pipeline

num_pipeline = Pipeline(
    steps=[
        ('SimpleImputer',SimpleImputer(strategy='median')),
        ('StandardScaler',StandardScaler())
    ]
)

# Categorical Pipeline

cat_pipeline = Pipeline(
    steps=[
        ('SimpleImputer',SimpleImputer(strategy='most_frequent')),
        ('oneHot',OneHotEncoder(sparse=False, drop='first', dtype=np.int16))
    ]
)

# combine num_pipeline and cat_pipeline

preprocessor = ColumnTransformer([
    ('num_pipeline',num_pipeline,num_features),
    ('categorical_pipeline',cat_pipeline,cat_features)
])

In [9]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((116368, 20), (29092, 20), (116368,), (29092,))

In [10]:
preprocessor.fit_transform(x_train)

array([[ 0.36199429, -0.76395034,  1.40548049, ...,  0.        ,
         0.        ,  0.        ],
       [ 1.94823195,  2.29793019, -0.27526853, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.39340494,  0.47773393, -0.25159601, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.72167303, -0.21365845, -0.27526853, ...,  0.        ,
         0.        ,  0.        ],
       [-0.53320915, -1.1872518 , -0.27526853, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.56616349,  2.45314073, -0.27526853, ...,  0.        ,
         0.        ,  1.        ]])

In [11]:
x_train = pd.DataFrame(preprocessor.fit_transform(x_train),columns=preprocessor.get_feature_names_out())
x_test = pd.DataFrame(preprocessor.transform(x_test),columns=preprocessor.get_feature_names_out())

In [12]:
from imblearn.combine import SMOTEENN
sme = SMOTEENN(random_state=42)
x_res, y_res = sme.fit_resample(x_train, y_train)

In [13]:
# Create the Evaluate Model
def evaluate_model(true, predicted):
    print("accuracy_score")
    accuracy_score1 = accuracy_score(true, predicted)
    print("classification_report")
    classification_report1 = classification_report(true, predicted)
    print("confusion_matrix")
    confusion_matrix1 = confusion_matrix(true, predicted)
    return accuracy_score1, classification_report1, confusion_matrix1

In [14]:
models = {
    "LogisticRegression":LogisticRegression(n_jobs=-1),
    "Lasso":Lasso(),
    "Ridge":Ridge(),
    "KNeighborsClassifier":KNeighborsClassifier(n_jobs=-1),
    "DecisionTree": DecisionTreeClassifier(),
    "Random Forest Classifier":RandomForestClassifier(),
    "XGBClassifier": XGBClassifier(n_jobs=-1),
    "CatBoostClassifier":CatBoostClassifier(verbose=False),
    "SVC": SVC(),
    "AdaBoostClassifier": AdaBoostClassifier(),
    "GaussianNB": GaussianNB(),
}

In [15]:
model_list = []
acuracy_score_list = []
class_report = []
matrics = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(x_res, y_res)

    y_pred = model.predict(x_test)

    model_accuracy, model_classification_report, model_confusion_matrix = evaluate_model(y_test, y_pred.round())
    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("Model Performance for Testing set")
    print("Model accuarcy : {}".format(model_accuracy))
    print("="*50)
    print("classification_report :")
    print(model_classification_report)
    print("="*50)
    print("Confusion Matrix")
    print(model_confusion_matrix)

    acuracy_score_list.append(model_accuracy)
    class_report.append(model_classification_report)
    matrics.append(model_confusion_matrix)
    print("*"*50)
    print()

accuracy_score
classification_report
confusion_matrix
LogisticRegression
Model Performance for Testing set
Model accuarcy : 0.8043104633576241
classification_report :
              precision    recall  f1-score   support

           0       0.99      0.75      0.85     22039
           1       0.95      1.00      0.98      6396
           2       0.10      0.88      0.18       657

    accuracy                           0.80     29092
   macro avg       0.68      0.87      0.67     29092
weighted avg       0.97      0.80      0.86     29092

Confusion Matrix
[[16444   303  5292]
 [   20  6376     0]
 [   78     0   579]]
**************************************************

accuracy_score
classification_report
confusion_matrix
Lasso
Model Performance for Testing set
Model accuarcy : 0.21985425546542003
classification_report :
              precision    recall  f1-score   support

           0       0.00      0.00      0.00     22039
           1       0.22      1.00      0.36      6396
 