In [5]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,f1_score, precision_score, recall_score


Loading data

In [7]:
# Read the raw observations and discard rows without a 'RainTomorrow' label,
# since unlabeled rows cannot be used for training or evaluation.
raw_df = pd.read_csv('weatherAUS.csv').dropna(subset=['RainTomorrow'])

Splitting data into train and test sets based on year

In [9]:
# Chronological hold-out: rows dated up to and including 2015 are used for
# training, rows dated after 2015 form the test set.
year = pd.to_datetime(raw_df['Date']).dt.year
train_mask = year <= 2015
test_mask = year > 2015
train_df = raw_df[train_mask]
test_df = raw_df[test_mask]

In [10]:
# Summarise the training split: row count, per-column non-null counts and dtypes.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 116219 entries, 0 to 144917
Data columns (total 23 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   Date           116219 non-null  object 
 1   Location       116219 non-null  object 
 2   MinTemp        115742 non-null  float64
 3   MaxTemp        115990 non-null  float64
 4   Rainfall       115077 non-null  float64
 5   Evaporation    70646 non-null   float64
 6   Sunshine       66358 non-null   float64
 7   WindGustDir    108349 non-null  object 
 8   WindGustSpeed  108409 non-null  float64
 9   WindDir9am     107782 non-null  object 
 10  WindDir3pm     113669 non-null  object 
 11  WindSpeed9am   115004 non-null  float64
 12  WindSpeed3pm   114709 non-null  float64
 13  Humidity9am    114733 non-null  float64
 14  Humidity3pm    114365 non-null  float64
 15  Pressure9am    104885 non-null  float64
 16  Pressure3pm    104923 non-null  float64
 17  Cloud9am       73862 non-null   fl

In [11]:
# Summarise the test split: row count, per-column non-null counts and dtypes.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25974 entries, 2498 to 145458
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Date           25974 non-null  object 
 1   Location       25974 non-null  object 
 2   MinTemp        25814 non-null  float64
 3   MaxTemp        25881 non-null  float64
 4   Rainfall       25710 non-null  float64
 5   Evaporation    10704 non-null  float64
 6   Sunshine       8019 non-null   float64
 7   WindGustDir    24514 non-null  object 
 8   WindGustSpeed  24514 non-null  float64
 9   WindDir9am     24398 non-null  object 
 10  WindDir3pm     24746 non-null  object 
 11  WindSpeed9am   25841 non-null  float64
 12  WindSpeed3pm   24854 non-null  float64
 13  Humidity9am    25686 non-null  float64
 14  Humidity3pm    24218 non-null  float64
 15  Pressure9am    23294 non-null  float64
 16  Pressure3pm    23289 non-null  float64
 17  Cloud9am       14674 non-null  float64
 18  Cloud3p

Defining input and target columns

In [13]:
# Features are every column except the first ('Date') and the last one;
# the label to predict is 'RainTomorrow'.
input_cols = train_df.columns[1:-1].tolist()
target_col = 'RainTomorrow'

# Copy the slices so the in-place preprocessing applied to the inputs later
# does not write back into train_df / test_df.
train_inputs = train_df[input_cols].copy()
train_targets = train_df[target_col].copy()
test_inputs = test_df[input_cols].copy()
test_targets = test_df[target_col].copy()

Identifying numeric and categorical columns

In [15]:
# Partition the feature columns by dtype. Numeric columns are later imputed
# with the mean and min-max scaled; object (string) columns are later imputed
# with the most frequent value and one-hot encoded.
numeric_cols = train_inputs.select_dtypes(include=np.number).columns.tolist()
categorical_cols = train_inputs.select_dtypes(include='object').columns.tolist()

In [16]:
# Display the names of the numeric feature columns.
numeric_cols

['MinTemp',
 'MaxTemp',
 'Rainfall',
 'Evaporation',
 'Sunshine',
 'WindGustSpeed',
 'WindSpeed9am',
 'WindSpeed3pm',
 'Humidity9am',
 'Humidity3pm',
 'Pressure9am',
 'Pressure3pm',
 'Cloud9am',
 'Cloud3pm',
 'Temp9am',
 'Temp3pm']

In [17]:
# Display the names of the categorical (object-dtype) feature columns.
categorical_cols

['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'RainToday']

Imputing missing numerical values

In [19]:
# Learn the per-column means from the training split only, then use those
# training means to fill missing numeric values in both splits.
imputer = SimpleImputer(strategy='mean')
imputer.fit(train_inputs[numeric_cols])
for split_inputs in (train_inputs, test_inputs):
    split_inputs[numeric_cols] = imputer.transform(split_inputs[numeric_cols])

In [20]:
# Learn the most frequent category of each column from the training split
# only, then fill missing categorical values in both splits with it.
categorical_imputer = SimpleImputer(strategy='most_frequent')
categorical_imputer.fit(train_inputs[categorical_cols])
for split_inputs in (train_inputs, test_inputs):
    split_inputs[categorical_cols] = categorical_imputer.transform(split_inputs[categorical_cols])


In [21]:
# Verify that imputation left no missing values in any training feature column.
null_counts = train_inputs.isnull().sum()
print(null_counts)

Location         0
MinTemp          0
MaxTemp          0
Rainfall         0
Evaporation      0
Sunshine         0
WindGustDir      0
WindGustSpeed    0
WindDir9am       0
WindDir3pm       0
WindSpeed9am     0
WindSpeed3pm     0
Humidity9am      0
Humidity3pm      0
Pressure9am      0
Pressure3pm      0
Cloud9am         0
Cloud3pm         0
Temp9am          0
Temp3pm          0
RainToday        0
dtype: int64


Scaling numeric features

In [23]:
# Learn each numeric column's min/max from the training split only, then
# rescale both splits with those training ranges.
scaler = MinMaxScaler()
scaler.fit(train_inputs[numeric_cols])
for split_inputs in (train_inputs, test_inputs):
    split_inputs[numeric_cols] = scaler.transform(split_inputs[numeric_cols])


In [24]:
# Inspect the training features after imputation and scaling; the categorical
# columns still hold raw strings at this point.
train_inputs

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,WindDir3pm,...,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday
0,Albury,0.516509,0.517241,0.001617,0.065137,0.525492,W,0.294574,W,WNW,...,0.275862,0.71,0.22,0.449587,0.463696,0.888889,0.491962,0.497845,0.525391,No
1,Albury,0.375000,0.559387,0.000000,0.065137,0.525492,WNW,0.294574,NNW,WSW,...,0.252874,0.44,0.25,0.497521,0.475248,0.481556,0.491962,0.504310,0.574219,No
2,Albury,0.504717,0.570881,0.000000,0.065137,0.525492,WSW,0.310078,W,WSW,...,0.298851,0.38,0.30,0.447934,0.490099,0.481556,0.222222,0.586207,0.552734,No
3,Albury,0.417453,0.614943,0.000000,0.065137,0.525492,NE,0.139535,SE,E,...,0.103448,0.45,0.16,0.613223,0.557756,0.481556,0.491962,0.523707,0.617188,No
4,Albury,0.613208,0.697318,0.002695,0.065137,0.525492,W,0.271318,ENE,NW,...,0.229885,0.82,0.33,0.500826,0.445545,0.777778,0.888889,0.517241,0.679688,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144913,Uluru,0.683962,0.743295,0.000000,0.065137,0.525492,E,0.356589,ESE,E,...,0.229885,0.23,0.12,0.540496,0.513201,0.481556,0.491962,0.657328,0.744141,No
144914,Uluru,0.625000,0.775862,0.000000,0.065137,0.525492,ESE,0.372093,E,ESE,...,0.356322,0.17,0.07,0.565289,0.526403,0.481556,0.491962,0.709052,0.783203,No
144915,Uluru,0.613208,0.789272,0.000000,0.065137,0.525492,E,0.387597,E,SE,...,0.252874,0.12,0.07,0.530579,0.470297,0.481556,0.491962,0.739224,0.777344,No
144916,Uluru,0.672170,0.823755,0.000000,0.065137,0.525492,E,0.410853,E,SSE,...,0.195402,0.12,0.12,0.441322,0.389439,0.481556,0.111111,0.808190,0.849609,No


In [25]:
# One-hot encode the categorical columns. The categories are learned from the
# training split; handle_unknown='ignore' lets transform accept categories
# that were not present when the encoder was fitted.
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoder.fit(train_inputs[categorical_cols])
train_encoded, test_encoded = (
    encoder.transform(split_inputs[categorical_cols])
    for split_inputs in (train_inputs, test_inputs)
)


In [26]:
# (rows, one-hot columns) of the encoded training categoricals.
train_encoded.shape

(116219, 99)

In [27]:
# (rows, one-hot columns) of the encoded test categoricals; the column count
# should match the training array because the same fitted encoder is used.
test_encoded.shape

(25974, 99)

In [28]:
# Wrap the encoded arrays in DataFrames labelled with the generated one-hot
# column names. Reusing each split's original index keeps the rows aligned
# with the numeric features when the frames are concatenated.
encoded_col_names = encoder.get_feature_names_out(categorical_cols)
train_encoded_df, test_encoded_df = (
    pd.DataFrame(encoded, columns=encoded_col_names, index=split_inputs.index)
    for encoded, split_inputs in (
        (train_encoded, train_inputs),
        (test_encoded, test_inputs),
    )
)

In [29]:
# Display the generated one-hot column names.
encoded_col_names

array(['Location_Adelaide', 'Location_Albany', 'Location_Albury',
       'Location_AliceSprings', 'Location_BadgerysCreek',
       'Location_Ballarat', 'Location_Bendigo', 'Location_Brisbane',
       'Location_Cairns', 'Location_Canberra', 'Location_Cobar',
       'Location_CoffsHarbour', 'Location_Dartmoor', 'Location_Darwin',
       'Location_GoldCoast', 'Location_Hobart', 'Location_Katherine',
       'Location_Launceston', 'Location_Melbourne',
       'Location_MelbourneAirport', 'Location_Mildura', 'Location_Moree',
       'Location_MountGambier', 'Location_MountGinini',
       'Location_Newcastle', 'Location_Nhil', 'Location_NorahHead',
       'Location_NorfolkIsland', 'Location_Nuriootpa',
       'Location_PearceRAAF', 'Location_Penrith', 'Location_Perth',
       'Location_PerthAirport', 'Location_Portland', 'Location_Richmond',
       'Location_Sale', 'Location_SalmonGums', 'Location_Sydney',
       'Location_SydneyAirport', 'Location_Townsville',
       'Location_Tuggeranong', 

In [30]:
# Inspect the one-hot encoded training categoricals.
train_encoded_df

Unnamed: 0,Location_Adelaide,Location_Albany,Location_Albury,Location_AliceSprings,Location_BadgerysCreek,Location_Ballarat,Location_Bendigo,Location_Brisbane,Location_Cairns,Location_Canberra,...,WindDir3pm_S,WindDir3pm_SE,WindDir3pm_SSE,WindDir3pm_SSW,WindDir3pm_SW,WindDir3pm_W,WindDir3pm_WNW,WindDir3pm_WSW,RainToday_No,RainToday_Yes
0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
144914,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
144915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
144916,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [31]:
# Final model inputs: the scaled numeric columns followed by the one-hot
# columns. The raw categorical string columns are dropped at this point.
train_inputs, test_inputs = (
    pd.concat([split_inputs[numeric_cols], encoded_df], axis=1)
    for split_inputs, encoded_df in (
        (train_inputs, train_encoded_df),
        (test_inputs, test_encoded_df),
    )
)


In [32]:
# Sanity check before training. The per-column null counts of the assembled
# feature matrix were previously computed and then discarded; assert on them
# so that a preprocessing or index-alignment problem (which would surface as
# NaNs after the concat) stops the notebook here instead of going unnoticed.
null_counts = train_inputs.isnull().sum()
assert null_counts.sum() == 0, "train_inputs still contains missing values"

# Number of missing labels in the training targets (expected to be 0).
null = train_targets.isnull().sum()
print(null)

0


In [33]:
# Baseline classifier: logistic regression with scikit-learn's default
# settings, fitted on the preprocessed training features.
model = LogisticRegression()
model.fit(train_inputs, train_targets)

In [34]:
# Accuracy on the data the model was fitted on; serves as a reference point
# for the held-out test accuracy.
train_preds = model.predict(train_inputs)
train_accuracy = accuracy_score(train_targets, train_preds)
print("Training Accuracy: ",train_accuracy)

Training Accuracy:  0.8495770915254821


In [35]:
# Accuracy on the held-out (post-2015) test split.
test_preds = model.predict(test_inputs)
test_accuracy = accuracy_score(test_targets, test_preds)
print("Test Accuracy:",test_accuracy)

Test Accuracy: 0.8403788403788404


In [36]:
# F1 score on the test split, treating 'Yes' (rain tomorrow) as the positive class.
test_f1 = f1_score(test_targets, test_preds, pos_label='Yes')
test_f1

0.5822249093107618

In [37]:
# precision_score and recall_score are already imported in the first cell;
# classification_report is the only new name brought in here.
from sklearn.metrics import precision_score, recall_score, classification_report

# Per-class precision, recall and F1 of the logistic regression on the test split.
print(classification_report(test_targets, test_preds))

              precision    recall  f1-score   support

          No       0.86      0.95      0.90     20028
         Yes       0.73      0.49      0.58      5946

    accuracy                           0.84     25974
   macro avg       0.79      0.72      0.74     25974
weighted avg       0.83      0.84      0.83     25974



For class 'No': Precision is 0.86 (meaning 86% of the predictions for 'No' are correct).
For class 'Yes': Precision is 0.73 (meaning 73% of the predictions for 'Yes' are correct).
For class 'No': Recall is 0.95 (meaning 95% of the actual 'No' values were correctly predicted).
For class 'Yes': Recall is 0.49 (meaning 49% of the actual 'Yes' values were correctly predicted)
For class 'No': The F1 score is 0.90, which is good.
For class 'Yes': The F1 score is 0.58, which is much lower. This is most likely due to class imbalance.

In [39]:
from sklearn.ensemble import RandomForestClassifier

In [40]:
# Random forest alternative to the logistic-regression baseline.
# class_weight='balanced' weights the classes inversely to their frequency to
# counter the 'No' / 'Yes' imbalance. The forest is a randomized estimator, so
# random_state is fixed to make the reported metrics reproducible across runs.
rf_model = RandomForestClassifier(class_weight='balanced', random_state=42)

rf_model.fit(train_inputs, train_targets)
rf_preds = rf_model.predict(test_inputs)

In [41]:
# Test-split metrics for the random forest. rf_accuracy and rf_f1 are stored
# but not displayed by this cell; only the classification report is printed.
rf_accuracy = accuracy_score(test_targets, rf_preds)
rf_f1 = f1_score(test_targets, rf_preds, pos_label='Yes')

print(classification_report(test_targets, rf_preds))

              precision    recall  f1-score   support

          No       0.86      0.95      0.90     20028
         Yes       0.76      0.47      0.58      5946

    accuracy                           0.84     25974
   macro avg       0.81      0.71      0.74     25974
weighted avg       0.83      0.84      0.83     25974



In [None]:
def predict_input(single_input):
    """Predict 'RainTomorrow' for a single raw observation.

    The observation is pushed through the same fitted preprocessing objects
    that were applied to the training data (numeric imputer, scaler,
    categorical imputer, one-hot encoder) and then scored with ``model``.

    Parameters
    ----------
    single_input : dict
        Maps raw column names to values. It must contain every name in
        ``numeric_cols`` and ``categorical_cols``; other keys (for example
        'Date') are ignored.

    Returns
    -------
    tuple
        ``(pred, prob)`` where ``pred`` is the predicted class label and
        ``prob`` is the probability the model assigns to that label.
    """
    input_df = pd.DataFrame([single_input])

    # Replay the training-time preprocessing in the same order it was fitted.
    input_df[numeric_cols] = imputer.transform(input_df[numeric_cols])
    input_df[numeric_cols] = scaler.transform(input_df[numeric_cols])
    input_df[categorical_cols] = categorical_imputer.transform(input_df[categorical_cols])

    # Build the one-hot block as its own frame; the column names come from the
    # fitted encoder (encoded_col_names) so they match what the model was
    # trained on.
    encoded_df = pd.DataFrame(
        encoder.transform(input_df[categorical_cols]),
        columns=list(encoded_col_names),
        index=input_df.index,
    )
    X_input = pd.concat([input_df[numeric_cols], encoded_df], axis=1)

    pred = model.predict(X_input)[0]
    # predict_proba columns follow model.classes_, so pick the column that
    # belongs to the predicted label.
    class_probs = model.predict_proba(X_input)[0]
    prob = class_probs[list(model.classes_).index(pred)]
    return pred, prob

In [None]:
# Example raw observation using the dataset's column names. 'Sunshine' is
# missing (NaN). Presumably meant to be passed to predict_input; no such call
# appears in the visible cells.
new_input = {'Date': '2021-06-19',
             'Location': 'Launceston',
             'MinTemp': 23.2,
             'MaxTemp': 33.2,
             'Rainfall': 10.2,
             'Evaporation': 4.2,
             'Sunshine': np.nan,
             'WindGustDir': 'NNW',
             'WindGustSpeed': 52.0,
             'WindDir9am': 'NW',
             'WindDir3pm': 'NNE',
             'WindSpeed9am': 13.0,
             'WindSpeed3pm': 20.0,
             'Humidity9am': 89.0,
             'Humidity3pm': 58.0,
             'Pressure9am': 1004.8,
             'Pressure3pm': 1001.5,
             'Cloud9am': 8.0,
             'Cloud3pm': 5.0,
             'Temp9am': 25.7,
             'Temp3pm': 33.0,
             'RainToday': 'Yes'}

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix

cross_val_score_rf = cross_val_score(best_rf, X_train_resampled, y_train_resampled, cv=5, scoring='f1_weighted')
print("Cross-validation F1 Score for Random Forest:", cross_val_score_rf)

In [None]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X_train_resampled, y_train_resampled = smote.fit_resample(train_inputs, train_targets)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'class_weight': ['balanced', None],
    'max_features': ['auto', 'sqrt', 'log2']
}


rf = RandomForestClassifier()
grid_search_rf = GridSearchCV(rf, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)
grid_search_rf.fit(train_inputs, train_targets)

best_params_rf = grid_search_rf.best_params_
print("Best Hyperparameters for Random Forest:", best_params_rf)
best_rf = grid_search_rf.best_estimator_
best_rf.fit(X_train, y_train)
