# 1. Data Collection

## 1.1 Load Necessary Libraries

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score ,classification_report, confusion_matrix, roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

## 1.2 Load the Data from CSV

In [None]:
# Load the dataset
data = pd.read_csv('weatherAUS.csv')

# Check the columns
print(data.columns.tolist())  # This will confirm the names

# Inspect the data
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line
data.info()
# include='all' summarizes the non-numeric columns as well as the numeric ones;
# the result has to be printed because only a cell's last expression is displayed
print(data.describe(include='all'))
print(data.isnull().sum())


# 2. Data Cleaning & Preparation

## 2.1 Handling Missing Values & Feature Engineering

In [None]:
# Drop rows with missing target: an unlabeled row can neither train nor score the model
data.dropna(subset=['RainTomorrow'], inplace=True)

# Convert date column and extract features (calendar month and day of week, Monday=0)
data['Date'] = pd.to_datetime(data['Date'])
data['month'] = data['Date'].dt.month
data['day'] = data['Date'].dt.dayofweek

# Define features and target
X = data.drop(['RainTomorrow', 'Date'], axis=1)  # Features
y = data['RainTomorrow']

# Remember which columns are numeric *before* encoding, so only the measurements
# (plus month/day) are standardized and the 0/1 indicator columns are left alone
numeric_columns = X.select_dtypes(include=['number']).columns

# Choose the train/test rows up front: every statistic below (fill values, scaler
# mean/variance) is then learned from training rows only, so nothing about the
# test rows leaks into preprocessing
train_idx, test_idx = train_test_split(X.index.to_numpy(), test_size=0.2, random_state=42)
train_rows = X.loc[train_idx]

# Fill missing values: training mean for numerical columns, training mode for the rest
fill_values = {}
for column in X.columns:
    if X[column].isnull().any():
        if pd.api.types.is_numeric_dtype(X[column]):
            fill_values[column] = train_rows[column].mean()
        else:
            modes = train_rows[column].mode()
            # mode() is empty when the training rows hold no value at all for this column
            if not modes.empty:
                fill_values[column] = modes[0]
if fill_values:
    X = X.fillna(fill_values)

# Convert categorical features to dummy variables. dtype=int keeps them numeric:
# recent pandas versions produce bool dummies by default, and the non-numeric
# filter further down would silently drop every one of them
X = pd.get_dummies(X, columns=['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm'], drop_first=True, dtype=int)

# Encode any remaining Yes/No text column as 1/0 instead of discarding it
# (presumably RainToday in this dataset - verify against the printed column list)
yes_no = {'No': 0, 'Yes': 1}
for column in X.select_dtypes(exclude=['number']).columns:
    if X[column].isin(list(yes_no)).all():
        X[column] = X[column].map(yes_no)

# Scale numerical features: fit on the training rows, then apply to every row
scaler = StandardScaler()
scaler.fit(X.loc[train_idx, numeric_columns])
X[numeric_columns] = scaler.transform(X[numeric_columns])

# Check the data types in X
print("Data types in X:")
print(X.dtypes)

# Identify non-numeric columns (check before dropping)
non_numeric_columns = X.select_dtypes(exclude=['number']).columns.tolist()
if non_numeric_columns:
    print("Non-numeric columns detected:", non_numeric_columns)
    X.drop(columns=non_numeric_columns, inplace=True)

# Check for NaN values in features
print("Missing values in X:")
print(X.isnull().sum())

# Drop rows with NaN values in features (safety net for columns that could not be filled)
X.dropna(inplace=True)

# Align y and the split with the filtered X, keeping the shuffled row order
y = y[X.index]
train_idx = train_idx[pd.Index(train_idx).isin(X.index)]
test_idx = test_idx[pd.Index(test_idx).isin(X.index)]

# Ensure all columns in X are numeric
if not all(pd.api.types.is_numeric_dtype(dtype) for dtype in X.dtypes):
    raise ValueError("There are still non-numeric columns in X.")

# Split the data using the rows chosen above
X_train, X_test = X.loc[train_idx], X.loc[test_idx]
y_train, y_test = y.loc[train_idx], y.loc[test_idx]


# 3. Algorithm Understanding & Model Development

In [None]:
# Initialize model with a fixed seed so reruns build the same forests and the
# grid search picks the same winner (the train/test split is seeded with 42 too)
model = RandomForestClassifier(random_state=42)

# Define parameter grid: 2 x 3 = 6 candidate settings
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
}

# Grid Search: 5-fold cross-validation per candidate; n_jobs=-1 runs the
# independent fits in parallel on all available cores
grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Get the best model (refitted on the whole training set by GridSearchCV)
best_model = grid_search.best_estimator_

# Print the best parameters
print("Best parameters found: ", grid_search.best_params_)


In [None]:
# Predict a label for every row of the held-out test features
y_pred = best_model.predict(X_test)

# Print the confusion matrix first, then the per-class report, both computed
# from the same (true labels, predicted labels) pair
for metric in (confusion_matrix, classification_report):
    print(metric(y_test, y_pred))



In [None]:
# ROC AUC is computed from scores rather than hard labels: take the second
# column of the predicted class probabilities as the score for each test row
positive_class_scores = best_model.predict_proba(X_test)[:, 1]
roc_auc = roc_auc_score(y_test, positive_class_scores)
print(f"ROC AUC Score: {roc_auc:.4f}")


In [None]:
# Draw the confusion matrix as an annotated heatmap with integer cell counts
cm = confusion_matrix(y_test, y_pred)
ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')

# Label the axes through the Axes object that heatmap returns
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()