<a href="https://colab.research.google.com/github/munnurumahesh03-coder/machine-learning-for-classification/blob/main/Capstone_Project_For_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Stacking Classifier (3 Models)**

---



In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
warnings.filterwarnings('ignore')

In [None]:
weather_data = pd.read_csv('australia.csv')

In [None]:
weather_data

# **Exploratory Data Analysis**

---



In [None]:
weather_data.info()

In [None]:
weather_data.describe()

In [None]:
weather_data.isnull().sum()

In [None]:
# Remove rows whose 'RainTomorrow' value is missing and report the row count
# before and after.
rows_before = len(weather_data)
print(f"Original number of rows: {rows_before}")

weather_data.dropna(subset=['RainTomorrow'], inplace=True)

rows_after = len(weather_data)
print(f"Number of rows after dropping missing target values: {rows_after}")

In [None]:
# Relative frequency of each 'RainTomorrow' value.
target_share = weather_data['RainTomorrow'].value_counts(normalize=True)
print("\nDistribution of 'RainTomorrow':")
print(target_share)

In [None]:
# Encode the target as integers (0 for 'No', 1 for 'Yes').
# The dtype guard keeps this cell safe to re-run: mapping an already-encoded
# column a second time would turn every value into NaN, because 0 and 1 are
# not keys of the mapping.
print("\nConverting target variable to numerical format (0 for 'No', 1 for 'Yes')...")
if not pd.api.types.is_numeric_dtype(weather_data['RainTomorrow']):
    weather_data['RainTomorrow'] = weather_data['RainTomorrow'].map({'No': 0, 'Yes': 1})
print("Conversion complete.")
print("First 5 values of the transformed 'RainTomorrow' column:")
print(weather_data['RainTomorrow'].head())

In [None]:
weather_data.info()

In [None]:
weather_data.describe()

# **Visualization**

---



In [None]:
# Bar chart of the number of rows per 'RainTomorrow' value.
ax = plt.figure(figsize=(8, 6)).add_subplot()

sns.countplot(data=weather_data, x='RainTomorrow', palette='viridis', ax=ax)

ax.set_title('Distribution of Target Variable: RainTomorrow', fontsize=16)
ax.set_xlabel('Will it Rain Tomorrow?', fontsize=12)
ax.set_ylabel('Count', fontsize=12)

In [None]:
# Histogram of 'Rainfall' in 50 bins, drawn with a logarithmic count axis.
ax = plt.figure(figsize=(10, 6)).add_subplot()

sns.histplot(data=weather_data, x='Rainfall', bins=50, ax=ax)
ax.set_yscale('log')

ax.set_title('Distribution of Rainfall (Log Scale)', fontsize=16)
ax.set_xlabel('Rainfall (mm)', fontsize=12)
ax.set_ylabel('Frequency (Log Scale)', fontsize=12)

In [None]:
# Box plot of 'Humidity3pm' grouped by the target value.
plt.figure(figsize=(8, 6))
humidity_ax = sns.boxplot(
    data=weather_data, x='RainTomorrow', y='Humidity3pm', palette='viridis'
)

humidity_ax.set_title('Humidity at 3pm vs. Rain Tomorrow', fontsize=16)
humidity_ax.set_xlabel('Did it Rain Tomorrow?', fontsize=12)
humidity_ax.set_ylabel('Humidity at 3pm (%)', fontsize=12)

In [None]:
# Box plot of 'Sunshine' grouped by the target value.
plt.figure(figsize=(8, 6))
sunshine_ax = sns.boxplot(
    data=weather_data, x='RainTomorrow', y='Sunshine', palette='plasma'
)

sunshine_ax.set_title('Hours of Sunshine vs. Rain Tomorrow', fontsize=16)
sunshine_ax.set_xlabel('Did it Rain Tomorrow?', fontsize=12)
sunshine_ax.set_ylabel('Sunshine (Hours)', fontsize=12)

In [None]:
# Pairwise correlations of the float64/int64 columns, drawn on a dedicated
# large figure.
plt.figure(figsize=(16, 12))
numeric_frame = weather_data.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numeric_frame.corr()
sns.heatmap(correlation_matrix, annot=True, fmt='.1f', cmap='coolwarm', linewidths=.5)
plt.title('Correlation Heatmap of All Numerical Features', fontsize=18)

# **Feature Engineering**

---



In [None]:
# Work on a copy so `weather_data` itself is left unchanged.
df_eng = weather_data.copy()

# --- 1. Date-Based Features ---
# Parse 'Date' into datetimes, then read the calendar components from the
# .dt accessor.
df_eng['Date'] = pd.to_datetime(df_eng['Date'])
date_accessor = df_eng['Date'].dt

df_eng['Year'] = date_accessor.year
df_eng['Month'] = date_accessor.month
df_eng['Day'] = date_accessor.day
df_eng['DayOfYear'] = date_accessor.dayofyear

print("Created date-based features: Year, Month, Day, DayOfYear")

In [None]:
min_temp = df_eng['MinTemp']
max_temp = df_eng['MaxTemp']

# Spread between the daily maximum and minimum temperature.
df_eng['TempRange'] = max_temp - min_temp

# Midpoint of the daily minimum and maximum temperature.
df_eng['AvgTemp'] = (min_temp + max_temp) / 2

print("Created temperature-based features: TempRange, AvgTemp")

In [None]:
# Difference between the 3pm and 9am pressure readings.
pressure_3pm = df_eng['Pressure3pm']
pressure_9am = df_eng['Pressure9am']
df_eng['PressureChange'] = pressure_3pm - pressure_9am

print("Created pressure-based feature: PressureChange")

In [None]:
# Mean of the 9am and 3pm wind-speed readings.
wind_speed_total = df_eng['WindSpeed9am'] + df_eng['WindSpeed3pm']
df_eng['AvgWindSpeed'] = wind_speed_total / 2

print("Created wind-based feature: AvgWindSpeed")

In [None]:
# Product of the average temperature and 'Humidity3pm' scaled by 0.01.
humidity_scaled = df_eng['Humidity3pm'] * 0.01
df_eng['HumidityTemp_Interaction'] = df_eng['AvgTemp'] * humidity_scaled

print("Created interaction feature: HumidityTemp_Interaction")

In [None]:
# The derived Year/Month/Day/DayOfYear columns replace the raw 'Date' column.
df_eng = df_eng.drop(columns=['Date'])
print("\nDropped the original 'Date' column.")

In [None]:
# Preview the engineered frame and compare its column count with the source frame.
print("\n--- DataFrame with New Features (first 5 rows) ---")
display(df_eng.head())

original_column_count = len(weather_data.columns)
new_column_count = len(df_eng.columns)
print(f"\nOriginal number of columns: {original_column_count}")
print(f"New number of columns: {new_column_count}")

In [None]:
print("\n--- Preparing 'RainToday' Feature ---")

# Only map when the column exists and still has object dtype.
rain_today_needs_encoding = (
    'RainToday' in df_eng.columns and df_eng['RainToday'].dtype == 'object'
)
if rain_today_needs_encoding:
    yes_no_codes = {'No': 0, 'Yes': 1}
    df_eng['RainToday'] = df_eng['RainToday'].map(yes_no_codes)
    print("Converted 'RainToday' to 0s and 1s. It will be treated as CATEGORICAL in the pipeline.")

print("\n--- Final DataFrame Info ---")
df_eng.info()

# **Train Test Split**

---



In [None]:
# --- Time-based split on 'Year' ---
# Rows before 2015 form the training set, 2015 the validation set, and
# everything after 2015 the test set.
year_values = df_eng['Year']
train_df = df_eng.loc[year_values < 2015].copy()
val_df = df_eng.loc[year_values == 2015].copy()
test_df = df_eng.loc[year_values > 2015].copy()

# --- Report the size of each partition ---
print("--- DataFrame Shapes after Time-Based Splitting ---")
print("Training DataFrame shape:", train_df.shape)
print("Validation DataFrame shape:", val_df.shape)
print("Test DataFrame shape:", test_df.shape)

In [None]:
train_df

In [None]:
val_df

In [None]:
test_df

# **Input and Target Columns**

---



In [None]:
target_column = 'RainTomorrow'

# Training inputs are every column except the target; the target column is the label.
X_train = train_df.drop(columns=[target_column])
y_train = train_df[target_column]

In [None]:
# Validation inputs (all columns but the target) and validation labels.
X_val = val_df.drop(columns=[target_column])
y_val = val_df[target_column]

In [None]:
# Test inputs (all columns but the target) and test labels.
X_test = test_df.drop(columns=[target_column])
y_test = test_df[target_column]

In [None]:
# --- Report the shape of every feature/label pair ---
divider = "-" * 30

print("--- Final Dataset Shapes ---")
print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of y_train: {y_train.shape}")
print(divider)
print(f"Shape of X_val:   {X_val.shape}")
print(f"Shape of y_val:   {y_val.shape}")
print(divider)
print(f"Shape of X_test:  {X_test.shape}")
print(f"Shape of y_test:  {y_test.shape}")

# **CatBoost**

---



# **Pipeline**

---



In [None]:
# --- 1. Imports ---
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.compose import ColumnTransformer
from catboost import CatBoostClassifier

# --- 2. Define Feature Lists ---
# 'Year' is used for the time-based train/validation/test split, so it is kept
# out of the model features here, matching the Logistic Regression and Random
# Forest pipelines. The ColumnTransformer's remainder='drop' discards it.
numerical_features = [
    col for col in X_train.select_dtypes(include=np.number).columns
    if col != 'Year'
]
categorical_features = X_train.select_dtypes(exclude=np.number).columns.tolist()

# 'RainToday' is numeric after the 0/1 encoding but is handled as a
# categorical feature.
if 'RainToday' in numerical_features:
    numerical_features.remove('RainToday')
    categorical_features.append('RainToday')

# --- 3. Define the Data Type Conversion Function ---
# Used as the last step of the categorical branch of the preprocessor.
# NOTE: the fitted pipeline that wraps this function is later saved with
# joblib; pickling stores functions by reference, so `to_string` must be
# defined in whichever session loads that file.
def to_string(df):
    """Return ``df`` with every value cast to ``str`` via ``df.astype(str)``.

    Works for any object that exposes ``astype`` (for example a pandas
    DataFrame or a NumPy array).
    """
    as_text = df.astype(str)
    return as_text

# --- 4. Create the Preprocessing Pipeline ---
# Numerical branch: median imputation followed by standard scaling.
catboost_num_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the constant 'Missing', then cast every
# value to a string.
catboost_cat_steps = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='Missing')),
    ('caster', FunctionTransformer(to_string)),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', catboost_num_steps, numerical_features),
        ('cat', catboost_cat_steps, categorical_features),
    ],
    remainder='drop',
)

# --- 5. Get Categorical Feature Indices ---
# The 'num' branch is listed first, so the categorical columns occupy the
# positions immediately after the numerical ones.
first_cat_index = len(numerical_features)
cat_feature_indices = list(
    range(first_cat_index, first_cat_index + len(categorical_features))
)

print(f"Numerical features count: {len(numerical_features)}")
print(f"Categorical features count: {len(categorical_features)}")
print(f"CatBoost will receive categorical features at indices: {cat_feature_indices}")

# --- 6. Create the Final CatBoost Pipeline ---
catboost_classifier = CatBoostClassifier(
    cat_features=cat_feature_indices,
    task_type='GPU',
    random_state=42,
    verbose=0,
)
catboost_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', catboost_classifier),
])

print("\n✅ Final, corrected CatBoost pipeline created successfully.")
display(catboost_pipeline)

# **Evaluation and Selection**

---



In [None]:
print("Training the cat pipeline...")
catboost_pipeline.fit(X_train, y_train)
print("✅ Training complete.")

In [None]:
from sklearn.metrics import f1_score, roc_auc_score, classification_report

print("\nMaking predictions on the validation data...")
val_preds = catboost_pipeline.predict(X_val)
# Column 1 of predict_proba is the probability used for AUC-ROC.
val_probabilities = catboost_pipeline.predict_proba(X_val)
val_preds_proba = val_probabilities[:, 1]

print("\n--- Baseline catboost Evaluation ---")
f1 = f1_score(y_val, val_preds)
roc_auc = roc_auc_score(y_val, val_preds_proba)
print(f"F1-Score: {f1:.4f}")
print(f"AUC-ROC:  {roc_auc:.4f}")

print("\n--- Classification Report ---")
validation_report = classification_report(y_val, val_preds)
print(validation_report)

# **Saving through Joblib**

---



In [None]:
import joblib

# Serialise the fitted CatBoost pipeline to a file in the current working directory.
model_filename = '09_catboost_champion.joblib'

print(f"--- 💾 Saving model to '{model_filename}' in the local session ---")
joblib.dump(catboost_pipeline, filename=model_filename)

print("\n✅ Success! The champion CatBoost model has been saved locally.")

# **Logistic Regression**

---



# **Pipeline**

---



In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

# 1. Identify Numerical and Categorical Columns from X_train
# -----------------------------------------------------------
# 'Year' is removed from the numerical columns: it was only used for splitting
# and should not be a model input.
numeric_columns = X_train.select_dtypes(include=np.number).columns
numerical_features = numeric_columns.drop('Year').tolist()
categorical_features = X_train.select_dtypes(exclude=np.number).columns.tolist()

# 'RainToday' is stored as a float (0.0/1.0) but is conceptually categorical,
# so it is moved to the categorical list to be one-hot encoded.
if 'RainToday' in numerical_features:
    numerical_features.remove('RainToday')
    categorical_features.append('RainToday')

numerical_count = len(numerical_features)
categorical_count = len(categorical_features)
print(f"Identified {numerical_count} numerical features for the pipeline.")
print(f"Identified {categorical_count} categorical features for the pipeline.")

# 2. Construct the Preprocessing Pipelines
# ----------------------------------------
# Numerical pipeline: Median imputation + Standard scaling
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical pipeline: most-frequent imputation + one-hot encoding.
# Missing categories are filled with the column's most frequent value.
# SimpleImputer only uses `fill_value` with strategy='constant', so it is not
# set here.
# With handle_unknown='ignore', a category unseen during fit is encoded as all
# zeros, the same encoding as the category removed by drop='first'.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'))
])

# 3. Combine Preprocessing Steps with ColumnTransformer
# Columns not listed in either branch (such as 'Year') are dropped.
lr_transformers = [
    ('num', numerical_pipeline, numerical_features),
    ('cat', categorical_pipeline, categorical_features),
]
preprocessor = ColumnTransformer(transformers=lr_transformers, remainder='drop')

# 4. Create the Full Model Pipeline
# ---------------------------------
# Preprocessing followed by a liblinear logistic-regression classifier.
lr_classifier = LogisticRegression(solver='liblinear', random_state=42)
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', lr_classifier),
])

print("\n✅ Preprocessing and full model pipelines created successfully!")

model_pipeline

# **GridSearch CV**

---



In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Relies on 'preprocessor', 'X_train' and 'y_train' from earlier cells.

# --- 1. Define the Model Pipeline ---
lr_estimator = LogisticRegression(solver='liblinear', random_state=42)
model_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', lr_estimator),
])

# --- 2. Define the Hyperparameter Grid to Search ---
# Keys use the '<step>__<param>' form so they reach the 'classifier' step.
param_grid = {
    'classifier__C': [0.1, 1.0, 10, 100],
    'classifier__class_weight': [None, 'balanced'],
}

# --- 3. Set up and Run GridSearchCV ---
print("--- Running GridSearchCV for Logistic Regression ---")

grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Fits every grid combination with 5-fold cross-validation.
grid_search.fit(X_train, y_train)

# --- 4. Display the Results ---
print("\n✅ GridSearchCV Complete.")
print(f"Best F1-Score found during cross-validation: {grid_search.best_score_:.4f}")
print(f"Best Hyperparameters found: {grid_search.best_params_}")

# Per-combination results, best-ranked rows first.
cv_results_df = pd.DataFrame(grid_search.cv_results_)
summary_columns = [
    'param_classifier__C',
    'param_classifier__class_weight',
    'mean_test_score',
    'std_test_score',
    'rank_test_score',
]
top_results = cv_results_df[summary_columns].sort_values('rank_test_score').head()
print("\n--- Detailed CV Results (Top 5) ---")
display(top_results)

# Keep a handle on the best pipeline found by the search for evaluation or saving.
best_lr_model = grid_search.best_estimator_

# **Evaluation and Selection**

---



In [None]:
# Pull the winning pipeline out of the completed grid search.
# `best_lr_model` is the name the evaluation cell uses; `est_lr_model` (the
# earlier, misspelled name) is kept as an alias in case anything still refers
# to it.
best_lr_model = grid_search.best_estimator_
est_lr_model = best_lr_model

print("✅ Best model extracted from GridSearchCV.")
# Report the parameters the search actually selected rather than a fixed
# message, which would go stale whenever the grid or the data changes.
print(f"The best model has parameters: {grid_search.best_params_}")

In [None]:
from sklearn.metrics import f1_score, roc_auc_score

# --- Step 1: Take the best pipeline from the completed grid search ---
best_lr_model = grid_search.best_estimator_
print("✅ Best model extracted from GridSearchCV.")

# --- Step 2: Predict labels and probabilities for the validation set ---
print("Making predictions on the unseen validation data...")
val_preds = best_lr_model.predict(X_val)
# Column 1 of predict_proba is the probability used for AUC.
val_probabilities = best_lr_model.predict_proba(X_val)
val_preds_proba = val_probabilities[:, 1]

# --- Step 3: Compute and report the validation scores ---
final_f1_score = f1_score(y_val, val_preds)
final_auc_score = roc_auc_score(y_val, val_preds_proba)

print("\n--- Official Gauntlet Score for Logistic Regression ---")
print(f"Validation F1-Score: {final_f1_score:.4f}")
print(f"Validation AUC-ROC:  {final_auc_score:.4f}")
print("----------------------------------------------------")
print("\nThis is the score to beat for all future models.")

# **Saving through Joblib**

---



In [None]:
import joblib

# Persist the exact pipeline that GridSearchCV selected and refit on the
# training data, i.e. the same object the validation cell scored.
# Re-creating the pipeline from hand-typed hyperparameters and the current
# global `preprocessor` risks saving a model that differs from the tuned one
# (stale values, or a preprocessor rebound by a later cell).
correct_model = grid_search.best_estimator_
print(f"✅ Tuned model selected with parameters: {grid_search.best_params_}")


model_filename = '01_logistic_regression_tuned.joblib'
joblib.dump(correct_model, model_filename)
print(f"\n✅ Better model has been saved to '{model_filename}'.")
print("The old model file has been overwritten.")

# **Random Forest**

---



In [None]:
# This cell installs RAPIDS (which includes cuML) by cloning the
# rapidsai-csp-utils repository and running its Colab pip-install script.
# The installation process will take approximately 10-15 minutes.
#
# IMPORTANT: The session will automatically restart after the installation is
# complete. This is a normal and required part of the process.
# NOTE(review): a restarted session presumably loses the variables created by
# earlier cells (e.g. X_train, y_train), so those cells would need to be
# re-run before the cells below - TODO confirm.

!git clone https://github.com/rapidsai/rapidsai-csp-utils.git
!python rapidsai-csp-utils/colab/pip-install.py

# **Pipeline**

---



In [None]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from cuml.ensemble import RandomForestClassifier as cuMLRandomForest

# Requires the 'X_train' DataFrame produced by the earlier split cells.

# --- 1. Identify Numerical and Categorical Columns ---
numerical_features = X_train.select_dtypes(include=np.number).columns.tolist()
categorical_features = X_train.select_dtypes(exclude=np.number).columns.tolist()

# 'RainToday' is handled as a categorical feature.
if 'RainToday' in numerical_features:
    numerical_features.remove('RainToday')
    categorical_features.append('RainToday')

# 'Year' is left out of the columns handed to the numerical pipeline.
final_numerical_features = list(
    filter(lambda col: col != 'Year', numerical_features)
)

print(f"Identified {len(final_numerical_features)} numerical features for the pipeline.")
print(f"Identified {len(categorical_features)} categorical features.")


# --- 2. Define the Preprocessing Steps ---
# Numerical branch: median imputation, then standard scaling.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: most-frequent imputation, then dense one-hot encoding
# that ignores categories unseen during fit.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
])


# --- 3. Create the Master Preprocessor ---
rf_transformers = [
    ('num', numerical_pipeline, final_numerical_features),
    ('cat', categorical_pipeline, categorical_features),
]
preprocessor = ColumnTransformer(transformers=rf_transformers, remainder='drop')


# --- 4. Create the Final Random Forest Pipeline ---
# The classifier is the cuML implementation imported as cuMLRandomForest.
rf_classifier = cuMLRandomForest(random_state=42)
rf_gpu_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', rf_classifier),
])


print("\n✅ Full Random Forest pipeline created successfully.")
print("\nPipeline Steps:")
display(rf_gpu_pipeline)

# **RandomizedSearch CV**

---



In [None]:
import cudf
from cuml.ensemble import RandomForestClassifier as cuMLRandomForest
from sklearn.model_selection import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from scipy.stats import randint

# --- 2. Create the Final GPU Pipeline ---
# Uses the 'preprocessor' currently bound in the notebook.
rf_search_classifier = cuMLRandomForest(random_state=42)
rf_gpu_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', rf_search_classifier),
])

# --- 3. Define the Hyperparameter Distributions ---
# randint(low, high) samples integers from low up to high - 1.
param_dist = {
    'classifier__n_estimators': randint(100, 500),
    'classifier__max_depth': randint(10, 25),
    'classifier__min_samples_split': randint(2, 20),
    'classifier__min_samples_leaf': randint(1, 20),
    'classifier__max_features': ['sqrt', 'log2', 0.5, 0.7],
}

# --- 4. Set up and Run RandomizedSearchCV ---
# 10 sampled candidates, each scored by F1 with 2-fold cross-validation.
random_search_gpu = RandomizedSearchCV(
    estimator=rf_gpu_pipeline,
    param_distributions=param_dist,
    n_iter=10,
    cv=2,
    scoring='f1',
    refit=True,
    random_state=42,
    verbose=1,
)

print("\n🚀 Starting FINAL GPU-Accelerated Randomized Search...")
# The search is fitted on the pandas DataFrames; preprocessing happens inside
# the pipeline.
random_search_gpu.fit(X_train, y_train)
print("\n✅ FINAL GPU-Accelerated Randomized Search Complete.")

# --- 5. Display Results ---
best_cv_f1 = random_search_gpu.best_score_
print("\n🏆 Best Hyperparameters Found:")
print(random_search_gpu.best_params_)
print("\nBest F1-Score from Cross-Validation:")
print(f"{best_cv_f1:.4f}")

# **Evaluation and Selection**

---



In [None]:
# --- 1. Imports ---
from sklearn.metrics import f1_score, roc_auc_score, classification_report, confusion_matrix
import plotly.figure_factory as ff

# --- 2. Extract the Best Model ---
best_rf_model = random_search_gpu.best_estimator_
print("✅ Best Random Forest model extracted from RandomizedSearchCV.")
print(f"The model has parameters: {random_search_gpu.best_params_}")


# --- 3. Make Predictions on the Validation Set ---
print("\nMaking predictions on the unseen validation data...")

# The pipeline is given the pandas DataFrame directly.
val_preds = best_rf_model.predict(X_val)
val_probabilities = best_rf_model.predict_proba(X_val)
val_preds_proba = val_probabilities[:, 1]


# --- 4. Calculate and Print the Final Scores ---
final_f1_score = f1_score(y_val, val_preds)
final_auc_score = roc_auc_score(y_val, val_preds_proba)

print("\n--- 🏆 Official Gauntlet Score for Tuned Random Forest 🏆 ---")
print(f"Validation F1-Score: {final_f1_score:.4f}")
print(f"Validation AUC-ROC:  {final_auc_score:.4f}")
print("-----------------------------------------------------------------")


# --- 5. Display the Full Classification Report ---
print("\n--- Full Classification Report ---")
class_labels = ['No Rain', 'Rain']
report = classification_report(y_val, val_preds, target_names=class_labels)
print(report)


# --- 6. Display the Confusion Matrix using Plotly ---
print("\n--- Confusion Matrix ---")
conf_matrix = confusion_matrix(y_val, val_preds)
predicted_labels = ['Predicted No Rain', 'Predicted Rain']
actual_labels = ['Actual No Rain', 'Actual Rain']
fig = ff.create_annotated_heatmap(
    z=conf_matrix,
    x=predicted_labels,
    y=actual_labels,
    colorscale='Blues',
    showscale=True,
)
fig.update_layout(title_text='Confusion Matrix (Tuned Random Forest)')
fig.show()

# **Saving through Joblib**

---



In [None]:
import joblib

# Serialise the best pipeline found by the randomized search.
model_filename = '05_random_forest_tuned.joblib'
best_rf_model = random_search_gpu.best_estimator_

joblib.dump(best_rf_model, filename=model_filename)

print(f"✅ Model successfully saved to the temporary session storage as '{model_filename}'.")