<a href="https://colab.research.google.com/github/naqqaash/DSPL-Project--KJ-Marketing/blob/main/Rf_model_testing_%26_deployment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE


df = pd.read_csv("/content/processed_data.csv")


df.dropna(subset=['cluster_catgeory'], inplace=True)
df.fillna(df.median(numeric_only=True), inplace=True)


X = df.drop(columns=['cluster_catgeory'])
y = df['cluster_catgeory']


min_samples = 3
class_counts = y.value_counts()
valid_classes = class_counts[class_counts >= min_samples].index
df = df[df['cluster_catgeory'].isin(valid_classes)]

X = df.drop(columns=['cluster_catgeory'])
y = df['cluster_catgeory']


smallest_class_size = y.value_counts().min()
k_value = min(5, smallest_class_size - 1)
if k_value >= 1:
    smote = SMOTE(random_state=42, k_neighbors=k_value)
    X_resampled, y_resampled = smote.fit_resample(X, y)
else:
    X_resampled, y_resampled = X, y


X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled, test_size=0.2, random_state=42, stratify=y_resampled
)


scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("✅ Data Preprocessing Complete! Ready for Model Training.")


✅ Data Preprocessing Complete! Ready for Model Training.


In [None]:

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


param_dist = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}


rf_model = RandomForestClassifier(random_state=42)


random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist,
    n_iter=10,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)

random_search.fit(X_train, y_train)
best_params = random_search.best_params_
print("🔥 Best Hyperparameters:", best_params)


best_rf_model = RandomForestClassifier(random_state=42, **best_params)
best_rf_model.fit(X_train, y_train)


y_pred = best_rf_model.predict(X_test)

print("\n✅ Model Evaluation:")
print("📊 Accuracy:", accuracy_score(y_test, y_pred))
print("\n🔍 Classification Report:\n", classification_report(y_test, y_pred))
print("\n📌 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

🔥 Best Hyperparameters: {'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': None}

✅ Model Evaluation:
📊 Accuracy: 0.9997795239481088

🔍 Classification Report:
               precision    recall  f1-score   support

         1.0       1.00      1.00      1.00     37797
         2.0       1.00      1.00      1.00     37797
         3.0       1.00      1.00      1.00     37797
         4.0       1.00      1.00      1.00     37797
         5.0       1.00      1.00      1.00     37797
         6.0       1.00      1.00      1.00     37797

    accuracy                           1.00    226782
   macro avg       1.00      1.00      1.00    226782
weighted avg       1.00      1.00      1.00    226782


📌 Confusion Matrix:
 [[37786     1     1     8     0     1]
 [    1 37791     0     1     3     1]
 [    1     0 37793     2     1     0]
 [    0     2     2 37790     2     1]
 [    0     6     3     6 37781     1]
 [    0     4     0     0     2 37791]]


In [None]:

import joblib
from sklearn.ensemble import RandomForestClassifier


best_rf_model = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
best_rf_model.fit(X_train, y_train)


joblib.dump(best_rf_model, "random_forest_model.pkl")
joblib.dump(scaler, "scaler.pkl")

print("✅ Model training complete and saved successfully!")


✅ Model training complete and saved successfully!


In [None]:

import pandas as pd
import joblib


test_df = pd.read_csv("test.csv")


test_df.fillna(test_df.median(numeric_only=True), inplace=True)


best_rf_model = joblib.load("random_forest_model.pkl")
scaler = joblib.load("scaler.pkl")

for col in test_df.select_dtypes(include=['object']).columns:
    try:
        test_df[col] = pd.to_numeric(test_df[col], errors='coerce')
    except Exception as e:
        print(f"Error in column {col}: {e}")


test_df.fillna(test_df.median(numeric_only=True), inplace=True)


missing_cols = set(X.columns) - set(test_df.columns)
for col in missing_cols:
    test_df[col] = 0


test_df = test_df[X.columns]


X_test_final = scaler.transform(test_df)


y_pred_test = best_rf_model.predict(X_test_final)


test_df['Predicted_Cluster_Category'] = y_pred_test
test_df.to_csv("test_with_predictions.csv", index=False)


print("\n✅ Predictions saved to 'test_with_predictions.csv'")
print("\n🔹 First 5 Predictions:", y_pred_test[:5])
print("\n🔹 Predicted Class Distribution:\n", pd.Series(y_pred_test).value_counts())



✅ Predictions saved to 'test_with_predictions.csv'

🔹 First 5 Predictions: [3. 3. 3. 3. 3.]

🔹 Predicted Class Distribution:
 3.0    40749
Name: count, dtype: int64


In [None]:
from flask import Flask, request, jsonify
import joblib
import pandas as pd


model = joblib.load("random_forest_model.pkl")
scaler = joblib.load("scaler.pkl")


app = Flask(__name__)

@app.route("/")
def home():
    return "RF Model API is running!"

@app.route("/predict", methods=["POST"])
def predict():
    try:

        data = request.get_json()


        df = pd.DataFrame(data)


        expected_features = model.feature_names_in_
        for col in expected_features:
            if col not in df:
                df[col] = 0


        X_scaled = scaler.transform(df[expected_features])


        predictions = model.predict(X_scaled)


        return jsonify({"predictions": predictions.tolist()})

    except Exception as e:
        return jsonify({"error": str(e)})

if __name__ == "__main__":
    app.run(debug=True)


 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m
INFO:werkzeug: * Restarting with stat
