In [53]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC


In [54]:
# Directory that holds the processed CSV files. Set the BUSINESS_DATA_DIR
# environment variable to run the notebook on another machine; when it is not
# set, the original author's local path is used.
DATA_DIR = Path(
    os.environ.get(
        "BUSINESS_DATA_DIR",
        r"C:\Users\pavan\OneDrive\Desktop\AI_Assisted Business Performance And Risk Prediction System\data\processed",
    )
)

# Load the cleaned transaction-level data and preview the first rows.
df = pd.read_csv(DATA_DIR / "clean_business_data.csv")
df.head(10)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Revenue,Month,CustomerType
0,536365,85123.0,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom,15.3,2010-12,Repeat
1,536365,71053.0,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34,2010-12,Repeat
2,536365,84406.0,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom,22.0,2010-12,Repeat
3,536365,84029.0,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34,2010-12,Repeat
4,536365,84029.0,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom,20.34,2010-12,Repeat
5,536365,22752.0,SET 7 BABUSHKA NESTING BOXES,2,2010-12-01 08:26:00,7.65,17850.0,United Kingdom,15.3,2010-12,Repeat
6,536365,21730.0,GLASS STAR FROSTED T-LIGHT HOLDER,6,2010-12-01 08:26:00,4.25,17850.0,United Kingdom,25.5,2010-12,Repeat
7,536366,22633.0,HAND WARMER UNION JACK,6,2010-12-01 08:28:00,1.85,17850.0,United Kingdom,11.1,2010-12,Repeat
8,536366,22632.0,HAND WARMER RED POLKA DOT,6,2010-12-01 08:28:00,1.85,17850.0,United Kingdom,11.1,2010-12,Repeat
9,536367,84879.0,ASSORTED COLOUR BIRD ORNAMENT,32,2010-12-01 08:34:00,1.69,13047.0,United Kingdom,54.08,2010-12,Repeat


In [55]:
# Collapse the transaction rows to one row per product (StockCode).
product_aggregations = {
    "Total_Revenue": ("Revenue", "sum"),
    "Total_Units": ("Quantity", "sum"),
    "Order_Count": ("InvoiceNo", "nunique"),  # distinct invoices containing the product
}
product_data = df.groupby("StockCode").agg(**product_aggregations).reset_index()


In [56]:
# Build the binary target: 1 when a product's total revenue is below the
# median across all products, otherwise 0.
median_revenue = product_data["Total_Revenue"].median()
below_median = product_data["Total_Revenue"].lt(median_revenue)
product_data["Risk_Label"] = below_median.astype(int)


In [57]:
# Class balance of the target (number of products per label)
product_data["Risk_Label"].value_counts()

Risk_Label
0    1597
1    1597
Name: count, dtype: int64

In [58]:
# Predictors (X) and target (y) used by every classifier below
feature_columns = ["Total_Units", "Order_Count"]
X = product_data[feature_columns]
y = product_data["Risk_Label"]

In [59]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the products for evaluation. Stratifying on y preserves the
# class balance in both splits, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [60]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits so no test information leaks into the fit.
scalar = StandardScaler().fit(X_train)
X_train_scaled = scalar.transform(X_train)
X_test_scaled = scalar.transform(X_test)

**MODEL 1: Logistic Regression**

In [61]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score

# Linear baseline with default settings, trained on the standardized features.
log_reg = LogisticRegression().fit(X_train_scaled, y_train)

# Hold-out predictions and the two metrics reported in the comparison table.
y_pred_lr = log_reg.predict(X_test_scaled)
acc_lr, f1_lr = (
    accuracy_score(y_test, y_pred_lr),
    f1_score(y_test, y_pred_lr),
)

**MODEL 2: Decision Tree**

In [62]:
from sklearn.tree import DecisionTreeClassifier

# Single decision tree on the standardized features.
# random_state is pinned because the tree permutes features at each split and
# breaks ties between equally good splits at random; without a seed the scores
# can differ between runs of the notebook.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_scaled, y_train)

# Hold-out predictions and metrics for the comparison table.
y_pred_dt = dt.predict(X_test_scaled)

acc_dt = accuracy_score(y_test, y_pred_dt)
f1_dt = f1_score(y_test, y_pred_dt)

**MODEL 3: Random Forest**

In [63]:
from sklearn.ensemble import RandomForestClassifier

# Random forest on the standardized features.
# random_state is pinned because the forest draws bootstrap samples and random
# feature subsets; without a seed the scores change on every run.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train_scaled, y_train)

# Hold-out predictions and metrics for the comparison table.
y_pred_rf = rf.predict(X_test_scaled)

acc_rf = accuracy_score(y_test, y_pred_rf)
f1_rf = f1_score(y_test, y_pred_rf)

**MODEL 4: Gradient Boosting**

In [64]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting, seeded for repeatable results.
# Trained and evaluated on the same standardized arrays as every other model
# in this notebook so the comparison uses one consistent feature
# representation (tree splits depend only on feature ordering, so the scaling
# itself is not expected to change the predictions).
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train_scaled, y_train)

# Hold-out predictions and metrics for the comparison table.
y_pred_gb = gb.predict(X_test_scaled)

acc_gb = accuracy_score(y_test, y_pred_gb)
f1_gb = f1_score(y_test, y_pred_gb)

**MODEL 5: Support Vector Machine (SVM)**

In [65]:
from sklearn.svm import SVC

# Support vector classifier with default settings, trained on the
# standardized features.
svm = SVC().fit(X_train_scaled, y_train)

# Hold-out predictions and metrics for the comparison table.
y_pred_svm = svm.predict(X_test_scaled)
acc_svm, f1_svm = (
    accuracy_score(y_test, y_pred_svm),
    f1_score(y_test, y_pred_svm),
)


**FINAL COMPARISON TABLE**

In [66]:
# One (model name, accuracy, F1) row per classifier, in training order.
model_scores = [
    ("Logistic Regression", acc_lr, f1_lr),
    ("Decision Tree", acc_dt, f1_dt),
    ("Random Forest", acc_rf, f1_rf),
    ("Gradient Boosting", acc_gb, f1_gb),
    ("SVM", acc_svm, f1_svm),
]
results_df = pd.DataFrame(model_scores, columns=["Model", "Accuracy", "F1_Score"])

# Show the models ranked by F1, best first.
results_df.sort_values("F1_Score", ascending=False)


Unnamed: 0,Model,Accuracy,F1_Score
4,SVM,0.882169,0.885512
0,Logistic Regression,0.872784,0.877756
3,Gradient Boosting,0.868613,0.871951
2,Random Forest,0.860271,0.862986
1,Decision Tree,0.836288,0.838311


<small>

**Observations:**
- SVM achieved the highest Accuracy and F1-Score  
- Logistic Regression performed strongly as a baseline  
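- Ensemble models (Gradient Boosting, Random Forest) can capture non-linear patterns, but scored slightly below Logistic Regression on these two features  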

**Model Selection:**
- F1-Score was prioritized because it balances precision and recall (false positives vs. false negatives)  
- Final choice depends on trade-off between performance and interpretability  

</small>


**FINAL MODEL TRAINING & PREDICTIONS (DEPLOYMENT STEP)**

In [67]:
from sklearn.svm import SVC

# Refit the best-scoring model from the comparison (SVM) on the scaled
# training split.
final_model = SVC()
final_model.fit(X_train_scaled, y_train)

# Score every product. The features must pass through the scaler that was
# fitted on the training split; in this notebook that object is named
# `scalar`. The earlier reference to `scaler` was never defined and raised a
# NameError on a fresh kernel.
product_data["Predicted_Risk"] = final_model.predict(
    scalar.transform(product_data[["Total_Units", "Order_Count"]])
)


In [68]:
# Translate the 0/1 predictions into labels that read well in business reports
risk_names = {0: "Low Risk", 1: "High Risk"}
product_data["Risk_Category"] = product_data["Predicted_Risk"].map(risk_names)

In [69]:
# Sanity check: every numeric prediction should pair with exactly one label
product_data.value_counts(["Predicted_Risk", "Risk_Category"])

Predicted_Risk  Risk_Category
1               High Risk        1656
0               Low Risk         1538
Name: count, dtype: int64

In [70]:
# Directory the predictions are written to. Set the BUSINESS_DATA_DIR
# environment variable to run the notebook on another machine; when it is not
# set, the original author's local path is used.
OUTPUT_DIR = Path(
    os.environ.get(
        "BUSINESS_DATA_DIR",
        r"C:\Users\pavan\OneDrive\Desktop\AI_Assisted Business Performance And Risk Prediction System\data\processed",
    )
)

# Persist the per-product features, labels and predictions without the
# DataFrame index.
product_data.to_csv(
    OUTPUT_DIR / "product_risk_predictions.csv",
    index=False
)