In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
df = pd.read_csv("C:/Users/User/Downloads/counterfeit_products_renamed.csv")
df.shape

(5000, 27)

In [3]:
df.head(10)

Unnamed: 0,item_ref,vendor_code,product_type,manufacturer,cost_usd,vendor_score,feedback_count,image_qty,desc_chars,delivery_period,...,dispatch_loc,page_hits,sales_vol,saved_items,trust_seals,guarantee_term,wholesale_avail,payment_anomaly,geo_inconsistency,fraud_indicator
0,dcb12716,seller_5426,Electronics,DigitalPro,218.54,4.9,5291,7,220,7,...,JP,3454,99,23,2,21,True,False,True,False
1,d1f8b777,seller_4385,Luxury Goods,TechCrop,,3.2,29,2,179,41,...,CA,4127,252,88,0,9,False,False,False,True
2,9989f13a,seller_2528,Fashion,FashionForward,18.87,3.9,14,2,100,25,...,CN,3212,306,6,4,8,True,True,False,True
3,4e008120,seller_8099,Electronics,CircuitMaster,79.27,4.9,8533,4,976,10,...,CN,1488,7,46,2,13,False,False,False,False
4,933b65ba,seller_7873,Luxury Goods,PremiumCraft,225.98,3.8,7655,4,589,6,...,DE,171,9,5,3,13,False,False,False,False
5,52a410fa,seller_7863,Pharmaceuticals,BioMed Solutions,,4.0,7729,9,451,9,...,,347,23,12,0,14,False,False,False,False
6,59e1a27b,seller_6618,Pharmaceuticals,BeautyLu×,,1.0,8,2,178,25,...,GB,401,34,2,0,4,False,False,False,True
7,b8fa970d,seller_3454,Electronics,NaturalGl0w,186.0,3.0,27,1,121,25,...,FR,3655,215,36,2,11,False,False,False,True
8,3c4f5456,seller_1502,Fashion,TrendSetter,336.9,4.8,5156,5,812,3,...,DE,4503,358,26,0,14,False,False,False,False
9,f4068089,seller_7019,Fashion,ClassicWear,242.39,3.5,1584,5,774,7,...,IN,1767,150,14,2,3,False,False,,False


In [4]:
df.isnull().sum()

item_ref                0
vendor_code             0
product_type            0
manufacturer            0
cost_usd             1348
vendor_score            0
feedback_count          0
image_qty               0
desc_chars              0
delivery_period         0
typo_count              0
site_age                0
contact_valid           0
return_clarity          0
payment_options         0
post_timestamp          0
vendor_nation           0
dispatch_loc          747
page_hits               0
sales_vol               0
saved_items             0
trust_seals             0
guarantee_term          0
wholesale_avail         0
payment_anomaly         0
geo_inconsistency    1249
fraud_indicator         0
dtype: int64

In [6]:
for col in df.columns:
    if df[col].dtype == 'object':
        df[col] = df[col].fillna(df[col].mode()[0])   
    else:
        df[col] = df[col].fillna(df[col].median()) 
print("Total missing values after cleaning:", df.isnull().sum().sum())

Total missing values after cleaning: 0


In [7]:
le = LabelEncoder()
for col in df.select_dtypes(include=['object']).columns:
    df[col] = le.fit_transform(df[col])
df.head()

Unnamed: 0,item_ref,vendor_code,product_type,manufacturer,cost_usd,vendor_score,feedback_count,image_qty,desc_chars,delivery_period,...,dispatch_loc,page_hits,sales_vol,saved_items,trust_seals,guarantee_term,wholesale_avail,payment_anomaly,geo_inconsistency,fraud_indicator
0,4275,1849,2,10,218.54,4.9,5291,7,220,7,...,7,3454,99,23,2,21,True,False,True,False
1,4075,1409,4,31,191.815,3.2,29,2,179,41,...,1,4127,252,88,0,9,False,False,False,True
2,3009,652,3,15,18.87,3.9,14,2,100,25,...,2,3212,306,6,4,8,True,True,False,True
3,1540,3004,2,5,79.27,4.9,8533,4,976,10,...,2,1488,7,46,2,13,False,False,False,False
4,2886,2912,4,25,225.98,3.8,7655,4,589,6,...,3,171,9,5,3,13,False,False,False,False


In [8]:
X = df.drop('fraud_indicator', axis=1)
y = df['fraud_indicator']
X.shape
y.shape

(5000,)

In [9]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=y_temp)

In [10]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [11]:
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_scaled, y_train)
y_val_pred_lr = log_reg.predict(X_val_scaled)
print("Accuracy:", accuracy_score(y_val, y_val_pred_lr))
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred_lr))
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, y_val_pred_lr))

Accuracy: 1.0

Classification Report:
              precision    recall  f1-score   support

       False       1.00      1.00      1.00       529
        True       1.00      1.00      1.00       221

    accuracy                           1.00       750
   macro avg       1.00      1.00      1.00       750
weighted avg       1.00      1.00      1.00       750


Confusion Matrix:
[[529   0]
 [  0 221]]


In [12]:
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("RAccuracy:", accuracy_score(y_test, y_pred_rf))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))

Accuracy: 1.0

Classification Report:
              precision    recall  f1-score   support

       False       1.00      1.00      1.00       529
        True       1.00      1.00      1.00       221

    accuracy                           1.00       750
   macro avg       1.00      1.00      1.00       750
weighted avg       1.00      1.00      1.00       750


Confusion Matrix:
[[529   0]
 [  0 221]]


In [13]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
log_cv_scores = cross_val_score(log_reg, X, y, cv=cv, scoring="accuracy")
rf_cv_scores = cross_val_score(rf, X, y, cv=cv, scoring="accuracy")
print("LR CV Accuracy: %.3f ± %.3f" % (log_cv_scores.mean(), log_cv_scores.std()))
print("RF CV Accuracy: %.3f ± %.3f" % (rf_cv_scores.mean(), rf_cv_scores.std()))

LR CV Accuracy: 0.999 ± 0.000
RF CV Accuracy: 1.000 ± 0.000


In [16]:
accuracy_lr=accuracy_score(y_test, y_pred_log)
accuracy_rf=accuracy_score(y_test, y_pred_rf)
if accuracy_lr >= accuracy_rf:
    best_model=log_reg
    print("Logistic Regression selected")
else:
    best_model = rf
    print("Random Forest selected")

Logistic Regression selected


In [28]:
y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Test set accuracy: {test_accuracy:.3f}")
cm = confusion_matrix(y_test, y_test_pred)
print("\nConfusion Matrix:")
print(cm)
print("\nClassification Report:")
print(classification_report(y_test, y_test_pred, digits=3))

Test set accuracy: 1.000

Confusion Matrix:
[[706   0]
 [  0 294]]

Classification Report:
              precision    recall  f1-score   support

       False      1.000     1.000     1.000       706
        True      1.000     1.000     1.000       294

    accuracy                          1.000      1000
   macro avg      1.000     1.000     1.000      1000
weighted avg      1.000     1.000     1.000      1000



In [None]:
import joblib
joblib.dump(best_model, "counterfeit_best_model.joblib")