In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [4]:
# Load the dataset
file_path = "/mnt/data/Ecommerce.csv"
df = pd.read_csv(r"D:\Masai\Elevate\Solo Project\Ecommerce.csv")

In [5]:
# Convert InvoiceDate to datetime
df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"])

# Sort data by CustomerID and InvoiceDate
df = df.sort_values(by=["CustomerID", "InvoiceDate"])

In [6]:
# Create target variable: Repeat Purchase within 30 days
df["NextPurchaseDate"] = df.groupby("CustomerID")["InvoiceDate"].shift(-1)
df["DaysBetween"] = (df["NextPurchaseDate"] - df["InvoiceDate"]).dt.days
df["RepeatPurchase"] = (df["DaysBetween"] <= 30).astype(int)
df = df.drop(columns=["NextPurchaseDate", "DaysBetween"])

In [7]:
# Feature Engineering
df["Recency"] = df.groupby("CustomerID")["InvoiceDate"].transform(lambda x: (x.max() - x).dt.days)
df["Frequency"] = df.groupby("CustomerID")["InvoiceNo"].transform("nunique")
df["MonetaryValue"] = df.groupby("CustomerID")["Total sales"].transform("sum")
df["AvgBasketSize"] = df.groupby("CustomerID")["Quantity"].transform("mean")
df["LastPurchaseAmount"] = df.groupby("CustomerID")["Total sales"].transform("last")

In [8]:
# Selecting relevant features
features = ["Recency", "Frequency", "MonetaryValue", "AvgBasketSize", "LastPurchaseAmount"]
target = "RepeatPurchase"
X = df[features]
y = df[target]

In [9]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [10]:
# Initialize models
log_reg = LogisticRegression(max_iter=1000, random_state=42)
rf = RandomForestClassifier(n_estimators=50, random_state=42)

In [11]:
# Train models
log_reg.fit(X_train, y_train)
rf.fit(X_train, y_train)

In [12]:
# Make predictions
y_pred_log = log_reg.predict(X_test)
y_pred_rf = rf.predict(X_test)

In [13]:
# Evaluation function
def evaluate_model(y_true, y_pred, model_name):
    return {
        "Model": model_name,
        "Accuracy": accuracy_score(y_true, y_pred),
        "Precision": precision_score(y_true, y_pred),
        "Recall": recall_score(y_true, y_pred),
        "F1-Score": f1_score(y_true, y_pred),
        "ROC-AUC": roc_auc_score(y_true, y_pred)
    }

In [14]:
# Evaluate models
results_df = pd.DataFrame([
    evaluate_model(y_test, y_pred_log, "Logistic Regression"),
    evaluate_model(y_test, y_pred_rf, "Random Forest")
])

# Display results
print(results_df)

                 Model  Accuracy  Precision    Recall  F1-Score   ROC-AUC
0  Logistic Regression  0.979417   0.979516  0.999895  0.989601  0.501945
1        Random Forest  0.978524   0.979764  0.998700  0.989141  0.508087


In [15]:
# Evaluate models
results_df = pd.DataFrame([
    evaluate_model(y_test, y_pred_log, "Logistic Regression"),
    evaluate_model(y_test, y_pred_rf, "Random Forest")
])

# Display results
print(results_df)

# Model Comparison
print("\nComparison between Logistic Regression and Random Forest:\n")
print(results_df.sort_values(by='Accuracy', ascending=False))


                 Model  Accuracy  Precision    Recall  F1-Score   ROC-AUC
0  Logistic Regression  0.979417   0.979516  0.999895  0.989601  0.501945
1        Random Forest  0.978524   0.979764  0.998700  0.989141  0.508087

Comparison between Logistic Regression and Random Forest:

                 Model  Accuracy  Precision    Recall  F1-Score   ROC-AUC
0  Logistic Regression  0.979417   0.979516  0.999895  0.989601  0.501945
1        Random Forest  0.978524   0.979764  0.998700  0.989141  0.508087
