In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, roc_auc_score


# Load the master sales extract (semicolon-delimited, comma as decimal mark)
# and convert the 'ds' column to datetimes.
df = pd.read_csv(
    'cyrene_final_master_for_pbi.csv',
    sep=';',
    decimal=',',
)
df['ds'] = pd.to_datetime(df['ds'])

# The individual-customer segment is labelled 'B2C_Individual' when that value
# occurs in Customer_Type; otherwise the label 'Individual' is used.
if df['Customer_Type'].eq('B2C_Individual').any():
    b2c_label = 'B2C_Individual'
else:
    b2c_label = 'Individual'

# Independent copy, so columns added later do not touch `df`.
df_b2c = df.loc[df['Customer_Type'].eq(b2c_label)].copy()


# Reference date for recency: the latest 'ds' value in the B2C rows.
snapshot_date = df_b2c['ds'].max()

# Per-customer behavioural summary (pure behaviours only)
cust_base = df_b2c.groupby('Customer_Key').agg(
    # Recency: days elapsed between the snapshot date and the customer's last visit
    Recency=('ds', lambda dates: (snapshot_date - dates.max()).days),
    # Frequency: shopping frequency (count of non-null Order_Quantity rows)
    Frequency=('Order_Quantity', 'count'),
    # Monetary: sum of y (used only to build the target)
    Monetary=('y', 'sum'),
    # Diversity: product variety (number of distinct sub-categories)
    Diversity=('Sub_Category', 'nunique'),
)

# Discount sensitivity
df_b2c['Discount_Amt'] = df_b2c['List_Price'] - df_b2c['Unit_Price']

# Denominator for the ratio: the list price, replaced by 1 wherever it is not
# strictly positive (zero, negative or missing) so the division never hits 0.
# Series.where does this in a single vectorised pass; the previous row-wise
# DataFrame.apply(axis=1) invoked a Python lambda for every transaction row.
safe_list_price = df_b2c['List_Price'].where(df_b2c['List_Price'] > 0, 1)
df_b2c['Discount_Ratio'] = df_b2c['Discount_Amt'] / safe_list_price

# Mean discount ratio per customer, exposed as 'Avg_Discount'.
discount_features = (
    df_b2c.groupby('Customer_Key')['Discount_Ratio']
    .mean()
    .reset_index()
    .rename(columns={'Discount_Ratio': 'Avg_Discount'})
)

# Weekend tendency: flag rows whose Day_Name is 'Saturday' or 'Sunday' as 1,
# everything else as 0.
df_b2c['Is_Weekend'] = (
    df_b2c['Day_Name']
    .isin(['Saturday', 'Sunday'])
    .astype(int)
)

# Per-customer mean of the flag, i.e. the share of rows flagged as weekend.
weekend_pref = (
    df_b2c.groupby('Customer_Key')['Is_Weekend']
    .mean()
    .rename('Weekend_Rate')
    .reset_index()
)


# Assemble one row per customer by joining the feature tables on Customer_Key
# (DataFrame.merge with its default join type).
customer_features = cust_base.reset_index()
customer_features = customer_features.merge(discount_features, on='Customer_Key')
customer_features = customer_features.merge(weekend_pref, on='Customer_Key')

# Binary target: 1 when Monetary is strictly above its 70th percentile, else 0.
threshold = customer_features['Monetary'].quantile(0.70)
customer_features['Is_High_Value'] = (
    customer_features['Monetary'].gt(threshold).astype(int)
)


# Model inputs; Monetary is left out because the target is derived from it.
features = [
    'Recency',
    'Frequency',
    'Diversity',
    'Avg_Discount',
    'Weekend_Rate',
]
X = customer_features[features].fillna(0)  # missing feature values become 0
y = customer_features['Is_High_Value']

# 80/20 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


# Depth-limited random forest with balanced class weights, fitted on the
# training split.
clf = RandomForestClassifier(
    n_estimators=100,
    max_depth=5,
    random_state=42,
    class_weight='balanced',
)
clf.fit(X_train, y_train)

# Hold-out ROC-AUC computed from the second predict_proba column.
y_prob = clf.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, y_prob)
print(f" Yeni Model Başarı Skoru (ROC-AUC): {test_auc:.4f}")

# Feature importances, largest first.
importance_ranking = pd.Series(clf.feature_importances_, index=features).sort_values(ascending=False)
print("\n Karar Faktörleri (Şimdi Daha Mantıklı):\n", importance_ranking)

# Score every row of X (training rows included) and export the customer key
# together with the predicted probability and label.
customer_features['Value_Probability'] = clf.predict_proba(X)[:, 1]
customer_features['Value_Prediction'] = clf.predict(X)
export_columns = ['Customer_Key', 'Value_Probability', 'Value_Prediction']
customer_features[export_columns].to_csv(
    'b2c_ai_insights_v3.csv',
    index=False,
    sep=';',
    decimal=',',
)

 Yeni Model Başarı Skoru (ROC-AUC): 0.8622

 Karar Faktörleri (Şimdi Daha Mantıklı):
 Avg_Discount    0.363700
Diversity       0.317153
Frequency       0.165423
Recency         0.116787
Weekend_Rate    0.036937
dtype: float64


In [3]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.preprocessing import StandardScaler


# Read the master extract (';' separator, ',' decimal mark) and parse 'ds'
# into datetimes.
df = pd.read_csv(
    'cyrene_final_master_for_pbi.csv',
    sep=';',
    decimal=',',
)
df['ds'] = pd.to_datetime(df['ds'])

# Use 'B2C_Individual' as the segment label when it is present in
# Customer_Type, otherwise 'Individual'.
if df['Customer_Type'].eq('B2C_Individual').any():
    b2c_label = 'B2C_Individual'
else:
    b2c_label = 'Individual'

# Copy the matching rows so later column assignments leave `df` untouched.
df_b2c = df.loc[df['Customer_Type'].eq(b2c_label)].copy()


# Latest 'ds' value in the B2C rows; recency is measured back from this date.
snapshot_date = df_b2c['ds'].max()

# One row per Customer_Key with the four behavioural aggregates.
cust_base = df_b2c.groupby('Customer_Key').agg(
    # Recency: days since the customer's last visit
    Recency=('ds', lambda dates: (snapshot_date - dates.max()).days),
    # Frequency: shopping frequency (count of non-null Order_Quantity rows)
    Frequency=('Order_Quantity', 'count'),
    # Monetary: total of y (used for the target)
    Monetary=('y', 'sum'),
    # Diversity: product variety (distinct sub-categories)
    Diversity=('Sub_Category', 'nunique'),
)


# Per-row discount amount: list price minus the unit price actually charged.
df_b2c['Discount_Amt'] = df_b2c['List_Price'] - df_b2c['Unit_Price']

# Divide by the list price, substituting 1 wherever it is not strictly
# positive (zero, negative or missing) to avoid dividing by zero. Series.where
# builds that denominator in one vectorised operation; the earlier
# DataFrame.apply(axis=1) ran a Python lambda once per transaction row.
safe_list_price = df_b2c['List_Price'].where(df_b2c['List_Price'] > 0, 1)
df_b2c['Discount_Ratio'] = df_b2c['Discount_Amt'] / safe_list_price

# Mean discount ratio per customer, exposed as 'Avg_Discount'.
discount_features = (
    df_b2c.groupby('Customer_Key')['Discount_Ratio']
    .mean()
    .reset_index()
    .rename(columns={'Discount_Ratio': 'Avg_Discount'})
)


# 1 when Day_Name is 'Saturday' or 'Sunday', otherwise 0.
df_b2c['Is_Weekend'] = (
    df_b2c['Day_Name']
    .isin(['Saturday', 'Sunday'])
    .astype(int)
)

# Per-customer mean of that flag, published as 'Weekend_Rate'.
weekend_pref = (
    df_b2c.groupby('Customer_Key')['Is_Weekend']
    .mean()
    .rename('Weekend_Rate')
    .reset_index()
)


# Join the three per-customer tables on Customer_Key (default merge type).
customer_features = cust_base.reset_index()
customer_features = customer_features.merge(discount_features, on='Customer_Key')
customer_features = customer_features.merge(weekend_pref, on='Customer_Key')

# Label customers whose Monetary value is strictly above the 70th percentile.
threshold = customer_features['Monetary'].quantile(0.70)
customer_features['Is_High_Value'] = (
    customer_features['Monetary'].gt(threshold).astype(int)
)


# Predictor columns; missing values are replaced with 0 before modelling.
features = [
    'Recency',
    'Frequency',
    'Diversity',
    'Avg_Discount',
    'Weekend_Rate',
]
X = customer_features[features].fillna(0)
y = customer_features['Is_High_Value']

# Stratified 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

def evaluate_model(name, model, X_t, y_t):
    """Score a fitted classifier on an evaluation set.

    Args:
        name: Display label stored under the "Model" key.
        model: Fitted estimator exposing ``predict`` and ``predict_proba``.
        X_t: Feature matrix to evaluate on.
        y_t: True labels aligned with ``X_t``.

    Returns:
        dict with the keys "Model", "Accuracy", "Precision", "Recall",
        "F1-Score" and "ROC-AUC" (in that order). The first four metrics use
        the labels from ``model.predict``; ROC-AUC uses the second column of
        ``model.predict_proba``.
    """
    predicted_labels = model.predict(X_t)
    positive_scores = model.predict_proba(X_t)[:, 1]

    # Metrics computed from hard label predictions, in report order.
    label_metrics = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1-Score", f1_score),
    )

    report = {"Model": name}
    for metric_name, metric_fn in label_metrics:
        report[metric_name] = metric_fn(y_t, predicted_labels)
    # ROC-AUC is threshold-free, so it is fed the scores rather than the labels.
    report["ROC-AUC"] = roc_auc_score(y_t, positive_scores)
    return report

# Collected metric dictionaries, one per evaluated model.
results = []

# Standardise features for logistic regression: the scaler is fitted on the
# training split and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

log_reg = LogisticRegression(class_weight='balanced', random_state=42)
gb = GradientBoostingClassifier(random_state=42)
rf_base = RandomForestClassifier(class_weight='balanced', random_state=42)

# (report label, estimator, training inputs, evaluation inputs); only the
# logistic regression sees the scaled matrices. Models are fitted and scored
# in the listed order.
baseline_runs = [
    ("Logistic Regression (Baseline)", log_reg, X_train_scaled, X_test_scaled),
    ("Gradient Boosting", gb, X_train, X_test),
    ("Random Forest (Base)", rf_base, X_train, X_test),
]
for run_label, run_model, run_train, run_test in baseline_runs:
    run_model.fit(run_train, y_train)
    results.append(evaluate_model(run_label, run_model, run_test, y_test))


# Hyper-parameter grid for the random forest (2 x 2 x 2 combinations).
param_grid = dict(
    n_estimators=[100, 150],
    max_depth=[5, 10],
    min_samples_leaf=[1, 2],
)

# 3-fold cross-validated search on the training split, ranked by ROC-AUC.
grid = GridSearchCV(
    RandomForestClassifier(class_weight='balanced', random_state=42),
    param_grid,
    cv=3,
    scoring='roc_auc',
)
grid.fit(X_train, y_train)

# Evaluate the best estimator found by the search on the held-out test split.
best_rf = grid.best_estimator_
results.append(
    evaluate_model("Random Forest (Optimized)", best_rf, X_test, y_test)
)

# Banner followed by the model comparison table and the winning grid parameters.
banner_rule = "=" * 60
print("\n" + banner_rule)
print("             MODEL PERFORMANS KIYASLAMASI")
print(banner_rule)

results_df = pd.DataFrame(results)
comparison_table = results_df.to_string(index=False)
print(comparison_table)
print("\nEn İyi Random Forest Parametreleri:", grid.best_params_)




# Score every row of X (training rows included) with the tuned forest.
customer_features['Value_Probability'] = best_rf.predict_proba(X)[:, 1]
customer_features['Value_Prediction'] = best_rf.predict(X)

# Write the full customer table (features, target and predictions) using
# ';' as field separator and ',' as decimal mark.
customer_features.to_csv('b2c_ai_final_results.csv', index=False, sep=';', decimal=',')

print("\n Veriler düzeldi! Tüm kolonlar (RFM + Tahminler) artık doğru formatta.")



             MODEL PERFORMANS KIYASLAMASI
                         Model  Accuracy  Precision   Recall  F1-Score  ROC-AUC
Logistic Regression (Baseline)  0.767108   0.604770 0.641373  0.622534 0.814227
             Gradient Boosting  0.833919   0.794504 0.600723  0.684156 0.880135
          Random Forest (Base)  0.781174   0.632562 0.642276  0.637382 0.824677
     Random Forest (Optimized)  0.813903   0.666402 0.757904  0.709214 0.883223

En İyi Random Forest Parametreleri: {'max_depth': 10, 'min_samples_leaf': 2, 'n_estimators': 150}

✅ Veriler düzeldi! Tüm kolonlar (RFM + Tahminler) artık doğru formatta.


In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, accuracy_score, confusion_matrix
import matplotlib.pyplot as plt


# Load the master extract (';' separator, ',' decimal mark); 'ds' becomes datetime.
df = pd.read_csv(
    'cyrene_final_master_for_pbi.csv',
    sep=';',
    decimal=',',
)
df['ds'] = pd.to_datetime(df['ds'])

# Segment label: 'B2C_Individual' when present in Customer_Type, else 'Individual'.
if df['Customer_Type'].eq('B2C_Individual').any():
    b2c_label = 'B2C_Individual'
else:
    b2c_label = 'Individual'

# Independent copy of the matching rows.
df_b2c = df.loc[df['Customer_Type'].eq(b2c_label)].copy()

# Latest 'ds' value in the B2C rows, used as the recency reference date.
snapshot_date = df_b2c['ds'].max()

# Per-customer aggregates, one row per Customer_Key.
cust_features = df_b2c.groupby('Customer_Key').agg(
    # days between the snapshot date and the customer's latest 'ds'
    Recency=('ds', lambda dates: (snapshot_date - dates.max()).days),
    # count of non-null Order_Quantity rows
    Frequency=('Order_Quantity', 'count'),
    # sum of y
    Monetary=('y', 'sum'),
    # number of distinct Sub_Category values
    Diversity=('Sub_Category', 'nunique'),
)

# Per-row discount amount: list price minus unit price.
df_b2c['Discount_Amt'] = df_b2c['List_Price'] - df_b2c['Unit_Price']

# Ratio denominator: the list price, or 1 wherever the list price is not
# strictly positive (zero, negative or missing), so no division by zero occurs.
# Series.where computes this in one vectorised pass instead of calling a Python
# lambda for each row through DataFrame.apply(axis=1).
safe_list_price = df_b2c['List_Price'].where(df_b2c['List_Price'] > 0, 1)
df_b2c['Discount_Ratio'] = df_b2c['Discount_Amt'] / safe_list_price

# Mean discount ratio per customer, exposed as 'Avg_Discount'.
discount = (
    df_b2c.groupby('Customer_Key')['Discount_Ratio']
    .mean()
    .reset_index()
    .rename(columns={'Discount_Ratio': 'Avg_Discount'})
)

# Attach the average discount to the per-customer aggregates (default merge
# type, keyed on Customer_Key).
customer_features = pd.merge(
    cust_features.reset_index(),
    discount,
    on='Customer_Key',
)

# Target: 1 when Monetary is strictly above its 70th percentile, else 0.
threshold_val = customer_features['Monetary'].quantile(0.70)
customer_features['Is_High_Value'] = (
    customer_features['Monetary'].gt(threshold_val).astype(int)
)


# Four predictors are used in this cell (no Weekend_Rate); NaNs become 0.
features = [
    'Recency',
    'Frequency',
    'Diversity',
    'Avg_Discount',
]
X = customer_features[features].fillna(0)
y = customer_features['Is_High_Value']

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


# Random forest with fixed hyper-parameters and balanced class weights,
# fitted on the training split.
rf_params = dict(
    n_estimators=150,
    max_depth=10,
    min_samples_leaf=2,
    class_weight='balanced',
    random_state=42,
)
model = RandomForestClassifier(**rf_params).fit(X_train, y_train)

# Second predict_proba column for the test split; thresholded below.
y_prob = model.predict_proba(X_test)[:, 1]

# Decision thresholds applied to the predicted probabilities.
thresholds = [0.3, 0.4, 0.5, 0.6, 0.7]

# Table header: four left-aligned, 10-character-wide columns.
header_cells = ('Eşik', 'Precision', 'Recall', 'Accuracy')
print(" | ".join(f"{cell:<10}" for cell in header_cells))
print("-" * 50)

row_template = "{:<10} | {:<10.4f} | {:<10.4f} | {:<10.4f}"

results_list = []
for t in thresholds:
    # Predict the positive class when the probability reaches the threshold.
    y_pred = (y_prob >= t).astype(int)

    p = precision_score(y_test, y_pred)
    r = recall_score(y_test, y_pred)
    a = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)

    print(row_template.format(t, p, r, a))
    # Accuracy is printed above but not stored with the other values.
    results_list.append(dict(Threshold=t, Precision=p, Recall=r, CM=cm))



Eşik       | Precision  | Recall     | Accuracy  
--------------------------------------------------
0.3        | 0.4803     | 0.8799     | 0.6789    
0.4        | 0.5450     | 0.7986     | 0.7401    
0.5        | 0.6395     | 0.7082     | 0.7931    
0.6        | 0.7289     | 0.6242     | 0.8180    
0.7        | 0.7892     | 0.5411     | 0.8193    
