# Cell 1 - Markdown
"""
# Modèle de Propension à l'Achat
## AnyCompany Food & Beverage - Phase 3 ML

**Objectif** : Prédire la probabilité qu'un client effectue un achat dans les 30 prochains jours

**Approche** : Classification binaire avec :
- Régression Logistique
- Random Forest
- XGBoost
"""

In [None]:
# Cell 2 - Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import snowflake.connector
import warnings
# Silence every warning category for the rest of the session so that library
# notices do not clutter the notebook output.
warnings.filterwarnings('ignore')

# Plot defaults shared by every figure below: seaborn grid theme and a wide
# default canvas.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("âœ… BibliothÃ¨ques importÃ©es")


In [None]:
# Cell 3 - Connection and extraction
#
# Builds one row per customer with behavioural features and the binary label
# PURCHASED_RECENTLY, and loads the result into the DataFrame `df`.
import getpass
import os

# Snowflake connection.
# Settings are read from the environment so that no secret is stored in the
# notebook. Non-secret identifiers fall back to the lab defaults; the password
# has no default and is prompted for when SNOWFLAKE_PASSWORD is not set.
# NOTE(review): a password was previously committed in this cell - it should be
# rotated, since it remains visible in version-control history.
conn = snowflake.connector.connect(
    user=os.environ.get("SNOWFLAKE_USER", "THANDIE"),
    password=os.environ.get("SNOWFLAKE_PASSWORD") or getpass.getpass("Snowflake password: "),
    account=os.environ.get("SNOWFLAKE_ACCOUNT", "MWYCFSC-YKB13542"),
    warehouse=os.environ.get("SNOWFLAKE_WAREHOUSE", "ANYCOMPANY_WH"),
    database=os.environ.get("SNOWFLAKE_DATABASE", "ANYCOMPANY_LAB"),
    schema=os.environ.get("SNOWFLAKE_SCHEMA", "SILVER"),
)

# Data extraction.
# The query separates an observation period from the prediction window:
#   * features are aggregated over sales strictly BEFORE the cutoff
#     (today - 30 days);
#   * the label is 1 when the customer has at least one sale ON OR AFTER the
#     cutoff.
# Computing both from the same transactions would leak the label into the
# features (days_since_last_purchase <= 30 would be equivalent to the label).
# Customers need at least two sales in the observation period to be kept.
query = """
WITH sales AS (
    -- One row per sale. Several promotions may overlap a sale's region and
    -- date, so the promotion join is collapsed to a 0/1 flag here; otherwise
    -- the duplicated rows would inflate SUM/AVG of amount downstream.
    -- Assumes each transaction_id is a single row of the source table
    -- (TODO confirm against the SILVER schema).
    SELECT
        ft.customer_id,
        ft.transaction_id,
        ft.transaction_date,
        ft.amount,
        MAX(CASE WHEN p.promotion_id IS NOT NULL THEN 1 ELSE 0 END) AS in_promo
    FROM SILVER.financial_transactions_clean ft
    LEFT JOIN SILVER.promotions_clean p
        ON ft.region = p.region
        AND ft.transaction_date BETWEEN p.start_date AND p.end_date
    WHERE ft.transaction_type = 'Sale'
    GROUP BY ft.customer_id, ft.transaction_id, ft.transaction_date, ft.amount
),
history AS (
    -- Observation period: everything before the 30-day prediction window
    SELECT *
    FROM sales
    WHERE transaction_date < DATEADD(day, -30, CURRENT_DATE())
),
outcome AS (
    -- Prediction window: customers with at least one sale in the last 30 days
    SELECT DISTINCT customer_id
    FROM sales
    WHERE transaction_date >= DATEADD(day, -30, CURRENT_DATE())
),
customer_features AS (
    SELECT
        cd.customer_id,
        cd.age,
        cd.annual_income,
        cd.gender,
        cd.region,
        -- Behavioural features (observation period only)
        COUNT(DISTINCT h.transaction_id) AS total_purchases,
        SUM(h.amount) AS lifetime_value,
        ROUND(AVG(h.amount), 2) AS avg_order_value,
        DATEDIFF(day, MAX(h.transaction_date), DATEADD(day, -30, CURRENT_DATE())) AS days_since_last_purchase,
        -- Promotion engagement
        COUNT(DISTINCT CASE WHEN h.in_promo = 1 THEN h.transaction_id END) AS promo_purchases,
        -- Target: at least one sale inside the prediction window
        MAX(CASE WHEN o.customer_id IS NOT NULL THEN 1 ELSE 0 END) AS purchased_recently
    FROM SILVER.customer_demographics_clean cd
    JOIN history h ON cd.customer_id = h.customer_id
    LEFT JOIN outcome o ON cd.customer_id = o.customer_id
    GROUP BY cd.customer_id, cd.age, cd.annual_income, cd.gender, cd.region
    HAVING COUNT(DISTINCT h.transaction_id) >= 2
)
SELECT * FROM customer_features
-- Deterministic ordering so LIMIT returns the same sample on every run
ORDER BY customer_id
LIMIT 10000
"""

# Close the connection even when the query fails.
try:
    df = pd.read_sql(query, conn)
finally:
    conn.close()

print(f"ðŸ“Š {len(df)} clients extraits")
print(f"Target distribution : {df['PURCHASED_RECENTLY'].value_counts().to_dict()}")
df.head()

In [None]:
# Cell 4 - Data preparation
# One-hot encode the categorical columns; the first level of each is dropped.
categorical_cols = ['GENDER', 'REGION']
df_encoded = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

# Every column except the customer identifier and the label is a model input.
excluded_cols = {'CUSTOMER_ID', 'PURCHASED_RECENTLY'}
feature_cols = [name for name in df_encoded.columns if name not in excluded_cols]
X = df_encoded[feature_cols]
y = df_encoded['PURCHASED_RECENTLY']

# 70/30 train/test split, stratified on the label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

# Standardisation: the scaler is fitted on the training fold only and then
# applied to both folds.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

print(f"âœ… DonnÃ©es prÃ©parÃ©es")
print(f"  Train: {X_train.shape}")
print(f"  Test: {X_test.shape}")
print(f"  Features: {len(feature_cols)}")


In [None]:
# Cell 5 - Model 1: Logistic Regression
print("ðŸ”„ EntraÃ®nement RÃ©gression Logistique...")

# Trained on the standardised features produced in the preparation cell.
lr_model = LogisticRegression(max_iter=1000, random_state=42)
lr_model.fit(X_train_scaled, y_train)

# Predictions: hard labels, plus the second probability column, which is the
# score later passed to the ROC metrics.
y_pred_lr = lr_model.predict(X_test_scaled)
lr_probabilities = lr_model.predict_proba(X_test_scaled)
y_pred_proba_lr = lr_probabilities[:, 1]

# Evaluation on the held-out fold
print("\nðŸ“Š RÃ‰GRESSION LOGISTIQUE - RÃ©sultats")
print("=" * 60)
print(classification_report(y_test, y_pred_lr))
lr_auc = roc_auc_score(y_test, y_pred_proba_lr)
print(f"ROC-AUC Score: {lr_auc:.4f}")


In [None]:
# Cell 6 - Model 2: Random Forest
print("ðŸ”„ EntraÃ®nement Random Forest...")

# Trained on the unscaled features: 100 trees, depth capped at 10, fixed seed.
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf_model.fit(X_train, y_train)

# Predictions: hard labels, plus the second probability column, which is the
# score later passed to the ROC metrics.
y_pred_rf = rf_model.predict(X_test)
rf_probabilities = rf_model.predict_proba(X_test)
y_pred_proba_rf = rf_probabilities[:, 1]

# Evaluation on the held-out fold
print("\nðŸ“Š RANDOM FOREST - RÃ©sultats")
print("=" * 60)
print(classification_report(y_test, y_pred_rf))
rf_auc = roc_auc_score(y_test, y_pred_proba_rf)
print(f"ROC-AUC Score: {rf_auc:.4f}")


In [None]:
# Cell 7 - Model comparison
# ROC curves of both models on the same axes, with the AUC in each legend entry.
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_pred_proba_lr)
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_pred_proba_rf)

# (legend name, false-positive rates, true-positive rates, scores) per model
roc_series = (
    ('Logistic Regression', fpr_lr, tpr_lr, y_pred_proba_lr),
    ('Random Forest', fpr_rf, tpr_rf, y_pred_proba_rf),
)

plt.figure(figsize=(10, 6))
for model_name, fpr, tpr, scores in roc_series:
    auc_value = roc_auc_score(y_test, scores)
    plt.plot(fpr, tpr, label=f'{model_name} (AUC = {auc_value:.4f})', linewidth=2)

# Diagonal reference line: the curve of a random classifier
plt.plot([0, 1], [0, 1], 'k--', label='AlÃ©atoire')

plt.title('Courbes ROC - Comparaison des ModÃ¨les')
plt.xlabel('Taux de Faux Positifs')
plt.ylabel('Taux de Vrais Positifs')
plt.legend()
plt.grid(True, alpha=0.3)
plt.show()


In [None]:
# Cell 8 - Feature importance
# Importances reported by the fitted Random Forest, one row per input column,
# sorted from most to least important.
importance_table = pd.DataFrame({
    'Feature': feature_cols,
    'Importance': rf_model.feature_importances_,
})
feature_importance = importance_table.sort_values('Importance', ascending=False)

# The ten highest-ranked features drive both the chart and the printout.
top_features = feature_importance.head(10)

plt.figure(figsize=(12, 6))
plt.barh(top_features['Feature'], top_features['Importance'])
plt.xlabel('Importance')
plt.title('Top 10 Features les Plus Importantes')
# Flip the y axis so the most important feature is drawn at the top
plt.gca().invert_yaxis()
plt.grid(axis='x', alpha=0.3)
plt.show()

print("\nðŸ“Š Top 10 Features :")
print(top_features)


In [None]:
# Cell 9 - Confusion matrices
# Side-by-side heatmaps of the test-set confusion matrices of both models.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

cm_lr = confusion_matrix(y_test, y_pred_lr)
cm_rf = confusion_matrix(y_test, y_pred_rf)

# (axes, matrix, colour map, panel title) for each model
panels = (
    (ax1, cm_lr, 'Blues', 'Matrice de Confusion - RÃ©gression Logistique'),
    (ax2, cm_rf, 'Greens', 'Matrice de Confusion - Random Forest'),
)

for axis, matrix, palette, title in panels:
    # Integer annotations; the labels below name rows as the true class and
    # columns as the predicted class
    sns.heatmap(matrix, annot=True, fmt='d', cmap=palette, ax=axis)
    axis.set_title(title)
    axis.set_ylabel('Vraie Classe')
    axis.set_xlabel('Classe PrÃ©dite')

plt.tight_layout()
plt.show()


In [None]:
# Cell 10 - Recommandations
"""
## 💼 RECOMMANDATIONS BUSINESS

### Utilisation du Modèle

Le modèle de propension à l'achat peut être utilisé pour :

1. **🎯 Ciblage Marketing Prédictif**
   - Identifier les clients à forte probabilité d'achat
   - Personnaliser les campagnes par scoring

2. **📧 Automation Email**
   - Déclencher emails automatiques pour clients >70% probabilité
   - Offres spéciales pour clients 40-70%

3. **💰 Optimisation Budget**
   - Concentrer budget sur clients à fort potentiel
   - Réduire coûts d'acquisition

### Prochaines Étapes

✅ Déployer le modèle en production  
✅ Automatiser le scoring mensuel  
✅ Tests A/B pour mesurer l'impact  
✅ Enrichir avec données comportementales additionnelles
"""