# Modèle de Prédiction des Pertes par Utilisateur

Ce notebook contient le modèle qui prédit les pertes financières par utilisateur lors d'une cyberattaque.

## 1. Import des bibliothèques

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

## 2. Chargement des données

In [None]:
# Read the raw incident dataset from disk and report its dimensions.
print("üìÇ Chargement des donn√©es...")
data = pd.read_csv('kaggle_data.csv')
n_rows, n_cols = data.shape
print(f"Donn√©es charg√©es: {n_rows} lignes, {n_cols} colonnes\n")

## 3. Feature Engineering

### 3.1 Calcul de la variable cible

In [None]:
# Target variable: financial loss (converted from millions of dollars to
# dollars) divided by the number of affected users.
# Rows whose user count is zero, negative or missing would yield an infinite
# or NaN target, which the regressor cannot be fitted on, so they are excluded
# before the division.
affected_users = data['Number of Affected Users']
has_users = affected_users > 0
if not has_users.all():
    print(f"Attention: {int((~has_users).sum())} lignes exclues "
          "(nombre d'utilisateurs nul ou manquant)")
    data = data.loc[has_users].copy()

data['Loss_per_User'] = (data['Financial Loss (in Million $)'] * 1_000_000) / data['Number of Affected Users']
print(f"Pertes moyennes par utilisateur: {data['Loss_per_User'].mean():.2f} $")

### 3.2 Encodage des variables catégorielles

In [None]:
# Replace each categorical column with integer codes, keeping the fitted
# encoder for every column so the same mapping can be reused later.
categorical_features = [
    'Attack Type',
    'Target Industry',
    'Attack Source',
    'Security Vulnerability Type',
    'Defense Mechanism Used',
]
encoders = {}
for column in categorical_features:
    label_encoder = LabelEncoder()
    data[column] = label_encoder.fit_transform(data[column])
    encoders[column] = label_encoder

### 3.3 Création des features agrégées

In [None]:
# Target-encoding features: mean loss per user for each attack type, target
# industry and vulnerability type.
#
# The means are computed on the training rows only. Averaging over the whole
# dataset would fold the test rows' targets into their own input features and
# inflate the evaluation scores. The arguments below must mirror the
# train/test split used for training (test_size=0.2, random_state=42) so that
# the same rows are selected.
train_positions, _test_positions = train_test_split(
    np.arange(len(data)), test_size=0.2, random_state=42
)
train_rows = data.iloc[train_positions]

# Used for categories that never occur in the training rows.
fallback_loss = train_rows['Loss_per_User'].mean()

loss_per_user_by_attack = train_rows.groupby('Attack Type')['Loss_per_User'].mean()
loss_per_user_by_industry = train_rows.groupby('Target Industry')['Loss_per_User'].mean()
loss_per_user_by_vulnerability = train_rows.groupby('Security Vulnerability Type')['Loss_per_User'].mean()

# Attach the group means to every row (train and test alike).
data['Avg_Loss_by_Attack'] = data['Attack Type'].map(loss_per_user_by_attack).fillna(fallback_loss)
data['Avg_Loss_by_Industry'] = data['Target Industry'].map(loss_per_user_by_industry).fillna(fallback_loss)
data['Avg_Loss_by_Vulnerability'] = (
    data['Security Vulnerability Type'].map(loss_per_user_by_vulnerability).fillna(fallback_loss)
)

### 3.4 Normalisation des features numériques

In [None]:
# Columns that are standardised (zero mean, unit variance) in place.
numerical_features = [
    'Number of Affected Users',
    'Incident Resolution Time (in Hours)',
    'Avg_Loss_by_Attack',
    'Avg_Loss_by_Industry',
    'Avg_Loss_by_Vulnerability',
]

# Learn the scaling statistics, then overwrite the columns with scaled values.
scaler = StandardScaler()
scaler.fit(data[numerical_features])
data[numerical_features] = scaler.transform(data[numerical_features])

### 3.5 Création des features d'interaction

In [None]:
# Interaction features.
#
# The numeric columns were standardised earlier, so they now hold zero-mean
# z-scores. A ratio of two z-scores is not "hours per user": it flips sign and
# blows up whenever the denominator is close to zero. Recover the raw values
# from the fitted scaler before dividing.
raw_numeric = pd.DataFrame(
    scaler.inverse_transform(data[numerical_features]),
    columns=numerical_features,
    index=data.index,
)
data['Resolution_per_User'] = (
    raw_numeric['Incident Resolution Time (in Hours)'] / raw_numeric['Number of Affected Users']
)

# NOTE(review): these two products multiply LabelEncoder integer codes, whose
# numeric order carries no domain meaning — confirm this is intended.
data['Attack_Defense'] = data['Attack Type'] * data['Defense Mechanism Used']
data['Industry_Vulnerability'] = data['Target Industry'] * data['Security Vulnerability Type']

# Age of the incident relative to 2024, expressed in decades.
data['Time_Factor'] = (2024 - data['Year']) / 10

## 4. Préparation des données pour l'entraînement

In [None]:
# Assemble the model inputs: scaled numeric columns, label-encoded categorical
# columns, then the four interaction features.
selected_features = [
    *numerical_features,
    *categorical_features,
    'Resolution_per_User',
    'Attack_Defense',
    'Industry_Vulnerability',
    'Time_Factor',
]

# Feature matrix and regression target.
y = data['Loss_per_User']
X = data[selected_features]

# Hold out 20% of the rows for evaluation; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

## 5. Entraînement du modèle

In [None]:
print("ü§ñ Entra√Ænement du mod√®le...")

# Random-forest hyperparameters, gathered in one place.
forest_params = dict(
    n_estimators=300,
    max_depth=12,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    bootstrap=True,
    random_state=42,
    n_jobs=-1,
)
model = RandomForestRegressor(**forest_params)

# 5-fold cross-validated R² on the training split; reported as mean and
# twice the standard deviation.
print("\nüìä Validation crois√©e...")
cv_scores = cross_val_score(estimator=model, X=X_train, y=y_train, cv=5, scoring='r2')
print(f"Scores R¬≤ (validation crois√©e): {cv_scores.mean():.3f} (¬±{cv_scores.std()*2:.3f})")

# Fit the final model on the full training split.
model.fit(X_train, y_train)

## 6. Évaluation du modèle

In [None]:
# Score the fitted model on the held-out test split.
print("\nüìà √âvaluation du mod√®le...")
y_pred = model.predict(X_test)

# RMSE is the square root of the mean squared error.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("\nM√©triques de performance:")
print(f"RMSE: {rmse:.2f} $ par utilisateur")
print(f"MAE: {mae:.2f} $ par utilisateur")
print(f"R¬≤: {r2:.3f}")

## 7. Analyse des features importantes

In [None]:
# Pair every input column with the forest's importance score and rank them
# from most to least important.
feature_importance = pd.DataFrame(
    {
        'feature': selected_features,
        'importance': model.feature_importances_,
    }
).sort_values('importance', ascending=False)

print("\nüîç Top 10 features les plus importantes:")
print(feature_importance.head(10))

## 8. Test de prédiction

In [None]:
# Smoke test: predict the loss for the first row of the test split.
print("\nüí° Test de pr√©diction...")

# Keep the row as a one-row DataFrame so the forest receives the same column
# names it was fitted with (a list-wrapped Series drops them and triggers
# scikit-learn's feature-name warning).
example = X_test.iloc[[0]]
prediction = model.predict(example)[0]

# The forest's individual trees are queried with a plain array.
# The 5th-95th percentile range of their predictions measures how much the
# trees disagree; it is reported below as a 90% interval.
example_values = example.to_numpy()
tree_predictions = np.array([tree.predict(example_values)[0]
                             for tree in model.estimators_])
confidence_interval = np.percentile(tree_predictions, [5, 95])

print(f"\nüí∞ Perte par utilisateur pr√©dite: {prediction:.2f} $")
print(f"Intervalle de confiance (90%): [{confidence_interval[0]:.2f}, {confidence_interval[1]:.2f}] $")

## 9. Sauvegarde du modèle

In [None]:
# Bundle the fitted model with everything needed to rebuild its inputs
# (encoders, scaler, feature list, group-mean lookups) plus the evaluation
# results, and write the bundle to a single joblib file.
print("\nüíæ Sauvegarde du mod√®le...")

loss_mappings = {
    'attack': loss_per_user_by_attack,
    'industry': loss_per_user_by_industry,
    'vulnerability': loss_per_user_by_vulnerability,
}
cv_summary = {
    'mean': cv_scores.mean(),
    'std': cv_scores.std(),
}
metrics_summary = {
    'rmse': rmse,
    'mae': mae,
    'r2': r2,
    'cv_scores': cv_summary,
}
model_info = {
    'model': model,
    'encoders': encoders,
    'scaler': scaler,
    'selected_features': selected_features,
    'loss_per_user_mappings': loss_mappings,
    'feature_importance': feature_importance.to_dict(),
    'metrics': metrics_summary,
}

joblib.dump(model_info, 'loss_per_user_model.joblib')
print("‚úÖ Mod√®le sauvegard√©!")