# Projet de Machine Learning : Détection de la Fraude Mobile Money

# 1. Problem Definition

## Objectif : Identifier les transactions suspectes dans les systèmes de paiement mobile (Orange Money, Inwi Money…)
## Jeu de données : Mobile Money Fraud Detection Dataset (IEEE / Zindi Africa Challenge)
## Source : https://www.kaggle.com/datasets/ealaxi/paysim1

### Importation des librairies

In [3]:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
#from google.colab import drive

##2. Data collection

In [4]:
#drive.mount('/content/drive')

# Chargement du dataset (après téléchargement depuis Zindi)
#file_path = '/content/drive/MyDrive/INPT/INE2/MachineLearning/Projet/datasets/paysim_2017.csv'
file_path = '../../paysim_2017.csv'
data = pd.read_csv(file_path)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


## 3. Data preparation

In [5]:
# Nettoyage et préparation des données
data = data.dropna()
data = data.drop_duplicates()

# Encodage des variables catégorielles
data = pd.get_dummies(data, columns=['type'], drop_first=True)

#supression de la variable isFlaggedFraud qui ne nous interesse pas
data.drop('isFlaggedFraud', axis=1)

Unnamed: 0,step,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
0,1,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00,0,False,False,True,False
1,1,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00,0,False,False,True,False
2,1,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00,1,False,False,False,True
3,1,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00,1,True,False,False,False
4,1,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00,0,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,339682.13,C786484425,339682.13,0.00,C776919290,0.00,339682.13,1,True,False,False,False
6362616,743,6311409.28,C1529008245,6311409.28,0.00,C1881841831,0.00,0.00,1,False,False,False,True
6362617,743,6311409.28,C1162922333,6311409.28,0.00,C1365125890,68488.84,6379898.11,1,True,False,False,False
6362618,743,850002.52,C1685995037,850002.52,0.00,C2080388513,0.00,0.00,1,False,False,False,True


## 4. Feature engineering

In [6]:
#vérification de la complétude des données
data.describe()

# Feature engineering simple
data['diffOrig'] = data['oldbalanceOrg'] - data['newbalanceOrig']
data['diffDest'] = data['newbalanceDest'] - data['oldbalanceDest']
data['orig_to_amount'] = data['oldbalanceOrg'] / (data['amount'] + 1)
data['dest_to_amount'] = data['oldbalanceDest'] / (data['amount'] + 1)
data['same_account'] = (data['nameOrig'] == data['nameDest']).astype(int)

#affichage de la nouvelle entête
data.head()



Unnamed: 0,step,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER,diffOrig,diffDest,orig_to_amount,dest_to_amount,same_account
0,1,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0,False,False,True,False,9839.64,0.0,17.289119,0.0,0
1,1,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0,False,False,True,False,1864.28,0.0,11.391855,0.0,0
2,1,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0,False,False,False,True,181.0,0.0,0.994505,0.0,0
3,1,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0,True,False,False,False,181.0,-21182.0,0.994505,116.384615,0
4,1,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0,False,False,True,False,11668.14,0.0,3.561016,0.0,0


## 5. Entraînement du modèle de base

In [7]:
#suppression des caractéristiques discrètes pour faciliter le RandomForest
data = data.drop(['nameOrig', 'nameDest'], axis=1)

# Séparation des features et de la cible
X = data.drop('isFraud', axis=1)
y = data['isFraud']

# Division du jeu de données
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270881
           1       1.00      1.00      1.00      1643

    accuracy                           1.00   1272524
   macro avg       1.00      1.00      1.00   1272524
weighted avg       1.00      1.00      1.00   1272524



## 5. Optimisation automatique des hyperparamètres (Optuna)

In [None]:
!pip install optuna
import optuna

def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 100, 500)
    max_depth = trial.suggest_int('max_depth', 3, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 5)

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    return accuracy_score(y_test, preds)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1)

print('Best parameters:', study.best_params)




  from .autonotebook import tqdm as notebook_tqdm
[I 2025-11-16 00:47:36,932] A new study created in memory with name: no-name-adef98cf-84e2-4514-b286-c751a978d212


In [None]:
#import json
#X_test.info()
#json_str= X_test.iloc[0].to_json()
#print (json_str)

## 6. Sauvegarde et déploiement du modèle (Flask + Docker)

In [None]:

import joblib
from flask import Flask, request, jsonify

# Sauvegarde du modèle
joblib.dump(rf, 'fraud_model_paysim2.pkl')

# Flask app
app = Flask(__name__)
model = joblib.load('fraud_model_paysim2.pkl')

@app.route('/predict', methods=['POST'])
def predict():
    data = request.get_json(force=True)
    df = pd.DataFrame([data])
    prediction = model.predict(df)
    return jsonify({'prediction': int(prediction[0])})

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)


NameError: name 'rf' is not defined

## 7. Test de l’API Flask déployée

In [None]:
#import requests

#url = 'http://127.0.0.1:5000/predict'
test_data = X_test.iloc[0].to_dict()
print (test_data)
#response = requests.post(url, json=test_data)
#print(response.json())


{'step': 300, 'amount': 890577.21, 'oldbalanceOrg': 218.0, 'newbalanceOrig': 0.0, 'oldbalanceDest': 0.0, 'newbalanceDest': 890577.21, 'isFlaggedFraud': 0, 'type_CASH_OUT': False, 'type_DEBIT': False, 'type_PAYMENT': False, 'type_TRANSFER': True, 'diffOrig': 218.0, 'diffDest': 890577.21, 'orig_to_amount': 0.0002447847898726379, 'dest_to_amount': 0.0, 'same_account': 0}


## 8. Dockerfile

In [None]:

# Contenu du Dockerfile
dockerfile = '''
FROM python:3.10-slim
WORKDIR /app
COPY . /app
RUN pip install -r requirements.txt
EXPOSE 5000
CMD ["python", "app.py"]
'''
with open('Dockerfile', 'w') as f:
    f.write(dockerfile)


## 9. requirements.txt

In [None]:

requirements = '''
pandas
numpy
scikit-learn
flask
joblib
optuna
requests
matplotlib
seaborn
'''
with open('requirements.txt', 'w') as f:
    f.write(requirements)
