# SA DATAGEN DATA ANALYSIS POC V2

#### =======================================================================

#### Guide d'exécution du code
##### 1: Créer un environnement virtuel
##### 2: Installer les requirements dans le fichier Requirements.txt
###### 2.1: Activer votre environnement virtuel
###### 2.2: Exécuter la commande ( pip install -r requirements.txt ) 

### ===============================================================

### Requirements à installer pour exécuter le code

In [3]:
!pip install faker





[notice] A new release of pip is available: 25.0.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
import random
from faker import Faker
from datetime import datetime, timedelta
import pandas as pd
import numpy as np

### Initialisation et Constantes

In [5]:
fake = Faker(['fr_FR'])

nombre_patients = 1200  # Increased number of patients

prenoms_male = ["Moussa", "Abdoulaye", "Samba", "Modou", "Lamine", "Bouna", "Sidy", "Pape", "Omar", "Malick"]
prenoms_female = ["Awa", "Mame", "Rokhaya", "Fama", "Ndeye Fatou", "Seynabou", "Bineta", "Maimouna", "Diarra", "Kine"]

noms_senegalais = ["Dieng", "Cisse", "Thiam", "Lo", "Mbaye", "Faye", "Toure", "Ka", "Camara", "Sall"]

list_pathologies = ["Diabète de type 2", "Hypertension artérielle", "Malaria", "Anémie falciforme", "Aucune", "Insuffisance rénale", "Bronchite"]
list_medocs = ["Insuline", "Amlodipine", "Artemether", "Hydroxyurée", "Aucun", "Paracétamol", "Salbutamol"]
list_activites = ["Repos", "Exercice léger", "Travail physique", "Sommeil", "Marche rapide"]
list_niveau_activite = ["Sédentaire", "Actif", "Très actif"]
list_regions = ["Dakar", "Thiès", "Saint-Louis", "Ziguinchor", "Kaolack", "Fatick"]

### Génération des données

In [6]:
data = []

for i in range(1, nombre_patients+1):
    sexe = random.choice(["Homme", "Femme"])
    
    if sexe == "Homme":
        prenom = random.choice(prenoms_male)
    else:
        prenom = random.choice(prenoms_female)
    nom_complet = f"{prenom} {random.choice(noms_senegalais)}"
    
    poids = round(random.uniform(45, 110), 1)  # Wider weight range
    taille = round(random.uniform(145, 205), 1)  # Wider height range
    imc = round(poids / (taille/100)**2, 1)
    
    age = random.randint(15, 85)  # Adjusted age range
    
    # More realistic heart rate and blood pressure based on age and pathology
    if age > 60:
        frequence_cardiaque = random.randint(60, 100)
        pression_systolique = random.randint(110, 160)
        pression_diastolique = random.randint(70, 95)
    else:
        frequence_cardiaque = random.randint(55, 110)
        pression_systolique = random.randint(100, 145)
        pression_diastolique = random.randint(60, 90)
    
    data.append({
        "patient_id": f'PAT{i:04d}',  # Different ID format
        "nom_patient": nom_complet,
        "age": age,
        "sexe": sexe,
        "poids_kg": poids,
        "taille_cm": taille,
        "imc": imc,
        "frequence_cardiaque_bpm": frequence_cardiaque,
        "pression_systolique": pression_systolique,
        "pression_diastolique": pression_diastolique,
        "oxygene_sanguin_pct": round(random.uniform(88, 99), 1),  # Slightly different range
        "niveau_stress": random.randint(0, 8),  # Adjusted stress range
        "activite": random.choice(list_activites),
        "pathologie": random.choice(list_pathologies),
        "medication": random.choice(list_medocs),
        "fumeur": random.choice(["Oui", "Non", "Occasionnel"]),  # Added occasional smoker
        "niveau_activite_physique": random.choice(list_niveau_activite),
        "region": random.choice(list_regions),  # New field for region
        "temperature_corporelle_c": round(random.uniform(36.0, 38.5), 1),  # New field for body temperature
        "timestamp": fake.date_time_between(start_date='-18m', end_date='now')  # Different time range
    })

df = pd.DataFrame(data)

df

Unnamed: 0,patient_id,nom_patient,age,sexe,poids_kg,taille_cm,imc,frequence_cardiaque_bpm,pression_systolique,pression_diastolique,oxygene_sanguin_pct,niveau_stress,activite,pathologie,medication,fumeur,niveau_activite_physique,region,temperature_corporelle_c,timestamp
0,PAT0001,Seynabou Toure,85,Femme,78.8,199.3,19.8,84,136,74,95.9,5,Repos,Hypertension artérielle,Hydroxyurée,Non,Actif,Kaolack,37.6,2025-10-21 21:20:18
1,PAT0002,Awa Toure,25,Femme,93.7,157.8,37.6,77,125,66,93.7,2,Marche rapide,Malaria,Aucun,Non,Actif,Kaolack,37.5,2025-10-21 21:19:36
2,PAT0003,Diarra Thiam,77,Femme,77.9,170.2,26.9,63,138,94,94.9,6,Travail physique,Aucune,Amlodipine,Non,Sédentaire,Dakar,36.8,2025-10-21 21:22:00
3,PAT0004,Bouna Camara,41,Homme,64.4,198.0,16.4,64,110,62,92.4,1,Marche rapide,Aucune,Artemether,Occasionnel,Sédentaire,Kaolack,37.6,2025-10-21 21:17:47
4,PAT0005,Bouna Lo,37,Homme,94.2,189.0,26.4,87,139,65,89.1,6,Repos,Hypertension artérielle,Hydroxyurée,Oui,Actif,Kaolack,36.2,2025-10-21 21:21:35
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1195,PAT1196,Malick Lo,59,Homme,67.8,204.2,16.3,55,139,81,88.7,6,Marche rapide,Malaria,Salbutamol,Non,Actif,Thiès,36.2,2025-10-21 21:34:33
1196,PAT1197,Abdoulaye Thiam,84,Homme,50.4,188.9,14.1,79,151,90,94.5,6,Travail physique,Insuffisance rénale,Artemether,Oui,Sédentaire,Saint-Louis,37.4,2025-10-21 21:27:18
1197,PAT1198,Modou Sall,37,Homme,46.1,188.5,13.0,67,130,72,89.6,1,Exercice léger,Malaria,Aucun,Occasionnel,Sédentaire,Thiès,38.2,2025-10-21 21:26:01
1198,PAT1199,Omar Lo,65,Homme,98.5,191.5,26.9,78,156,84,97.7,2,Travail physique,Aucune,Salbutamol,Occasionnel,Actif,Dakar,36.4,2025-10-21 21:18:19


### Export en CSV

In [7]:
df.to_csv("sa_datagen_data_analysis_poc_v2.csv", index=False)