In [6]:
pip install pandas

Collecting pandas
  Obtaining dependency information for pandas from https://files.pythonhosted.org/packages/a0/68/265225df9e90ade0c332db4148e9aff8c9bcb4e8dd6c681ec4f512770765/pandas-2.1.3-cp312-cp312-macosx_11_0_arm64.whl.metadata
  Downloading pandas-2.1.3-cp312-cp312-macosx_11_0_arm64.whl.metadata (18 kB)
Collecting numpy<2,>=1.26.0 (from pandas)
  Obtaining dependency information for numpy<2,>=1.26.0 from https://files.pythonhosted.org/packages/2a/17/1fdc154e75d24d8c20c42b71bae1b5cf752453f0fc3a2504bbb810293dd1/numpy-1.26.2-cp312-cp312-macosx_11_0_arm64.whl.metadata
  Downloading numpy-1.26.2-cp312-cp312-macosx_11_0_arm64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.2/61.2 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting pytz>=2020.1 (from pandas)
  Obtaining dependency information for pytz>=2020.1 from https://files.pythonhosted.org/packages/32/4d/aaf7eff5deb402fd9a24a1449a8119f00d74ae9c2efa79f8ef9994261fc2/pytz-2023.3.post1-py2.py

In [4]:
import pandas as pd

In [21]:
# Load the data
df_age = pd.read_csv('data/code-tranches-dage-donnees-urgences.csv', delimiter=';')

# Display the original dataframe
print("Original Data:")
print(df_age)


Original Data:
   Code tranches d'age         Unnamed: 1
0                    0        "Tous âges"
1                    1          "0-4 ans"
2                    2         "5-14 ans"
3                    3        "15-44 ans"
4                    4        "45-64 ans"
5                    5        "65-74 ans"
6                    6   "75 ans ou plus"


In [14]:
df_age = pd.read_csv('data/code-tranches-dage-donnees-urgences.csv', delimiter=';', header=None)


# Rename the columns
df_age.columns = ['Code tranches d\'age', 'tranche d\'age']

# Remove unnecessary characters from the 'tranche d\'age' column
df_age['tranche d\'age'] = df_age['tranche d\'age'].str.replace('"', '')
df_age['tranche d\'age'] = df_age['tranche d\'age'].str.replace(' ans', '')

# Split the age ranges and create 'age min' and 'age max' columns
df_age[['age min', 'age max']] = df_age['tranche d\'age'].str.split('-', expand=True)

# Convert the new columns to numeric values
df_age['age min'] = pd.to_numeric(df_age['age min'], errors='coerce')
df_age['age max'] = pd.to_numeric(df_age['age max'].str.replace(' ou plus', ''), errors='coerce')

# Set specific values for 'Tous âges' and '75 ou plus'
df_age.loc[df_age['tranche d\'age'].fillna('').str.contains('Tous âges'), ['age min', 'age max']] = [0, 120]
df_age.loc[df_age['tranche d\'age'].fillna('').str.contains('75 ou plus'), ['age min', 'age max']] = [75, 120]

# Remove rows where 'Code tranches d'age' is 'Code tranches d'age'
cleaned_df_age = df_age[df_age['Code tranches d\'age'] != 'Code tranches d\'age'].copy()
# Assuming cleaned_df_age is your DataFrame containing the data
cleaned_df_age.drop(columns=['tranche d\'age'], inplace=True)

# Display the updated dataframe
print("\nCleaned Data:")
print(cleaned_df_age)

# Save the cleaned DataFrame to a new CSV file
cleaned_df_age.to_csv('data/cleaned_age_data.csv', index=False)



Cleaned Data:
  Code tranches d'age  age min  age max
1                   0      0.0    120.0
2                   1      0.0      4.0
3                   2      5.0     14.0
4                   3     15.0     44.0
5                   4     45.0     64.0
6                   5     65.0     74.0
7                   6     75.0    120.0


In [16]:
import json

# Read the JSON file
with open('data/departements-region.json', 'r', encoding='utf-8') as file:
    data = json.load(file)

# Generate a mapping of alphanumeric values to integers
alpha_numeric_mapping = {entry['num_dep']: idx for idx, entry in enumerate(data, start=1)}

# Modify the 'num_dep' values in the JSON data using the mapping
for entry in data:
    entry['num_dep'] = alpha_numeric_mapping[entry['num_dep']]

# Convert to DataFrame
df = pd.DataFrame(data)

# Display the cleaned DataFrame with integer values in the 'num_dep' column
df_cleaned_dept = df.rename(columns={'num_dep': 'num_dep_int'})  # Renaming for clarity
print(df_cleaned_dept.head(10))
print(df_cleaned_dept.tail(10))

# Save the cleaned DataFrame to a new CSV file
df_cleaned_dept.to_csv('data/cleaned_dept_data.csv', index=False)


   num_dep_int                 dep_name                 region_name
0            1                      Ain        Auvergne-Rhône-Alpes
1            2                    Aisne             Hauts-de-France
2            3                   Allier        Auvergne-Rhône-Alpes
3            4  Alpes-de-Haute-Provence  Provence-Alpes-Côte d'Azur
4            5             Hautes-Alpes  Provence-Alpes-Côte d'Azur
5            6          Alpes-Maritimes  Provence-Alpes-Côte d'Azur
6            7                  Ardèche        Auvergne-Rhône-Alpes
7            8                 Ardennes                   Grand Est
8            9                   Ariège                   Occitanie
9           10                     Aube                   Grand Est
     num_dep_int           dep_name    region_name
91            92            Essonne  Île-de-France
92            93     Hauts-de-Seine  Île-de-France
93            94  Seine-Saint-Denis  Île-de-France
94            95       Val-de-Marne  Île-de-Fran

In [28]:
# Load CSV into a DataFrame
df_urgences = pd.read_csv('data/donnees-urgences-SOS-medecins.csv', delimiter=';', low_memory=False)

# Remove non-numeric characters from the 'dep' column
df_urgences['dep'] = df_urgences['dep'].str.replace(r'\D', '', regex=True)

# Convert the 'dep' column to integers
df_urgences['dep'] = pd.to_numeric(df_urgences['dep'], errors='coerce')
print(df_urgences.head(15))

    dep date_de_passage  sursaud_cl_age_corona  nbre_pass_corona  \
0     1      2022-12-26                      0               9.0   
1     1      2022-12-26                      1               0.0   
2     1      2022-12-26                      2               0.0   
3     1      2022-12-26                      3               1.0   
4     1      2022-12-26                      4               1.0   
5     1      2022-12-26                      5               1.0   
6     1      2022-12-26                      6               6.0   
7     1      2022-12-27                      0               6.0   
8     1      2022-12-27                      1               0.0   
9     1      2022-12-27                      2               0.0   
10    1      2022-12-27                      3               0.0   
11    1      2022-12-27                      4               0.0   
12    1      2022-12-27                      5               4.0   
13    1      2022-12-27                      6  

In [19]:
# List of columns to drop
columns_to_drop = [
    'nbre_acte_corona', 'nbre_acte_tot', 'nbre_acte_corona_h', 'nbre_acte_corona_f',
    'nbre_acte_tot_h', 'nbre_acte_tot_f'
]

# Drop the unnecessary columns
df_cleaned_urgences = df_urgences.drop(columns=columns_to_drop, errors='ignore')

print(df_cleaned_urgences.head())

  dep date_de_passage  sursaud_cl_age_corona  nbre_pass_corona  nbre_pass_tot  \
0  01      2022-12-26                      0               9.0          435.0   
1  01      2022-12-26                      1               0.0           58.0   
2  01      2022-12-26                      2               0.0           30.0   
3  01      2022-12-26                      3               1.0          138.0   
4  01      2022-12-26                      4               1.0           88.0   

   nbre_hospit_corona  nbre_pass_corona_h  nbre_pass_corona_f  \
0                 4.0                 6.0                 3.0   
1                 0.0                 NaN                 NaN   
2                 0.0                 NaN                 NaN   
3                 0.0                 NaN                 NaN   
4                 0.0                 NaN                 NaN   

   nbre_pass_tot_h  nbre_pass_tot_f  nbre_hospit_corona_h  \
0            219.0            216.0                   1.0   
