In [1]:
import os
import s3fs
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline

In [2]:
S3_ENDPOINT_URL = "https://" + os.environ['AWS_S3_ENDPOINT']

fs = s3fs.S3FileSystem(client_kwargs = {'endpoint_url' : S3_ENDPOINT_URL})

BUCKET = "malcouffe1/Module_1"
FILE_KEY_S3 = "/readmission_avc.parquet"
FILE_PATH_S3 = BUCKET + FILE_KEY_S3

In [3]:
with fs.open(FILE_PATH_S3, 'rb') as file_in:
    df = pd.read_parquet(file_in)

In [4]:
df

Unnamed: 0,modeEntree,modeSortie,duree,ghm2,dp,sexe,age,nbActe,nbRum,nbda,id,id_D
0,8,9,0,01M37E,I671,2.0,76.0,4,1,,l19,
1,8,8,3,01C061,I652,2.0,77.0,4,1,1.0,s7e,
2,8,7,13,01M303,I634,,,4,1,7.0,23f,
3,8,8,11,01M301,I639,1.0,83.0,4,2,2.0,8oi,
4,8,6,8,01M303,I635,1.0,71.0,4,1,9.0,otz,ld
...,...,...,...,...,...,...,...,...,...,...,...,...
1695,8,7,1,01M30T,I614,1.0,88.0,4,1,4.0,kjg,
1696,8,6,10,01M303,I635,1.0,81.0,10,3,7.0,gie,my
1697,8,8,8,01M301,I639,1.0,68.0,5,3,6.0,6bl,
1698,8,8,11,01M301,I676,2.0,28.0,16,5,7.0,7m8,


## 1. Data pre-processing

In [5]:
df.isna().sum()

modeEntree      0
modeSortie      0
duree           0
ghm2            0
dp              0
sexe           20
age            20
nbActe          0
nbRum           0
nbda          134
id              0
id_D          200
dtype: int64

On peut supprimer les lignes ou id_D est nul car on ne peut pas préduire dans ce cas la (id_D est la target).

In [6]:
df = df.dropna(subset="id_D", axis=0)

In [7]:
df.loc[:, 'rea'] = df.loc[:, 'id_D'].apply(lambda x : 1 if x != '' else 0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:, 'rea'] = df.loc[:, 'id_D'].apply(lambda x : 1 if x != '' else 0)


In [8]:
df

Unnamed: 0,modeEntree,modeSortie,duree,ghm2,dp,sexe,age,nbActe,nbRum,nbda,id,id_D,rea
0,8,9,0,01M37E,I671,2.0,76.0,4,1,,l19,,0
1,8,8,3,01C061,I652,2.0,77.0,4,1,1.0,s7e,,0
2,8,7,13,01M303,I634,,,4,1,7.0,23f,,0
4,8,6,8,01M303,I635,1.0,71.0,4,1,9.0,otz,ld,1
6,8,7,8,01M302,I639,1.0,92.0,7,2,4.0,np7,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1695,8,7,1,01M30T,I614,1.0,88.0,4,1,4.0,kjg,,0
1696,8,6,10,01M303,I635,1.0,81.0,10,3,7.0,gie,my,1
1697,8,8,8,01M301,I639,1.0,68.0,5,3,6.0,6bl,,0
1698,8,8,11,01M301,I676,2.0,28.0,16,5,7.0,7m8,,0


In [9]:
# On convertit les colonnes str dans le bon format :

str_col = ['modeEntree', 'modeSortie', 'sexe']

df.loc[:, str_col] = df.loc[:, str_col].map(str)

df = df.replace('nan', np.nan)

  df.loc[:, str_col] = df.loc[:, str_col].map(str)
  df.loc[:, str_col] = df.loc[:, str_col].map(str)
  df.loc[:, str_col] = df.loc[:, str_col].map(str)


In [14]:
df.isna().sum()

modeEntree      0
modeSortie      0
duree           0
ghm2            0
dp              0
sexe           19
age            19
nbActe          0
nbRum           0
nbda          121
id              0
id_D            0
rea             0
dtype: int64

On voit que la colonne nbda pose problème. IL s'agit du nombre de comorbidité et on ne sait pas si les valeurs nulles sont 0 ou inconnues. Dans ce cas après consultation avec le métier, les NaN sont des 0 donc on peut les remplacer par des 0.

In [17]:
df['nbda'] = df['nbda'].fillna(0)

In [18]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1500 entries, 0 to 1699
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   modeEntree  1500 non-null   object 
 1   modeSortie  1500 non-null   object 
 2   duree       1500 non-null   int32  
 3   ghm2        1500 non-null   object 
 4   dp          1500 non-null   object 
 5   sexe        1481 non-null   object 
 6   age         1481 non-null   float64
 7   nbActe      1500 non-null   int32  
 8   nbRum       1500 non-null   int32  
 9   nbda        1500 non-null   float64
 10  id          1500 non-null   object 
 11  id_D        1500 non-null   object 
 12  rea         1500 non-null   int64  
dtypes: float64(2), int32(3), int64(1), object(7)
memory usage: 146.5+ KB
