# Optimisation des datasets
Dans un souci d'optimisation de l'espace mémoire occupé, nous allons faire en sorte d'optimiser les différents dataframe que nous utilisons. 
Pour mener à bien ceci, nous allons des fonctions contenues dans `describe_csv.py`, en les améliorant pour automatiser le processus.

In [24]:
# Import des librairies
import pandas as pd
import numpy as np
import describe_csv

In [47]:
# Test dataset 
df_test = pd.read_csv('./data/athlete_events.csv')

In [26]:
df_test.head()
# Poids du dataset 
df_test_init_size = df_test.memory_usage(deep=True).sum() / 1024**2 # in Mo

In [45]:
# Test 
import pandas as pd

def optimize_data_types(df):
    """
    Optimize data types of columns in a pandas DataFrame.

    Parameters:
        df (pandas.DataFrame): Input DataFrame.

    Returns:
        pandas.DataFrame: DataFrame with optimized data types.
    """
    optimized_df = df.copy()
    
    for col in optimized_df.columns:
        col_dtype = optimized_df[col].dtype
        
        if col_dtype == 'object':
            # Convert object type columns to category if unique values are less than 50% of total rows
            if len(optimized_df[col].unique()) / len(optimized_df[col]) < 0.5:
                optimized_df[col] = optimized_df[col].astype('category')
        elif col_dtype == 'int64':
            # Convert int64 to smaller integer types if possible
            optimized_df[col] = pd.to_numeric(optimized_df[col], downcast='integer', errors='coerce')
        elif col_dtype == 'float64':
            # Convert float64 to smaller floating point types if possible
            optimized_df[col] = pd.to_numeric(optimized_df[col], downcast='float', errors='coerce')
        elif col_dtype == 'bool':
            # Convert boolean columns to integer type
            optimized_df[col] = optimized_df[col].astype('int8')

    return optimized_df


In [49]:
optimize_data_types(df_test)


Unnamed: 0,Name,Sex,Age,Height,Weight,Team,NOC,Games,Year,Season,City,Sport,Event,Medal
0,A Dijiang,M,24.0,180.00,80.0,China,CHN,1992 Summer,1992,Summer,Barcelona,Basketball,Basketball Men's Basketball,
1,A Lamusi,M,23.0,170.00,60.0,China,CHN,2012 Summer,2012,Summer,London,Judo,Judo Men's Extra-Lightweight,
2,Gunnar Nielsen Aaby,M,24.0,,,Denmark,DEN,1920 Summer,1920,Summer,Antwerpen,Football,Football Men's Football,
3,Edgar Lindenau Aabye,M,34.0,,,Denmark/Sweden,DEN,1900 Summer,1900,Summer,Paris,Tug-Of-War,Tug-Of-War Men's Tug-Of-War,Gold
4,Christine Jacoba Aaftink,F,21.0,185.00,82.0,Netherlands,NED,1988 Winter,1988,Winter,Calgary,Speed Skating,Speed Skating Women's 500 metres,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
282985,ZOU Jingyuan,M,23.0,1.58,,People's Republic of China,CHN,Tokyo 2020,2020,Summer,Tokyo,Artistic Gymnastics,Men's Parallel Bars,Gold
282986,ZUBIMENDI Martin,M,22.0,1.80,,Spain,ESP,Tokyo 2020,2020,Summer,Tokyo,Football,Men,Silver
282987,ZUEV Alexander,M,25.0,1.93,,ROC,ROC,Tokyo 2020,2020,Summer,Tokyo,3x3 Basketball,Men,Silver
282988,ZVEREV Alexander,M,24.0,1.98,,Germany,GER,Tokyo 2020,2020,Summer,Tokyo,Tennis,Men's Singles,Gold


In [50]:
df_test.memory_usage(deep=True).sum() / 1024**2 # in Mo

182.76852130889893

In [41]:
df_test['Age'] = pd.to_numeric(df_test['Age'], errors='coerce', downcast='integer')

# Convert 'Height' and 'Weight' columns to int16
df_test['Height'] = pd.to_numeric(df_test['Height'], errors='coerce', downcast='integer')
df_test['Weight'] = pd.to_numeric(df_test['Weight'], errors='coerce', downcast='integer')


In [43]:
df_test

Name       object
Sex        object
Age       float64
Height    float16
Weight    float16
Team       object
NOC        object
Games      object
Year        int16
Season     object
City       object
Sport      object
Event      object
Medal      object
dtype: object