# preprocesseur avec data de mean

In [299]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import FunctionTransformer
import matplotlib.pyplot as plt

In [300]:
data = pd.read_csv("partie_yacine\\train.csv")

In [302]:
data.head()

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.5
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.5
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.0
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74


In [303]:
data.shape

(6019, 13)

# fonction convertir la colonne prix en euro

In [304]:

def prix_euro(df):
    df["prix_euro"] = df["Price"] * 100000 * 0.011  # Conversion
    return df.drop(columns=["Price"], inplace=True)

prix_euro_transformer = FunctionTransformer(prix_euro, validate=False)

#  colonne supprimer les lignes >= 66000

In [283]:
def supprimer_lignes_cheres(df):
    # Supprimer les lignes où 'prix_euro' > 66 000
    df.drop(df[df["prix_euro"] >= 66000].index, axis=0, inplace=True)
    return df  # Garde uniquement les lignes <= 66 000€

# Transformer pour la pipeline
supprimer_lignes_cheres_transformer = FunctionTransformer(supprimer_lignes_cheres, validate=False)

In [231]:
supprimer_lignes_cheres(data)
data.shape

(5973, 13)

### fonction separation de la colonne "Name"

In [282]:
def split_name(nom_voiture):
    # Diviser le nom en mots
    mots = nom_voiture.split()
   
    # Si le nom contient "Land Rover", on prend les deux premiers mots comme Brand et le reste comme Model
    if 'Land' in mots and 'Rover' in mots:
        brand = ' '.join(mots[:2])
        if 'Range' in mots and 'Rover' in mots:
            model = ' '.join(mots[2:4])
        else:
            model = ' '.join(mots[2:3])
    else:
        brand = mots[0]
        model = mots[1] if len(mots) > 1 else ''
    return pd.Series([brand, model])

def ajouter_colonnes_marque_modele(df):
    # Appliquer la fonction split_name pour extraire 'Brand' et 'Model' à partir de 'Name'
    df[['Brand', 'Model']] = df['Name'].apply(split_name)
    return df


split_colonne = FunctionTransformer(split_name, validate=False) ## transformer pour mettre dans la pipeline

In [233]:
ajouter_colonnes_marque_modele(data)

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,prix_euro,Brand,Model
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1925.0,Maruti,Wagon
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,13750.0,Hyundai,Creta
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4950.0,Honda,Jazz
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6600.0,Maruti,Ertiga
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,19514.0,Audi,A4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6014,Maruti Swift VDI,Delhi,2014,27365,Diesel,Manual,First,28.4 kmpl,1248 CC,74 bhp,5.0,7.88 Lakh,5225.0,Maruti,Swift
6015,Hyundai Xcent 1.1 CRDi S,Jaipur,2015,100000,Diesel,Manual,First,24.4 kmpl,1120 CC,71 bhp,5.0,,4400.0,Hyundai,Xcent
6016,Mahindra Xylo D4 BSIV,Jaipur,2012,55000,Diesel,Manual,Second,14.0 kmpl,2498 CC,112 bhp,8.0,,3190.0,Mahindra,Xylo
6017,Maruti Wagon R VXI,Kolkata,2013,46000,Petrol,Manual,First,18.9 kmpl,998 CC,67.1 bhp,5.0,,2915.0,Maruti,Wagon


# fonction pour mettre en miniscul

In [None]:
def lowercase_categorical_columns(df):
    for col in df.select_dtypes(include=['object', 'category']).columns:
        df[col] = df[col].str.lower()
    return df
mettre_lowers = FunctionTransformer(lowercase_categorical_columns, validate=False) ## transformer pour mettre dans la pipeline

In [235]:
lowercase_categorical_columns(data)

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,prix_euro,Brand,Model
0,maruti wagon r lxi cng,mumbai,2010,72000,cng,manual,first,26.6 km/kg,998 cc,58.16 bhp,5.0,,1925.0,maruti,wagon
1,hyundai creta 1.6 crdi sx option,pune,2015,41000,diesel,manual,first,19.67 kmpl,1582 cc,126.2 bhp,5.0,,13750.0,hyundai,creta
2,honda jazz v,chennai,2011,46000,petrol,manual,first,18.2 kmpl,1199 cc,88.7 bhp,5.0,8.61 lakh,4950.0,honda,jazz
3,maruti ertiga vdi,chennai,2012,87000,diesel,manual,first,20.77 kmpl,1248 cc,88.76 bhp,7.0,,6600.0,maruti,ertiga
4,audi a4 new 2.0 tdi multitronic,coimbatore,2013,40670,diesel,automatic,second,15.2 kmpl,1968 cc,140.8 bhp,5.0,,19514.0,audi,a4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6014,maruti swift vdi,delhi,2014,27365,diesel,manual,first,28.4 kmpl,1248 cc,74 bhp,5.0,7.88 lakh,5225.0,maruti,swift
6015,hyundai xcent 1.1 crdi s,jaipur,2015,100000,diesel,manual,first,24.4 kmpl,1120 cc,71 bhp,5.0,,4400.0,hyundai,xcent
6016,mahindra xylo d4 bsiv,jaipur,2012,55000,diesel,manual,second,14.0 kmpl,2498 cc,112 bhp,8.0,,3190.0,mahindra,xylo
6017,maruti wagon r vxi,kolkata,2013,46000,petrol,manual,first,18.9 kmpl,998 cc,67.1 bhp,5.0,,2915.0,maruti,wagon


# fonction pour duplicated

In [280]:
def drop_duplicated(df):
    if df.duplicated().sum() > 0:
        df = df.drop_duplicates(inplace=True)
        return df
    return df

supprimer_doublons = FunctionTransformer(drop_duplicated, validate=False) ## transformer pour mettre dans la pipeline

In [237]:
drop_duplicated(data)

# fonction convertir en numerique et enlever les unité

In [279]:
def convert_to_float(df):
    colonnes = ['Mileage', 'Engine', 'Power']
    for col in colonnes:
        if col in df.columns:
            if df[col].dtype == object:
                # Si la colonne contient une virgule, on applique la transformation
                df[col] = df[col].str.extract(r'([0-9.]+)').astype(float)

            df[col] = pd.to_numeric(df[col], errors='coerce')
    return df  

convertir_numerique = FunctionTransformer(convert_to_float, validate=False) ## transformer pour mettre dans la pipeline

In [239]:
convert_to_float(data)

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,prix_euro,Brand,Model
0,maruti wagon r lxi cng,mumbai,2010,72000,cng,manual,first,26.60,998.0,58.16,5.0,,1925.0,maruti,wagon
1,hyundai creta 1.6 crdi sx option,pune,2015,41000,diesel,manual,first,19.67,1582.0,126.20,5.0,,13750.0,hyundai,creta
2,honda jazz v,chennai,2011,46000,petrol,manual,first,18.20,1199.0,88.70,5.0,8.61 lakh,4950.0,honda,jazz
3,maruti ertiga vdi,chennai,2012,87000,diesel,manual,first,20.77,1248.0,88.76,7.0,,6600.0,maruti,ertiga
4,audi a4 new 2.0 tdi multitronic,coimbatore,2013,40670,diesel,automatic,second,15.20,1968.0,140.80,5.0,,19514.0,audi,a4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6014,maruti swift vdi,delhi,2014,27365,diesel,manual,first,28.40,1248.0,74.00,5.0,7.88 lakh,5225.0,maruti,swift
6015,hyundai xcent 1.1 crdi s,jaipur,2015,100000,diesel,manual,first,24.40,1120.0,71.00,5.0,,4400.0,hyundai,xcent
6016,mahindra xylo d4 bsiv,jaipur,2012,55000,diesel,manual,second,14.00,2498.0,112.00,8.0,,3190.0,mahindra,xylo
6017,maruti wagon r vxi,kolkata,2013,46000,petrol,manual,first,18.90,998.0,67.10,5.0,,2915.0,maruti,wagon


# fonction pour trouver les voiture similaires 

In [278]:
def trouver_voitures_similaires(df):
    # Ajouter une colonne 'Similar_Cars' vide
    df['Similar_Cars'] = None
    
    for index, row in df.iterrows():
        marque_voiture = row['Brand']
        modele_voiture = row['Model']
        fuel_type = row['Fuel_Type']
        transmission = row['Transmission']
        
        # Rechercher des voitures ayant la même marque, modèle, carburant et transmission
        similar_cars = df[
            (df['Brand'] == marque_voiture) & 
            (df['Model'] == modele_voiture) & 
            (df['Fuel_Type'] == fuel_type) & 
            (df['Transmission'] == transmission)
        ]
        
        # Si au moins 2 véhicules (lui-même + un autre) sont trouvés, on ajoute les résultats
        if len(similar_cars) >= 2:
            df.at[index, 'Similar_Cars'] = ', '.join(similar_cars['Name'].values)
    
    return df

voitures_similaires = FunctionTransformer(trouver_voitures_similaires, validate=False) ## transformer pour mettre dans la pipeline 


In [241]:
trouver_voitures_similaires(data)

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,prix_euro,Brand,Model,Similar_Cars
0,maruti wagon r lxi cng,mumbai,2010,72000,cng,manual,first,26.60,998.0,58.16,5.0,,1925.0,maruti,wagon,"maruti wagon r lxi cng, maruti wagon r lxi cng..."
1,hyundai creta 1.6 crdi sx option,pune,2015,41000,diesel,manual,first,19.67,1582.0,126.20,5.0,,13750.0,hyundai,creta,"hyundai creta 1.6 crdi sx option, hyundai cret..."
2,honda jazz v,chennai,2011,46000,petrol,manual,first,18.20,1199.0,88.70,5.0,8.61 lakh,4950.0,honda,jazz,"honda jazz v, honda jazz 1.2 vx i vtec, honda ..."
3,maruti ertiga vdi,chennai,2012,87000,diesel,manual,first,20.77,1248.0,88.76,7.0,,6600.0,maruti,ertiga,"maruti ertiga vdi, maruti ertiga shvs vdi, mar..."
4,audi a4 new 2.0 tdi multitronic,coimbatore,2013,40670,diesel,automatic,second,15.20,1968.0,140.80,5.0,,19514.0,audi,a4,"audi a4 new 2.0 tdi multitronic, audi a4 2.0 t..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6014,maruti swift vdi,delhi,2014,27365,diesel,manual,first,28.40,1248.0,74.00,5.0,7.88 lakh,5225.0,maruti,swift,"maruti swift vdi bsiv, maruti swift ddis vdi, ..."
6015,hyundai xcent 1.1 crdi s,jaipur,2015,100000,diesel,manual,first,24.40,1120.0,71.00,5.0,,4400.0,hyundai,xcent,"hyundai xcent 1.1 crdi s, hyundai xcent 1.1 cr..."
6016,mahindra xylo d4 bsiv,jaipur,2012,55000,diesel,manual,second,14.00,2498.0,112.00,8.0,,3190.0,mahindra,xylo,"mahindra xylo e8, mahindra xylo h4, mahindra x..."
6017,maruti wagon r vxi,kolkata,2013,46000,petrol,manual,first,18.90,998.0,67.10,5.0,,2915.0,maruti,wagon,"maruti wagon r lxi bsiii, maruti wagon r vxi b..."


# fonction remplacer les valeur NaN 

In [277]:
def remplir_valeurs_manquantes(df):
    for index, row in df.iterrows():
        # Liste des colonnes à compléter
        list_colonnes = ['Mileage', 'Engine', 'Power', 'Seats']
        for col in list_colonnes:
            # Vérifier si la valeur dans la cellule est manquante (NaN)
            if pd.isnull(row[col]):  
                # Rechercher des véhicules similaires avec le même carburant et transmission
                similar_cars = df[df['Similar_Cars'].notnull()]  # Utiliser la colonne 'Similar_Cars' déjà remplie
                if not similar_cars.empty:
                    # Récupérer la valeur la plus fréquente (mode) pour la colonne en question
                    mode_value = similar_cars[col].mode()
                    if not mode_value.empty:  # Si un mode existe, utiliser la première valeur trouvée
                        df.at[index, col] = mode_value.iloc[0]  
                    else:  # Si aucun mode n'est trouvé, utiliser la médiane de la colonne entière
                        df.at[index, col] = df[col].median()  
                else:
                    # Si aucune voiture similaire n'est trouvée, remplir avec la médiane de la colonne
                    df.at[index, col] = df[col].median()  
    return df.drop("Similar_Cars", axis=1, inplace=True)

valeurs_manquantes = FunctionTransformer(remplir_valeurs_manquantes, validate=False) ## transformer pour mettre dans la pipeline 



In [243]:
remplir_valeurs_manquantes(data)

# fonction pour eliminer les outliers 

In [276]:
def reduire_outliers(df):
    # Liste des colonnes à traiter
    list_colonnes = ['Mileage', 'Engine', 'Power', 'Seats', 'Kilometers_Driven']
    
    # Calcul des bornes IQR
    Q1 = df[list_colonnes].quantile(0.25)
    Q3 = df[list_colonnes].quantile(0.75)
    IQR = Q3 - Q1
    borne_inf = Q1 - 1.5 * IQR
    borne_sup = Q3 + 1.5 * IQR
    
    # Appliquer les bornes aux colonnes, réduisant les outliers
    df[list_colonnes] = df[list_colonnes].apply(
        lambda x: x.clip(lower=borne_inf[x.name], upper=borne_sup[x.name])  # Utilisation de apply pour chaque colonne
    )

    return df

outliers = FunctionTransformer(reduire_outliers, validate=False) ## transformer pour mettre dans la pipeline 

In [246]:
reduire_outliers(data)

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,prix_euro,Brand,Model
0,maruti wagon r lxi cng,mumbai,2010,72000.0,cng,manual,first,26.60,998.0,58.16,5.0,,1925.0,maruti,wagon
1,hyundai creta 1.6 crdi sx option,pune,2015,41000.0,diesel,manual,first,19.67,1582.0,126.20,5.0,,13750.0,hyundai,creta
2,honda jazz v,chennai,2011,46000.0,petrol,manual,first,18.20,1199.0,88.70,5.0,8.61 lakh,4950.0,honda,jazz
3,maruti ertiga vdi,chennai,2012,87000.0,diesel,manual,first,20.77,1248.0,88.76,5.0,,6600.0,maruti,ertiga
4,audi a4 new 2.0 tdi multitronic,coimbatore,2013,40670.0,diesel,automatic,second,15.20,1968.0,140.80,5.0,,19514.0,audi,a4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6014,maruti swift vdi,delhi,2014,27365.0,diesel,manual,first,28.40,1248.0,74.00,5.0,7.88 lakh,5225.0,maruti,swift
6015,hyundai xcent 1.1 crdi s,jaipur,2015,100000.0,diesel,manual,first,24.40,1120.0,71.00,5.0,,4400.0,hyundai,xcent
6016,mahindra xylo d4 bsiv,jaipur,2012,55000.0,diesel,manual,second,14.00,2498.0,112.00,5.0,,3190.0,mahindra,xylo
6017,maruti wagon r vxi,kolkata,2013,46000.0,petrol,manual,first,18.90,998.0,67.10,5.0,,2915.0,maruti,wagon


# fonction pour effacer colonne 

In [275]:
def drop_colonne(df):
    colonne_drop = ["New_Price", "Name"]
    colonne_drop = [col for col in colonne_drop if col in df.columns]
    df.drop(columns= colonne_drop, axis=1, inplace=True)
    return df
suprimer_colonne = FunctionTransformer(drop_colonne, validate=False) ## transformer pour mettre dans la pipeline 

In [248]:
drop_colonne(data)

Unnamed: 0,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,prix_euro,Brand,Model
0,mumbai,2010,72000.0,cng,manual,first,26.60,998.0,58.16,5.0,1925.0,maruti,wagon
1,pune,2015,41000.0,diesel,manual,first,19.67,1582.0,126.20,5.0,13750.0,hyundai,creta
2,chennai,2011,46000.0,petrol,manual,first,18.20,1199.0,88.70,5.0,4950.0,honda,jazz
3,chennai,2012,87000.0,diesel,manual,first,20.77,1248.0,88.76,5.0,6600.0,maruti,ertiga
4,coimbatore,2013,40670.0,diesel,automatic,second,15.20,1968.0,140.80,5.0,19514.0,audi,a4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6014,delhi,2014,27365.0,diesel,manual,first,28.40,1248.0,74.00,5.0,5225.0,maruti,swift
6015,jaipur,2015,100000.0,diesel,manual,first,24.40,1120.0,71.00,5.0,4400.0,hyundai,xcent
6016,jaipur,2012,55000.0,diesel,manual,second,14.00,2498.0,112.00,5.0,3190.0,mahindra,xylo
6017,kolkata,2013,46000.0,petrol,manual,first,18.90,998.0,67.10,5.0,2915.0,maruti,wagon


# Preprocesseur

### fonction encodage & normaliser 

In [274]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


def transformer(df):
    
    # label = LabelEncoder()
    # df["Brand"] = label.fit_transform(df["Brand"]) 
    # df["Model"] = label.fit_transform(df["Model"]) 


      
    numerical_cols = df.select_dtypes(include=np.number).columns
    categorical_cols_onehot = ["Fuel_Type","Transmission","Owner_Type", "Brand", "Model"]


    processeur = ColumnTransformer([
        ("numerical", StandardScaler(), numerical_cols),
        ("categorical", OneHotEncoder(sparse_output=False), categorical_cols_onehot)
    ])

    df_transformer = processeur.fit_transform(df)

    return df_transformer

transformers = FunctionTransformer(transformer, validate=False) ## transformer pour mettre dans la pipeline

array([[-1.02174566,  0.52404045,  1.94384546, ...,  0.        ,
         0.        ,  0.        ],
       [ 0.50846426, -0.5097105 ,  0.33248955, ...,  0.        ,
         0.        ,  0.        ],
       [-0.71570367, -0.34297648, -0.00931321, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [-0.40966169, -0.04285523, -0.98589255, ...,  0.        ,
         0.        ,  0.        ],
       [-0.10361971, -0.34297648,  0.15345001, ...,  0.        ,
         0.        ,  0.        ],
       [-0.71570367, -0.30962967,  1.67412354, ...,  0.        ,
         0.        ,  0.        ]])