In [115]:
import pandas as pd 
from sklearn.preprocessing import FunctionTransformer

In [116]:
# Load the raw training set. The path is relative to the notebook's working
# directory; a forward slash is used because it is accepted on Windows, Linux
# and macOS alike, whereas a backslash separator only resolves on Windows.
# NOTE(review): the file carries an "Unnamed: 0" column that looks like a saved
# row index — consider read_csv(..., index_col=0) if downstream code allows it.
data = pd.read_csv("partie_yacine/train.csv")

In [117]:
# Display the raw frame as the cell output (pandas truncates the rendering to
# the first and last rows).
data

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Price
0,Maruti Wagon R LXI CNG,Mumbai,2010,72000,CNG,Manual,First,26.6 km/kg,998 CC,58.16 bhp,5.0,,1.75
1,Hyundai Creta 1.6 CRDi SX Option,Pune,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,,12.50
2,Honda Jazz V,Chennai,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh,4.50
3,Maruti Ertiga VDI,Chennai,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,,6.00
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,,17.74
...,...,...,...,...,...,...,...,...,...,...,...,...,...
6014,Maruti Swift VDI,Delhi,2014,27365,Diesel,Manual,First,28.4 kmpl,1248 CC,74 bhp,5.0,7.88 Lakh,4.75
6015,Hyundai Xcent 1.1 CRDi S,Jaipur,2015,100000,Diesel,Manual,First,24.4 kmpl,1120 CC,71 bhp,5.0,,4.00
6016,Mahindra Xylo D4 BSIV,Jaipur,2012,55000,Diesel,Manual,Second,14.0 kmpl,2498 CC,112 bhp,8.0,,2.90
6017,Maruti Wagon R VXI,Kolkata,2013,46000,Petrol,Manual,First,18.9 kmpl,998 CC,67.1 bhp,5.0,,2.65


In [118]:
def prix_euro(df, unites_par_prix=100000, taux_conversion=0.011):
    """Replace the ``Price`` column with a converted ``prix_euro`` column.

    The result is computed as ``Price * unites_par_prix * taux_conversion``.
    NOTE(review): the defaults presumably mean "Price is in lakh (100 000)
    rupees" and "1 INR ~= 0.011 EUR" — confirm the unit and the rate.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a numeric ``Price`` column.
    unites_par_prix : float, default 100000
        Multiplier turning one unit of ``Price`` into base currency units.
    taux_conversion : float, default 0.011
        Multiplier turning base currency units into euros.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``prix_euro`` appended and ``Price`` removed.
        The frame passed in is left untouched.

    Raises
    ------
    KeyError
        If ``df`` has no ``Price`` column.
    """
    # Same multiplication order as before so floating-point results are identical.
    euros = df["Price"] * unites_par_prix * taux_conversion
    # assign/drop both return new frames: the caller's DataFrame is never
    # modified, so re-running the pipeline on the same input stays idempotent.
    return df.assign(prix_euro=euros).drop(columns=["Price"])

# Wrap prix_euro in a FunctionTransformer so it can be used as a Pipeline step;
# validate=False is passed so the DataFrame reaches the function as-is.
prix_euro_transformer = FunctionTransformer(prix_euro, validate=False)

In [119]:
def supprimer_lignes_cheres(df, seuil=66000):
    """Remove the rows whose ``prix_euro`` is greater than or equal to ``seuil``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``prix_euro`` column.
    seuil : float, default 66000
        Exclusive upper bound: only rows with ``prix_euro < seuil`` (or a
        missing ``prix_euro``) are kept.

    Returns
    -------
    pandas.DataFrame
        A new frame without the expensive rows; ``df`` itself is not modified.
    """
    trop_cher = df["prix_euro"] >= seuil
    # Filter with a boolean mask rather than dropping by index label: with a
    # non-unique index, label-based drop would also delete cheap rows that
    # share a label with an expensive one. NaN compares False, so rows with a
    # missing price are kept, as before.
    return df.loc[~trop_cher].copy()

# Transformer wrapping supprimer_lignes_cheres so it can be used as a Pipeline step.
supprimer_lignes_cheres_transformer = FunctionTransformer(supprimer_lignes_cheres, validate=False)

In [120]:
def split_name(nom_voiture):
    """Split a car name into its brand and model.

    The name is split on whitespace. The brand is the first word and the model
    the second one, except for names that start with "Land Rover": the brand is
    then "Land Rover" and the model is the following word, or "Range Rover"
    when the name continues with those two words.

    Parameters
    ----------
    nom_voiture : str
        Full car name, e.g. ``"Maruti Wagon R LXI CNG"``. An empty string or a
        non-string value (such as NaN) yields an empty brand and model.

    Returns
    -------
    pandas.Series
        Two elements: ``[brand, model]``. ``model`` is ``''`` when the name has
        no word after the brand.
    """
    # Non-string input (e.g. NaN for a missing name) is treated as "no words".
    mots = nom_voiture.split() if isinstance(nom_voiture, str) else []

    # The two-word brand is matched as a prefix: checking for the words
    # anywhere in the name would mis-split names that merely contain them.
    if mots[:2] == ['Land', 'Rover']:
        brand = 'Land Rover'
        # "Range Rover" is itself a two-word model name.
        nb_mots_modele = 2 if mots[2:4] == ['Range', 'Rover'] else 1
        model = ' '.join(mots[2:2 + nb_mots_modele])
    else:
        brand = mots[0] if mots else ''
        model = mots[1] if len(mots) > 1 else ''
    return pd.Series([brand, model])

def ajouter_colonnes_marque_modele(df):
    """Add ``Brand`` and ``Model`` columns derived from the ``Name`` column.

    Each name is passed to ``split_name``, whose two returned values fill the
    ``Brand`` and ``Model`` columns respectively.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``Name`` column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the two extra columns; ``df`` itself is not
        modified.
    """
    # Work on a copy so the caller's frame is not altered as a side effect.
    df = df.copy()
    df[['Brand', 'Model']] = df['Name'].apply(split_name)
    return df


ajouter_colonnes_marque_modeles = FunctionTransformer(ajouter_colonnes_marque_modele, validate=False) ## transformer to be used as a step of the pipeline

In [121]:
def lowercase_categorical_columns(df):
    """Lowercase every string value in the object/category columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; columns of other dtypes are left as they are.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` whose object/category columns have their string
        values lowercased. Non-string values in those columns (numbers,
        missing values) are preserved unchanged. ``df`` itself is not modified.
    """
    df = df.copy()
    for col in df.select_dtypes(include=['object', 'category']).columns:
        # Only strings are lowercased: the .str accessor would turn every
        # non-string cell of a mixed column into NaN.
        df[col] = df[col].astype(object).map(
            lambda valeur: valeur.lower() if isinstance(valeur, str) else valeur
        )
    return df
mettre_lowers = FunctionTransformer(lowercase_categorical_columns, validate=False) ## transformer to be used as a step of the pipeline

In [122]:
def drop_duplicated(df):
    """Return ``df`` without its duplicated rows.

    Delegates to ``DataFrame.drop_duplicates`` with its default arguments, so
    the input frame is left as it is and a new frame is returned.
    """
    return df.drop_duplicates()
    

supprimer_doublons = FunctionTransformer(drop_duplicated, validate=False) ## transformer to be used as a step of the pipeline

In [123]:
def convert_to_float(df):
    """Convert the ``Mileage``, ``Engine`` and ``Power`` columns to floats.

    For each of these columns that is present and not already numeric, the
    first decimal number found in each value is kept (``"58.16 bhp"`` becomes
    ``58.16``). Values without any number (``"null bhp"``, missing values)
    become NaN.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; the three columns are optional.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the available columns converted to a numeric
        dtype. ``df`` itself is not modified.
    """
    df = df.copy()
    # One number: digits with an optional fractional part, or a bare fraction
    # such as ".5". Unlike a plain character class of digits and dots, this can
    # never capture a lone "." or a sequence like "1.2.3", which float
    # conversion would reject.
    motif_nombre = r'(\d+(?:\.\d*)?|\.\d+)'
    for col in ('Mileage', 'Engine', 'Power'):
        if col not in df.columns:
            continue
        valeurs = df[col]
        # Any non-numeric dtype (object, string, category) goes through the
        # extraction; casting to str first keeps numbers stored in a mixed
        # column instead of turning them into NaN.
        if not pd.api.types.is_numeric_dtype(valeurs):
            valeurs = valeurs.astype(str).str.extract(motif_nombre, expand=False)
        df[col] = pd.to_numeric(valeurs, errors='coerce')
    return df

convertir_numerique = FunctionTransformer(convert_to_float, validate=False) ## transformer to be used as a step of the pipeline

In [124]:
from sklearn.pipeline import Pipeline

In [125]:
# Feature-engineering pipeline. Steps run in the listed order, each receiving
# the DataFrame returned by the previous one:
#   1. convert Price into prix_euro,
#   2. remove the most expensive rows (this step needs prix_euro from step 1),
#   3. derive Brand / Model from Name,
#   4. lowercase the text columns,
#   5. drop duplicated rows,
#   6. turn Mileage / Engine / Power into numeric columns.
# NOTE(review): the frame still carries the "Unnamed: 0" column when step 5
# runs; if that column is a unique row id, no row can ever be detected as a
# duplicate — confirm whether it should be dropped beforehand.
feature_eda = Pipeline([
    ("prix_euro_transformer",prix_euro_transformer),
    ("supprimer_lignes_cheres_transformer",supprimer_lignes_cheres_transformer),
    ("ajouter_colonnes_marque_modeles",ajouter_colonnes_marque_modeles),
    ("mettre_lowers",mettre_lowers),
    ("supprimer_doublons",supprimer_doublons),
    ("convertir_numerique",convertir_numerique),
])

In [126]:
# Run every step of the pipeline on the raw frame and keep the transformed result.
new_data = feature_eda.fit_transform(data)

In [127]:
# Display the transformed frame as the cell output to check the new columns.
new_data

Unnamed: 0,Name,Location,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,prix_euro,Brand,Model
0,maruti wagon r lxi cng,mumbai,2010,72000,cng,manual,first,26.60,998.0,58.16,5.0,,1925.0,maruti,wagon
1,hyundai creta 1.6 crdi sx option,pune,2015,41000,diesel,manual,first,19.67,1582.0,126.20,5.0,,13750.0,hyundai,creta
2,honda jazz v,chennai,2011,46000,petrol,manual,first,18.20,1199.0,88.70,5.0,8.61 lakh,4950.0,honda,jazz
3,maruti ertiga vdi,chennai,2012,87000,diesel,manual,first,20.77,1248.0,88.76,7.0,,6600.0,maruti,ertiga
4,audi a4 new 2.0 tdi multitronic,coimbatore,2013,40670,diesel,automatic,second,15.20,1968.0,140.80,5.0,,19514.0,audi,a4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6014,maruti swift vdi,delhi,2014,27365,diesel,manual,first,28.40,1248.0,74.00,5.0,7.88 lakh,5225.0,maruti,swift
6015,hyundai xcent 1.1 crdi s,jaipur,2015,100000,diesel,manual,first,24.40,1120.0,71.00,5.0,,4400.0,hyundai,xcent
6016,mahindra xylo d4 bsiv,jaipur,2012,55000,diesel,manual,second,14.00,2498.0,112.00,8.0,,3190.0,mahindra,xylo
6017,maruti wagon r vxi,kolkata,2013,46000,petrol,manual,first,18.90,998.0,67.10,5.0,,2915.0,maruti,wagon


In [128]:
# Save the transformed frame next to the notebook (relative path);
# index=False leaves the DataFrame index out of the CSV file.
new_data.to_csv("data_avec_FE.csv", index=False)