# Library imports

In [6]:
import warnings
warnings.filterwarnings('ignore')

import os
import re
import zipfile
from datetime import datetime

import numpy as np
import pandas as pd
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import FeatureUnion, make_pipeline
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn_pandas import DataFrameMapper, CategoricalImputer

# Data Source

In [2]:
# Locations are resolved from the current working directory: the dataset is
# expected in a "data" folder under the parent of that directory.
NOTEBOOK_DIR = os.getcwd()
BASE_DIR = os.path.dirname(NOTEBOOK_DIR)
DATA_DIR = os.path.join(BASE_DIR, "data")
# Despite the "_DIR_PATH" suffix, these two are file paths (the CSV and its zip).
DATA_DIR_PATH, DATA_ZIP_DIR_PATH = (
    os.path.join(DATA_DIR, file_name)
    for file_name in ("Data_cars.csv", "Data_cars.csv.zip")
)

In [3]:
# Extract the CSV from its archive on first run, then load it.
# The previous shell call used `${...}`, which is not IPython's interpolation
# syntax (`$name` / `{name}`), and `unzip` extracts into the working directory
# rather than DATA_DIR, so the read below could not find the file.
# NOTE(review): assumes the archive stores Data_cars.csv at its root — confirm.
if not os.path.exists(DATA_DIR_PATH):
    with zipfile.ZipFile(DATA_ZIP_DIR_PATH) as archive:
        archive.extractall(DATA_DIR)

df = pd.read_csv(DATA_DIR_PATH)

df.head()

Unnamed: 0,Price,Make,Model,Model_year,Mileage,Fuel,Gearbox,Online,Description
0,11220.0,PEUGEOT,308,2014,94341.0 km,Diesel,mécanique,06/07/2018 à 3h47,"modele: 308 (2E GENERATION), version: 1.6 HDI ..."
1,57526.0,BMW,X6,2015,39051.0 km,Diesel,automatique,03/04/2018 à 16h41,"modele: X6 F16, version: (F16) XDRIVE30D 258 M..."
2,80379.0,AUDI,RS6,2014,75381.0 km,Essence,automatique,30/07/2018 à 1h55,"modele: RS6 (3E GENERATION) AVANT, version: II..."
3,2830.0,FORD,KA,2007,92282.0 km,Essence,mécanique,09/07/2018 à 14h12,"modele: KA, version: 1.3 70 FUN, puissance_fis..."
4,12925.0,FIAT,TIPO,2018,10.0 km,Essence,mécanique,19/05/2018 à 3h52,"modele: TIPO 2 SW, version: II SW 1.4 95 POP, ..."


In [4]:
# Column dtypes and non-null counts of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166695 entries, 0 to 166694
Data columns (total 9 columns):
Price          166695 non-null float64
Make           166695 non-null object
Model          166695 non-null object
Model_year     166695 non-null int64
Mileage        166695 non-null object
Fuel           166695 non-null object
Gearbox        166695 non-null object
Online         166695 non-null object
Description    166695 non-null object
dtypes: float64(1), int64(1), object(7)
memory usage: 11.4+ MB


# Industrializing the Machine Learning Model

In [5]:
class DateOnlineEncoder(TransformerMixin):
    """Parse listing timestamps such as ``06/07/2018 à 3h47`` into ``datetime`` objects.

    Stateless: ``fit`` learns nothing. ``transform`` reads the first value of
    every row and returns a 1-D array with one parsed ``datetime`` per row.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self`` so the transformer can be chained."""
        return self

    def transform(self, X):
        """Parse the first column of ``X`` using the ``%d/%m/%Y à %Hh%M`` layout."""
        assert isinstance(X, pd.DataFrame)
        parsed = []
        for row in X.values:
            parsed.append(datetime.strptime(row[0], "%d/%m/%Y à %Hh%M"))
        return np.array(parsed)
    
class AgeFeature(TransformerMixin):
    """Compute ``|listing year - model year|`` for every row.

    Expects a DataFrame whose first column holds objects exposing ``.year``
    (already-parsed listing dates) and whose second column holds the model year.
    Returns a 1-D array of the absolute differences.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    def transform(self, X):
        """Return the absolute year gap between column 0 and column 1."""
        assert isinstance(X, pd.DataFrame)
        listed_on = X.iloc[:, 0].values
        model_years = X.iloc[:, 1].values
        ages = []
        for when, model_year in zip(listed_on, model_years):
            ages.append(np.abs(when.year - model_year))
        return np.array(ages)

class MileageClean(TransformerMixin):
    """Convert mileage strings such as ``"94341.0 km"`` into a float column.

    Stateless transformer. Missing entries (``None`` / ``NaN``) are returned as
    ``NaN`` instead of raising, so that an imputer chained after this step can
    actually fill them.
    """

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    @staticmethod
    def _to_float(raw):
        """Strip trailing ``k``/``m`` characters and whitespace, then cast to float."""
        if pd.isnull(raw):
            return np.nan
        return float(str(raw).rstrip('km').strip())

    def transform(self, X):
        """Return an ``(n, 1)`` float array with one cleaned mileage per input value.

        ``X`` only needs a ``.values`` attribute (Series or DataFrame).
        """
        flat = np.asarray(X.values).reshape(-1)
        cleaned = [self._to_float(raw) for raw in flat]
        return np.array(cleaned, dtype=float).reshape(-1, 1)
    
class DesciptionClean(TransformerMixin):
    """Split the free-text ``Description`` field into five columns.

    Each description is expected to look like
    ``modele: ..., version: ..., puissance_fiscale: ..., portes: ..., options: ..., couleur: ...``.
    For every row the transformer returns, in this order:

    0. the first decimal number found in the version text (``NaN`` if none),
    1. the first 1-3 digit number preceded by whitespace that is left in the
       version text after the decimal numbers were removed (``NaN`` if none),
    2. ``puissance_fiscale`` converted with ``pd.to_numeric``,
    3. ``portes`` converted with ``pd.to_numeric``,
    4. ``couleur`` lower-cased.

    Rows whose description is not a string, or does not follow the layout,
    yield five ``NaN`` values instead of aborting the whole transform.
    The misspelt class name is kept because other cells refer to it.
    """

    # Overall layout of one description string.
    _DESCRIPTION_RE = re.compile(
        r"modele:\s*(?P<modele>.*?(?=,)),\sversion:\s*(?P<version>.*?(?=,)),\spuissance_fiscale:\s*(?P<puissance_fiscale>.*?(?=,)),\sportes:\s*(?P<portes>.*?(?=,)),\soptions:\s*(?P<Descriptions>.*?(?=,)),\scouleur:\s(?P<couleur>.*$)"
    )
    # Decimal number with "." or "," as separator (feature 0).
    _CYLINDER_RE = re.compile(r"\d+[\.,]\d+")
    # Whitespace followed by a 1-3 digit number (feature 1).
    _CV_RE = re.compile(r"\s+\d{1,3}\s?")
    _N_FEATURES = 5

    def fit(self, X, y=None):
        """No-op; returns ``self``."""
        return self

    @classmethod
    def _parse_row(cls, text):
        """Return the five features for one description, or five NaNs if unparseable."""
        match = cls._DESCRIPTION_RE.search(text) if isinstance(text, str) else None
        if match is None:
            return [np.nan] * cls._N_FEATURES

        version = match.group("version")
        # Dataset-specific clean-ups, reproduced as they were.
        if version == 'ii allurehdifap2.0150cv':
            version = 'ii allurehdifap 2.0 150cv'
        # Drop mileage-like tokens ("12.5km") so they are not read as a decimal number.
        version = re.sub(r"\d+[\.,]\d+km", "", version)
        # NOTE(review): the dots below are unescaped and therefore match any
        # character; left untouched to keep the extracted values unchanged.
        version = re.sub("(159.226|76.538|87.480|71.000)", "", version)

        decimals = cls._CYLINDER_RE.findall(version)
        version = cls._CYLINDER_RE.sub("", version)
        # Drop tokens such as "5p" before looking for the 1-3 digit number.
        version = re.sub(r"\d+p", "", version)
        integers = cls._CV_RE.findall(version)

        cl = float(decimals[0].strip().replace(",", ".")) if decimals else np.nan
        cv = int(float(integers[0].strip())) if integers else np.nan
        return [
            cl,
            cv,
            pd.to_numeric(match.group("puissance_fiscale")),
            pd.to_numeric(match.group("portes")),
            str(match.group("couleur")).lower(),
        ]

    def transform(self, X):
        """Parse the first column of ``X``; returns a DataFrame with one row per input row."""
        assert isinstance(X, pd.DataFrame)
        # Materialise the column once instead of rebuilding X.values for every row.
        descriptions = X.iloc[:, 0].values
        return pd.DataFrame.from_records([self._parse_row(text) for text in descriptions])

class LinearRegressorImputer(TransformerMixin):
    """Fill missing values of the first column with a linear regression on ``Price``.

    Expects a DataFrame whose first column is the feature to impute and which
    also contains a ``Price`` column. The regression is learnt in ``fit`` from
    the rows where the first column is known; ``transform`` predicts the
    missing entries and returns a filled copy (the input is not modified).
    If ``transform`` is called on an unfitted instance, the regression is
    learnt from the frame being transformed.

    NOTE(review): ``Price`` looks like the quantity the final model predicts;
    imputing features from it would leak the target and is unavailable at
    prediction time — confirm this is intended.
    """

    @staticmethod
    def _fit_model(X):
        """Regress the first column on ``Price`` using only the complete rows."""
        known = X.iloc[:, 0].notnull().values
        prices = X.loc[known, "Price"].values.reshape(-1, 1)
        targets = X.iloc[:, 0].values[known]
        return LinearRegression().fit(prices, targets)

    def fit(self, X, y=None):
        """Learn the regression of the first column on ``Price``; returns ``self``."""
        self.model_ = self._fit_model(X)
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose first column has its gaps filled by predictions."""
        filled = X.copy()
        missing = filled.iloc[:, 0].isnull().values
        if not missing.any():
            # Nothing to impute; predicting on an empty array would raise.
            return filled
        model = getattr(self, "model_", None)
        if model is None:
            model = self._fit_model(X)
        prices = filled.loc[missing, "Price"].values.reshape(-1, 1)
        # Positional assignment: the target column is identified by position, not label.
        filled.iloc[missing, 0] = np.asarray(model.predict(prices)).reshape(-1)
        return filled

class CategoryType(TransformerMixin):
    """Stateless transformer that casts its input to the ``object`` dtype."""

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Return ``X`` re-typed through ``astype("object")``."""
        # No DataFrame check here: anything exposing ``astype`` is accepted.
        as_object = X.astype("object")
        return as_object

class TypeSelector(BaseEstimator, TransformerMixin):
    """Keep only the DataFrame columns of a given dtype.

    Parameters
    ----------
    dtype : the dtype (or dtype name) handed to ``DataFrame.select_dtypes``.
    """

    def __init__(self, dtype):
        self.dtype = dtype

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self``."""
        return self

    def transform(self, X):
        """Return the sub-frame of ``X`` made of the columns matching ``self.dtype``."""
        assert isinstance(X, pd.DataFrame)
        wanted = [self.dtype]
        return X.select_dtypes(include=wanted)

# Random-forest hyper-parameters (the name suggests they come from a
# cross-validated search, which is not shown here).
best_param_cv = {
    'bootstrap': True,
    'max_depth': 80,
    'max_features': 2,
    'min_samples_split': 8,
    'min_samples_leaf': 3,
    'n_estimators': 100,
    # Fixed seed: bootstrapping and feature sub-sampling are random, so without
    # it every run builds a different forest and reports different scores.
    'random_state': 42,
}
# NOTE(review): X_train, y_train, X_test, y_test and evaluate() are not defined
# in the visible part of this notebook; they must exist before this cell runs.
rf = RandomForestRegressor(**best_param_cv)
rf.fit(X_train, y_train.values)
evaluate(rf, X_test, y_test.values)

In [7]:
# First pass: clean the raw columns one by one. Columns that are not listed
# (here Price and Description) are passed through unchanged via default=None.
preprocessing_steps = [
    (["Online"], DateOnlineEncoder()),
    ("Make", [CategoricalImputer(), CategoryType()]),
    ("Model", [CategoryType(), CategoricalImputer()]),
    ("Model_year", CategoricalImputer()),
    ("Mileage", [MileageClean(), Imputer(strategy='mean')]),
    ("Fuel", [CategoryType(), CategoricalImputer()]),
    ("Gearbox", [CategoryType(), CategoricalImputer()]),
]
preprocessing_mapper = DataFrameMapper(
    preprocessing_steps,
    input_df=True,
    df_out=True,
    default=None,
)

data_preprocessing = preprocessing_mapper.fit_transform(df)
data_preprocessing.head()

Unnamed: 0,Online,Make,Model,Model_year,Mileage,Fuel,Gearbox,Price,Description
0,2018-07-06 03:47:00,PEUGEOT,308,2014,94341.0,Diesel,mécanique,11220.0,"modele: 308 (2E GENERATION), version: 1.6 HDI ..."
1,2018-04-03 16:41:00,BMW,X6,2015,39051.0,Diesel,automatique,57526.0,"modele: X6 F16, version: (F16) XDRIVE30D 258 M..."
2,2018-07-30 01:55:00,AUDI,RS6,2014,75381.0,Essence,automatique,80379.0,"modele: RS6 (3E GENERATION) AVANT, version: II..."
3,2018-07-09 14:12:00,FORD,KA,2007,92282.0,Essence,mécanique,2830.0,"modele: KA, version: 1.3 70 FUN, puissance_fis..."
4,2018-05-19 03:52:00,FIAT,TIPO,2018,10.0,Essence,mécanique,12925.0,"modele: TIPO 2 SW, version: II SW 1.4 95 POP, ..."


In [8]:
# Second pass: derive extra features from the cleaned frame.
#   - "age": absolute gap between the listing year and the model year (AgeFeature)
#   - "Model_year": kept as is (transformer None)
#   - "Description": expanded by DesciptionClean into Description_0 .. Description_4
# Remaining columns are passed through unchanged via default=None.
feature_steps = [
    (["Online", "Model_year"], AgeFeature(), {'alias': 'age'}),
    (["Model_year"], None),
    (["Description"], DesciptionClean()),
]
features_mapper = DataFrameMapper(
    feature_steps,
    input_df=True,
    df_out=True,
    default=None,
)

data_extra_features = features_mapper.fit_transform(data_preprocessing)
data_extra_features.head()

Unnamed: 0,age,Model_year,Description_0,Description_1,Description_2,Description_3,Description_4,Make,Model,Mileage,Fuel,Gearbox,Price
0,4,2014,1.6,92.0,4,5.0,blanc banquise,PEUGEOT,308,94341.0,Diesel,mécanique,11220.0
1,3,2015,,258.0,16,5.0,carbonschwarz metallise,BMW,X6,39051.0,Diesel,automatique,57526.0
2,4,2014,4.0,560.0,47,5.0,noir panthere cristal,AUDI,RS6,75381.0,Essence,automatique,80379.0
3,11,2007,1.3,70.0,5,3.0,jaune,FORD,KA,92282.0,Essence,mécanique,2830.0
4,0,2018,1.4,95.0,5,5.0,blanc verni,FIAT,TIPO,10.0,Essence,mécanique,12925.0


In [None]:
# Impute the numeric features parsed from the description.
# Description_0 / Description_1 are filled by LinearRegressorImputer, which
# regresses the first listed column on "Price" and needs the DataFrame itself
# (hence input_df=True). Description_2 / Description_3 use the column mean;
# they are selected with one-element lists so Imputer receives 2-D input.
# NOTE(review): each LinearRegressorImputer entry returns both of its input
# columns, so "Price" appears in the output of both entries — confirm how the
# result should be reassembled. Using Price to impute features also looks like
# target leakage — confirm.
imputer_extra_mapper = DataFrameMapper([
    (["Description_0", "Price"], LinearRegressorImputer()),
    (["Description_1", "Price"], LinearRegressorImputer()),
    (["Description_2"], Imputer(strategy="mean")),
    (["Description_3"], Imputer(strategy="mean"))
], input_df=True)

In [35]:
# Show only the columns that contain at least one missing value.
# NOTE(review): data_transform is not created in the visible cells — confirm its origin.
data_transform.loc[:, data_transform.isna().any()]

Unnamed: 0,Description_0,Description_1,Description_3
0,1.6,92.0,5.0
1,,258.0,5.0
2,4.0,560.0,5.0
3,1.3,70.0,3.0
4,1.4,95.0,5.0
5,,313.0,5.0
6,2.0,140.0,5.0
7,1.5,110.0,5.0
8,1.6,115.0,5.0
9,,63.0,4.0


In [None]:
# TODO: unfinished placeholder — the feature list holds only an empty string,
# so there is no (columns, transformer) entry for the mapper to use yet.
imputer_mapper = DataFrameMapper([
    ("")
])

In [11]:
# Dtypes and non-null counts after feature extraction.
data_transform.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 166695 entries, 0 to 166694
Data columns (total 12 columns):
age              166695 non-null int64
Make             166695 non-null object
Model            166695 non-null object
Model_year       166695 non-null int64
Mileage          166695 non-null float64
Fuel             166695 non-null object
Gearbox          166695 non-null object
Description_0    136637 non-null float64
Description_1    159977 non-null float64
Description_2    166695 non-null int64
Description_3    165406 non-null float64
Description_4    166695 non-null object
dtypes: float64(4), int64(3), object(5)
memory usage: 15.3+ MB


In [12]:
# Preview the first rows of the transformed frame.
data_transform.head()

Unnamed: 0,age,Make,Model,Model_year,Mileage,Fuel,Gearbox,Description_0,Description_1,Description_2,Description_3,Description_4
0,4,PEUGEOT,308,2014,94341.0,Diesel,mécanique,1.6,92.0,4,5.0,blanc banquise
1,3,BMW,X6,2015,39051.0,Diesel,automatique,,258.0,16,5.0,carbonschwarz metallise
2,4,AUDI,RS6,2014,75381.0,Essence,automatique,4.0,560.0,47,5.0,noir panthere cristal
3,11,FORD,KA,2007,92282.0,Essence,mécanique,1.3,70.0,5,3.0,jaune
4,0,FIAT,TIPO,2018,10.0,Essence,mécanique,1.4,95.0,5,5.0,blanc verni


In [13]:
# Number of missing values per column.
data_transform.isnull().sum()

age                  0
Make                 0
Model                0
Model_year           0
Mileage              0
Fuel                 0
Gearbox              0
Description_0    30058
Description_1     6718
Description_2        0
Description_3     1289
Description_4        0
dtype: int64

In [28]:
# Take the rows labelled 0 and 1 of the raw frame as a {column: [values]} dict,
# to be used as a small sample input.
data_point = df.loc[0:1,:].to_dict('list')
data_point

{'Price': [11220.0, 57526.0],
 'Make': ['PEUGEOT', 'BMW'],
 'Model': ['308', 'X6'],
 'Model_year': [2014, 2015],
 'Mileage': ['94341.0 km', '39051.0 km'],
 'Fuel': ['Diesel', 'Diesel'],
 'Gearbox': ['mécanique', 'automatique'],
 'Online': ['06/07/2018 à 3h47', '03/04/2018 à 16h41'],
 'Description': ["modele: 308 (2E GENERATION), version: 1.6 HDI FAP 92CH BUSINESS 5P, puissance_fiscale: 4, portes: 5.0, options: banquette arrière 3 places;volant cuir;cache bagages;airbag frontal;banquette 1/3 - 2/3;sièges rang 2 rabattables à plat;vitres teintées;boucliers av et ar couleur caisse;phares av. de jour à LED;airbags rideaux;rétroviseurs électriques et dégivrants;siège conducteur avec réglage lombaire;vitres électriques;feux de freinage d'urgence;banquette rabattable;feux ar. à LED;AFU;contrôle de pression des pneus;airbags front. + lat.;blanc banquise;lampes de lecture à l'arrière;sièges réglables en hauteur;alarme d'oubli d'extinction des feux;EBD;volant sport;température extérieure;système

In [30]:
# Run the two-row sample through the mapper.
# NOTE(review): data_mapper is not defined in the visible cells — confirm where it is built.
data_mapper.transform(pd.DataFrame(data_point))

Unnamed: 0,age,Make,Model,Model_year,Mileage,Fuel,Gearbox,Description_0,Description_1,Description_2,Description_3,Description_4
0,4,PEUGEOT,308,2014,94341.0,Diesel,mécanique,1.6,92,4,5.0,blanc banquise
1,3,BMW,X6,2015,39051.0,Diesel,automatique,,258,16,5.0,carbonschwarz metallise


In [60]:
# Select the object-dtype columns of data_transform.
# Note that feature_category only keeps the first rows (.head()), not the full selection.
ts = TypeSelector("object")
feature_category = ts.fit_transform(data_transform).head()
feature_category

Unnamed: 0,Make,Model,Fuel,Gearbox,Description_4
0,PEUGEOT,308,Diesel,mécanique,blanc banquise
1,BMW,X6,Diesel,automatique,carbonschwarz metallise
2,AUDI,RS6,Essence,automatique,noir panthere cristal
3,FORD,KA,Essence,mécanique,jaune
4,FIAT,TIPO,Essence,mécanique,blanc verni


## Pipeline

In [None]:
# Encode every object-typed column as integer codes.
# OrdinalEncoder is used instead of LabelEncoder: LabelEncoder is meant for a
# 1-D target (its fit_transform accepts only `y`), so it fails when used as a
# Pipeline step on a feature matrix, and it was never imported.
# NOTE(review): this union has a single branch selecting object columns, so
# numeric columns do not reach the regressor — confirm whether a numeric branch
# is still to be added.
preprocess_pipeline = make_pipeline(
    FeatureUnion(transformer_list=[
        ("categorical_features", make_pipeline(
            TypeSelector("object"),
            OrdinalEncoder(),
        ))
    ])
)

# Full chain: feature mapping -> encoding -> random forest.
# NOTE(review): data_mapper is not defined in the visible cells — it must be
# built before this cell runs.
regression_pipeline = make_pipeline(
    data_mapper,
    preprocess_pipeline,
    RandomForestRegressor(**best_param_cv)
)