In [65]:
import numpy as np
import pandas as pd
import pymysql as ms
import sklearn.preprocessing as sp
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestRegressor

import sklearn.metrics as met

from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

In [71]:
data = pd.read_csv('turbo_az_data.csv')

In [72]:
data['qiymet_azn'] = np.select(
    condlist=[
        data.qiymet_valyuta == '$', data.qiymet_valyuta == 'AZN', data.qiymet_valyuta == '€'],
    choicelist=[
        data.qiymet * 1.7, data.qiymet, data.qiymet * 2.1],
    default=data.qiymet)

columns_drop = ['tel1', 'tel2', 'tel3', 'tel4', 'tel5', 'tel6', 'sheher', 'elan_tarix', 'elan_id', 'satici',
                'date',
                'id', 'avtosalon',
                'baxis_sayi',
                'kredit', 'barter', 'info', 'satilib', 'elan_tarix_title']


In [106]:
data.head()

Unnamed: 0.1,Unnamed: 0,id,date,avtosalon,salon_yn,marka,model,shekil_sayi,satilib,baxis_sayi,...,qapanma,radar,kondisioner,podoqrev,deri,ksenon,kamera,perde,ventilyasiya,qiymet_azn
0,0,1,2018-03-08 22:12:54,none,0,Mercedes,C 200,9,yox,107,...,1,0,1,0,0,1,0,0,0,9750.0
1,1,2,2018-03-08 22:12:54,none,0,Mitsubishi,Pajero,5,yox,78,...,1,1,1,0,0,0,1,0,0,13800.0
2,2,3,2018-03-08 22:12:54,none,0,Mercedes,C 220,7,yox,118,...,1,0,1,0,0,0,0,0,0,9900.0
3,3,4,2018-03-08 22:12:54,none,0,Volkswagen,Touareg,10,yox,1639,...,1,1,1,1,1,1,1,1,1,32810.0
4,4,5,2018-03-08 22:12:54,none,0,Hyundai,Sonata,8,yox,227,...,1,1,1,1,1,0,0,1,0,18700.0


In [74]:
class CustomCleaner(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, _X, y=None):
        X = _X.copy(deep=True)
        print(X.yurush.dtype)
        X['yurush'] = X.yurush.str.strip('km').str.replace(' ', '').astype('float32')
        X['muherrik_guc'] = X.muherrik_guc.str.strip(' a.g.').str.replace(' ', '').astype('float16')
        X['car_age'] = 2018 - X.il
        return X

In [75]:
class CustomPreprocessor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, _X, y=None):
        X = _X.copy(deep=True)
        X[['qiymet_azn', 'yurush']] = X[['qiymet_azn', 'yurush']] / 1000
        return X

In [76]:
class CustomOneHotEncoder(BaseEstimator, TransformerMixin):
    def __init__(self, feature_names):
        self.feature_names = feature_names

    def fit(self, X, y=None):
        return self

    def transform(self, _X, y=None):
        X = _X.copy(deep=True)
        for feature in self.feature_names:
            cols = pd.get_dummies(X[feature], drop_first=True, prefix=f'{feature}').columns.values
            X[cols] = pd.get_dummies(X[feature], drop_first=True, prefix=f'{feature}')
            X = X.drop([feature], axis=1)
        return X

In [77]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    def __init__(self, feature_names):
        self.feature_names = feature_names

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        return X.loc[:, self.feature_names].copy(deep=True)

In [78]:
class AverageCategoricalConverter(BaseEstimator, TransformerMixin):
    def __init__(self, feature_names):
        self.feature_names = feature_names

    def fit(self, X, y=None):
        return self

    def transform(self, _X, y=None):
        X = _X.copy(deep=True)
        for feature in self.feature_names:
            feature_avg = X[[feature, 'qiymet_azn']].groupby(by=feature).mean()
            feature_avg.rename(columns={'qiymet_azn': f'{feature}_avg'}, inplace=True)
            X = X.merge(feature_avg, how='left', left_on=feature, right_index=True).drop([feature], axis=1)
        return X


In [79]:
class PolynomialFeaturesGenerator(BaseEstimator, TransformerMixin):
    def __init__(self, pol_features):
        self.pol_features = pol_features

    def fit(self, X, y=None):
        return self

    def transform(self, _X, y=None):
        X = _X.copy(deep=True)

        if self.pol_features is None:
            pols = X
            self.pol_features = pols.columns
        else:
            pols = X[self.pol_features]

        poly = sp.PolynomialFeatures()

        poly_df = pd.DataFrame(poly.fit_transform(pols), index=X.index)
        poly_df = poly_df.iloc[:, len(self.pol_features) + 1:]
        cols = []
        for i in range(len(self.pol_features)):
            for j in range(i, len(self.pol_features)):
                cols.append(self.pol_features[i] + '_' + self.pol_features[j])
        poly_df.columns = cols
        X = X.join(poly_df, how='left')
        return X

In [80]:
class ColumnDropper():
    def __init__(self, columns):
        self.columns = columns

    def transform(self, _X, y=None):
        X = _X.copy(deep=True)
        return X.drop(self.columns, axis=1)

    def fit(self, X, y=None):
        return self

In [81]:
features = ['muherrik_hecm', 'muherrik_guc',
            'yurush', 'car_age', 'marka_avg', 'yanacaq_avg', 'ban_novu_avg', 'reng_avg']
target = 'qiymet_azn'

In [82]:
train, test = train_test_split(data, test_size=0.25, random_state=5)

In [108]:
train.shape, test.shape

((19251, 51), (6417, 51))

In [84]:
pipe = Pipeline([
    ('column_dropper1', ColumnDropper(columns_drop)),
    ('cleaner', CustomCleaner()),
    ('onehot_encoder', CustomOneHotEncoder(['oturucu', 'karobka', 'yeni'])),
    ('categorical_averageprice', AverageCategoricalConverter(['marka', 'yanacaq', 'ban_novu', 'reng', 'model'])),
    ('column_dropper2', ColumnDropper(['qiymet', 'qiymet_azn', 'qiymet_valyuta', 'il'])),
    ('polynomial_features', PolynomialFeaturesGenerator(None)),
    ('scaler', RobustScaler(quantile_range=[0.1, 0.9])),
    ('estimator', RandomForestRegressor())
])

In [85]:
y_train = np.log(train[target])
y_test = np.log(test[target])

In [87]:
pipe.fit(train, y_train)

object


In [88]:
y_pred_train = pipe.predict(train)
y_pred_test = pipe.predict(test)

object
object


In [89]:
print("train mean error:", met.mean_absolute_error(train[target], np.exp(y_pred_train)),
      ", train median error:", met.median_absolute_error(train[target], np.exp(y_pred_train)))

train mean error: 939.4326283565636 , train median error: 307.9590054627042


In [90]:
print("test mean error:", met.mean_absolute_error(test[target], np.exp(y_pred_test)),
      ", test median error:", met.median_absolute_error(test[target], np.exp(y_pred_test)))

test mean error: 2772.5140894872993 , test median error: 942.0215420954428
