In [None]:
import os
import pickle

import dill
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymysql
import pymysql as ms
import requests
import seaborn as sns
from bs4 import BeautifulSoup

import sklearn.ensemble as ens
import sklearn.linear_model as lm
import sklearn.metrics as met
import sklearn.preprocessing as sp
import sklearn.svm as svm
import sklearn.tree as tree
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler

In [None]:
# Connection settings are read from the environment so that real credentials
# do not have to be committed together with the notebook. The fallbacks
# reproduce the original local-development values, so the cell still runs
# unchanged when no variable is set.
# NOTE(review): the fallback password is still plain text in the source;
# remove it once DS_DB_PASSWORD is provisioned wherever this notebook runs.
connection = pymysql.connect(
    host=os.environ.get('DS_DB_HOST', 'localhost'),  # 127.0.0.1
    password=os.environ.get('DS_DB_PASSWORD', '12345678'),
    database=os.environ.get('DS_DB_NAME', 'ds'),
)

In [None]:
def _format_float(value):
    """Render a float with exactly three digits after the decimal point."""
    return '%.3f' % value


# Use the three-decimal formatter for every float that pandas displays.
pd.set_option('display.float_format', _format_float)

In [None]:
# Load the whole `bina_db` table, index the rows by `elan_id` and discard
# the `id` column.
raw_listings = pd.read_sql('select * from bina_db', con=connection)
df = raw_listings.set_index('elan_id').drop(['id'], axis=1)

In [None]:
class ColumnDropper():
    """Pipeline step that removes a fixed set of columns from a DataFrame.

    NOTE(review): unlike the other transformers in this notebook, this class
    does not derive from BaseEstimator/TransformerMixin, so it has no
    get_params/set_params/fit_transform of its own - confirm that is intended.
    """

    def __init__(self, columns):
        # Column labels removed on every `transform` call.
        self.columns = columns

    def fit(self, X, y=None):
        """Learn nothing; return the instance so calls can be chained."""
        return self

    def transform(self, _X, y=None):
        """Return a deep copy of `_X` without the configured columns.

        The frame passed in is left untouched.
        """
        return _X.copy(deep=True).drop(self.columns, axis=1)

In [None]:
class FeatureSelector(BaseEstimator, TransformerMixin):
    """Transformer that keeps only a configured subset of columns."""

    def __init__(self, feature_names):
        # Column labels retained by `transform`.
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """Learn nothing; return the instance itself."""
        return self

    def transform(self, X, y=None):
        """Return a deep copy of the `feature_names` columns of `X`."""
        selected = X.loc[:, self.feature_names]
        return selected.copy(deep=True)

In [None]:
class AverageCategoricalConverter(BaseEstimator, TransformerMixin):
    """Append the per-category mean of the target column for given features.

    For every name in ``feature_names`` a new column ``<feature>_avg`` is
    added; it holds the mean of ``target`` over all rows that share the same
    value of that feature.

    Parameters
    ----------
    feature_names : list of str
        Categorical columns to encode.
    target : str, default 'qiymet'
        Column whose mean is computed per category. The default keeps the
        behaviour of the original one-argument constructor.

    NOTE(review): the means are computed from the very frame passed to
    ``transform`` (``fit`` learns nothing), so that frame must contain the
    target column and held-out rows would be encoded with their own target
    values - confirm this is acceptable for the evaluation below.
    """

    def __init__(self, feature_names, target='qiymet'):
        self.feature_names = feature_names
        self.target = target

    def fit(self, X, y=None):
        """Learn nothing; return the instance itself."""
        return self

    def transform(self, _X, y=None):
        """Return a copy of `_X` with one ``<feature>_avg`` column per feature."""
        X = _X.copy(deep=True)
        for feature in self.feature_names:
            # Mean target per category, indexed by the category value.
            feature_avg = (
                X[[feature, self.target]]
                .groupby(by=feature)
                .mean()
                .rename(columns={self.target: f'{feature}_avg'})
            )
            # Left merge against the group index keeps every input row.
            X = X.merge(feature_avg, how='left', left_on=feature, right_index=True)
        return X


In [None]:
class DataCleaner(BaseEstimator, TransformerMixin):
    """Convert the textual columns of a listing frame to numeric values."""

    def fit(self, X, y=None):
        """Learn nothing; return the instance itself."""
        return self

    def transform(self, _X, y=None):
        """Return a copy of `_X` with the parsed / cast columns.

        ``mertebe``, ``qiymet``, ``otaq_say`` and ``baxis_say`` become int,
        ``sahe`` and ``kvm_qiymet`` become float, and ``kupca`` / ``ipoteka``
        become 1 where the value equals 'var' and 0 otherwise.
        """

        def before_slash(text):
            # Part in front of the first '/', without surrounding whitespace.
            return text.split('/')[0].strip()

        def first_token(text):
            # Everything up to the first space.
            return text.split(' ')[0]

        def without_last_token(text):
            # All space-separated pieces except the final one, glued together.
            return ''.join(text.split(' ')[:-1])

        def is_var(value):
            return 1 if value == 'var' else 0

        X = _X.copy(deep=True)
        X['mertebe'] = X['mertebe'].apply(before_slash).astype(int)
        X['sahe'] = X['sahe'].apply(first_token).astype(float)
        X['qiymet'] = X['qiymet'].apply(without_last_token).astype(int)
        X['kvm_qiymet'] = X['kvm_qiymet'].apply(without_last_token).astype(float)
        for count_column in ('otaq_say', 'baxis_say'):
            X[count_column] = X[count_column].astype(int)
        for flag_column in ('kupca', 'ipoteka'):
            X[flag_column] = X[flag_column].apply(is_var)
        return X

In [None]:
class FeatureGenerator(BaseEstimator, TransformerMixin):
    """Derive extra columns from the raw (still textual) listing columns.

    Columns added by ``transform``:

    * ``mertebe_bina`` - integer parsed from the part of ``mertebe`` that
      follows the '/'.
    * ``valyuta`` - last space-separated token of ``qiymet``.
    * ``tecili`` - 1 when the row's ``elan_id`` is returned by the SQL query
      in ``transform`` (rows of ``bina_db`` whose lower-cased ``info``
      matches ``'%t_cil_%'``), otherwise 0.

    NOTE(review): ``transform`` queries the database through the
    notebook-level ``connection`` object, so it needs a live connection
    every time it runs.
    """

    def fit(self, X, y=None):
        """Learn nothing; return the instance itself."""
        return self

    def transform(self, _X, y=None):
        """Return a copy of `_X` with the derived columns appended."""
        X = _X.copy(deep=True)
        X['mertebe_bina'] = X.mertebe.apply(lambda x: x.split('/')[1].strip()).astype(int)
        X['valyuta'] = X['qiymet'].apply(lambda x: x.split(' ')[-1].strip())

        df_tecili = pd.read_sql("select elan_id,info from bina_db where lower(info) like '%t_cil_%'",
                                connection).set_index('elan_id')
        # `merge` already returns a new frame, so `X` (itself a private copy)
        # can be merged directly without a second deep copy.
        last_df = X.merge(df_tecili[['info']], on='elan_id', how='left', suffixes=['', '_x'])
        # A non-null `info_x` means the row matched the query. `notna()` does
        # not rely on the elements being Python bools the way `x is False` did.
        last_df['tecili'] = last_df['info_x'].notna().astype(int)
        return last_df.drop(['info_x'], axis=1)

In [None]:
class PolynomialFeaturesGenerator(BaseEstimator, TransformerMixin):
    """Append the degree-2 product terms of selected columns to a frame.

    Parameters
    ----------
    pol_features : sequence of str or None
        Columns to expand. ``None`` means "every column of the frame passed
        to ``transform``".

    Each new column is named ``<a>_<b>`` for the product of columns ``a``
    and ``b`` (``<a>_<a>`` for a square).
    """

    def __init__(self, pol_features):
        self.pol_features = pol_features

    def fit(self, X, y=None):
        """Learn nothing; return the instance itself."""
        return self

    def transform(self, _X, y=None):
        """Return a copy of `_X` joined with the generated product columns."""
        X = _X.copy(deep=True)

        # Resolve the column list locally. The previous code wrote it back to
        # `self.pol_features`, silently changing the hyper-parameter and
        # pinning later calls to the columns of the first frame seen.
        if self.pol_features is None:
            pols = X
            feature_names = list(X.columns)
        else:
            pols = X[self.pol_features]
            feature_names = list(self.pol_features)

        poly = sp.PolynomialFeatures()

        poly_df = pd.DataFrame(poly.fit_transform(pols), index=X.index)
        # With the defaults (degree 2, bias included) the output starts with
        # the bias column and the n original features; skip those and keep
        # only the pairwise products.
        poly_df = poly_df.iloc[:, len(feature_names) + 1:]
        # Same (i, j >= i) order in which PolynomialFeatures emits products.
        poly_df.columns = [
            feature_names[i] + '_' + feature_names[j]
            for i in range(len(feature_names))
            for j in range(i, len(feature_names))
        ]
        return X.join(poly_df, how='left')

In [None]:
class CustomOneHotEncoder(BaseEstimator, TransformerMixin):
    """Add dummy (one-hot) columns for the given categorical features.

    The first category of each feature is dropped (``drop_first=True``) and
    the new columns are prefixed with the feature name. The source column
    itself is left in place.

    NOTE(review): categories are derived from the frame passed to
    ``transform`` (``fit`` learns nothing), so two frames with different
    category sets produce different columns - confirm this is acceptable.
    """

    def __init__(self, feature_names):
        self.feature_names = feature_names

    def fit(self, X, y=None):
        """Learn nothing; return the instance itself."""
        return self

    def transform(self, _X, y=None):
        """Return a copy of `_X` with the dummy columns added."""
        X = _X.copy(deep=True)
        for feature in self.feature_names:
            # Build the dummies once and reuse them for both the column names
            # and the values (previously get_dummies ran twice per feature).
            dummies = pd.get_dummies(X[feature], drop_first=True, prefix=f'{feature}')
            X[dummies.columns.values] = dummies
        return X

In [None]:
# Name of the column used as the regression target (`y`) further down.
target = 'qiymet'

In [None]:
# NOTE(review): `polcols` is not referenced anywhere else in the visible
# notebook (the polynomial generator is built with None) - confirm whether
# it was meant to be passed to PolynomialFeaturesGenerator.
polcols = [
    'sahe',
    'mertebe',
    'tecili',
    'kupca',
    'otaq_say',
]

In [None]:
# Ordered preprocessing steps. Feature generation runs first because it
# parses the still-textual `mertebe` / `qiymet` values that the cleaner
# converts to numbers afterwards.
preprocessing_steps = [
    ('feature_generator', FeatureGenerator()),
    ('cleaner', DataCleaner()),
    ('onehot_kateqoriya', CustomOneHotEncoder(['kateqoriya'])),
    (
        'drop_irrelevant',
        ColumnDropper(['kateqoriya', 'info', 'satici', 'elan_basliq', 'valyuta', 'kvm_qiymet']),
    ),
    ('ortaqiymet_adres', AverageCategoricalConverter(['adres'])),
    ('drop_adres', ColumnDropper(['adres'])),
]
preprocessor_pipeline = Pipeline(preprocessing_steps)



In [None]:
# The pipeline is applied without a prior fit(); every step's fit() defined
# in this notebook is a no-op that just returns self.
# NOTE(review): FeatureGenerator.transform queries the database, so this cell
# needs the live `connection`.
data = preprocessor_pipeline.transform(df)

In [None]:
# Passing None makes the generator expand every column of the frame it receives.
polgenerator = PolynomialFeaturesGenerator(None)

In [None]:
qiymet = data.qiymet

In [None]:
# Expand the feature columns only (the target is held in `qiymet`), then put
# the target column back on the expanded frame.
features_only = data.drop(columns=['qiymet'])
data = polgenerator.transform(features_only)
data['qiymet'] = qiymet

In [None]:
sns.boxplot(data.qiymet)

# Exploratory Data Analysis

In [None]:
features = ['mertebe', 'otaq_say', 'sahe', 'qiymet', 'mertebe_bina']

In [None]:
temp_data = data[features]

In [None]:
temp_data.info()

In [None]:
temp_data.describe().T

In [None]:
temp_data.shape

In [None]:
# Outlier Analysis
# One box plot per numeric column on a 2x3 grid, filled row by row. A loop
# replaces the five copy-pasted calls whose result names (plt1, plt2) were
# being overwritten.
outlier_columns = ['qiymet', 'sahe', 'otaq_say', 'mertebe', 'mertebe_bina']
fig, axs = plt.subplots(2, 3, figsize=(15, 10))
for ax, column in zip(axs.flat, outlier_columns):
    sns.boxplot(temp_data[column], ax=ax)
    ax.set_title(column)
# Five columns on six axes: hide the unused panel instead of drawing an
# empty frame.
for ax in axs.flat[len(outlier_columns):]:
    ax.set_visible(False)

plt.tight_layout()

In [None]:
# Plot the distribution, then keep only the rows whose `qiymet` lies between
# the 0.0001 and 0.99 quantiles (both bounds inclusive).
# NOTE(review): the `sahe` filter further down uses 0.001 as its lower
# quantile - confirm that 0.0001 is intended here.
plt.boxplot(temp_data.qiymet)
low_threshold = temp_data.qiymet.quantile(0.0001)
high_threshold = temp_data.qiymet.quantile(0.99)
qiymet_in_range = temp_data.qiymet.between(low_threshold, high_threshold)
temp_data = temp_data[qiymet_in_range]

In [None]:
temp_data.sahe.describe()

In [None]:
# Plot the distribution, then keep only the rows whose `sahe` lies between
# the 0.001 and 0.99 quantiles (both bounds inclusive).
plt.boxplot(temp_data.sahe)
low_threshold = temp_data.sahe.quantile(0.001)
high_threshold = temp_data.sahe.quantile(0.99)
sahe_in_range = temp_data.sahe.between(low_threshold, high_threshold)
temp_data = temp_data[sahe_in_range]

In [None]:
temp_data.columns

In [None]:
sns.pairplot(temp_data)
plt.show()

In [None]:
plt.figure(figsize=(16, 10))
sns.heatmap(temp_data.corr(), annot=True, cmap="YlGnBu")
plt.show()

In [None]:
# Re-attach the columns that were left out of the EDA subset. The merge on
# `elan_id` uses pandas' default inner join, so only rows still present in
# the filtered `temp_data` are kept.
remaining_columns = data.drop(features, axis=1)
data = temp_data.merge(remaining_columns, on='elan_id')

In [None]:
data

In [None]:
# Separate the feature matrix from the target column.
X = data.drop(columns=[target])
y = data[target]
X

In [None]:
# Standardise every feature column.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# that follows, so statistics of the test rows influence the training
# features - consider fitting on the training split only.
sc = StandardScaler()
# Keep the original row index: without it the scaled frame got a fresh
# 0..n-1 index and no longer lined up with `y` by label.
X_scaled = pd.DataFrame(data=sc.fit_transform(X), columns=X.columns, index=X.index)

In [None]:
X_scaled

In [None]:
# 70/30 split. The seed makes the split (and every score computed from it)
# reproducible across kernel restarts; 100 is the seed already used for the
# RepeatedKFold splitter in this notebook.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=100)
X_train.shape, X_test.shape

In [None]:
X_scaled

In [None]:

# Candidate regularisation strengths. The first linspace point (alpha = 0) is
# skipped: scikit-learn advises against alpha = 0 for Lasso (it is plain
# least squares, which LinearRegression already covers in this notebook).
# The remaining nine grid points are unchanged.
alphas = np.linspace(0, 10, 10)[1:]
# 10-fold CV repeated 3 times with a fixed seed.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=100)

hyper_param = {'alpha': alphas}

lasso_reg = Lasso()

grid_lasso = GridSearchCV(estimator=lasso_reg,
                          param_grid=hyper_param,
                          scoring='r2',
                          cv=cv,
                          n_jobs=-1,
                          return_train_score=True
                          )

grid_lasso.fit(X_train, y_train)

In [None]:
grid_lasso.best_params_

In [None]:

# Candidate models and the hyper-parameter grid searched for each of them.
# (`Ridge` is imported in the import cell at the top of the notebook.)
model_params = {
    'tree': {
        # Fixed seed so repeated runs build the same trees.
        'model': DecisionTreeRegressor(random_state=100),
        'params': {
            "max_depth": [None, 1, 3, 5, 7, 9, 11, 12],
        }
    },
    'ridge_regression': {
        'model': Ridge(),
        'params': {
            # alpha = 0 is dropped from the grid: scikit-learn advises
            # against it for Ridge (it is plain least squares). The other
            # 99 values are unchanged.
            'alpha': np.arange(0, 1, 0.01)[1:]
        }
    }
}

In [None]:
# Ordinary least squares baseline: fit on the training split and report R^2
# on the test split.
l_reg = LinearRegression().fit(X_train, y_train)
l_reg.score(X_test, y_test)

In [None]:
# Grid-search every candidate in `model_params` with 5-fold CV on the
# training split and record the best configuration found for each.
scores = []

for model_name, mp in model_params.items():
    grids = GridSearchCV(
        estimator=mp['model'],
        param_grid=mp['params'],
        cv=5,
        n_jobs=-1,
        return_train_score=False,
    )
    grids.fit(X_train, y_train)
    summary = dict(
        model=grids,
        model_name=model_name,
        best_score=grids.best_score_,
        best_params=grids.best_params_,
    )
    scores.append(summary)



In [None]:
# Add the models that were fitted outside the grid-search loop.
# NOTE(review): `best_score` is not comparable across rows - the grid
# searches report a cross-validated score on the training split, while
# linear regression and the random forest report R^2 on the test split.
# NOTE(review): this cell appends to `scores`; running it twice without
# re-running the cell that creates `scores` duplicates these rows.
scores.append({
    'model': l_reg,
    'model_name': 'linear_regression',
    'best_score': l_reg.score(X_test, y_test),
    'best_params': 'default'
})
scores.append({
    'model': grid_lasso,
    'model_name': 'lasso_regression',
    'best_score': grid_lasso.best_score_,
    'best_params': grid_lasso.best_params_
})

# Fixed seed so the forest (and its reported score) is reproducible.
rand_forest = RandomForestRegressor(random_state=100)
rand_forest.fit(X_train,y_train)
scores.append({
    'model' : rand_forest,
    'model_name' : 'random_forest',
    'best_score' : rand_forest.score(X_test,y_test),
    'best_params' : 'default'
})

In [None]:
# Attach mean / median absolute error on both splits to every score record.
for model_score in scores:
    curr_model = model_score['model']
    # Predict once per split and reuse the result for both metrics
    # (previously each model predicted twice per split).
    train_pred = curr_model.predict(X_train)
    test_pred = curr_model.predict(X_test)

    model_score['mean_abserr_train'] = met.mean_absolute_error(y_train, train_pred)
    model_score['median_abserr_train'] = met.median_absolute_error(y_train, train_pred)

    model_score['mean_abserr_test'] = met.mean_absolute_error(y_test, test_pred)
    model_score['median_abserr_test'] = met.median_absolute_error(y_test, test_pred)


In [None]:
# Summary table: one row per model, restricted to the reporting columns
# (the fitted model objects themselves are left out).
score_columns = [
    'model_name',
    'best_score',
    'best_params',
    'mean_abserr_train',
    'median_abserr_train',
    'mean_abserr_test',
    'median_abserr_test',
]
model_scores = pd.DataFrame(scores, columns=score_columns)
model_scores