In [1]:
# NOTE: the original import order is kept on purpose -- the star imports below
# may define names that later explicit imports are expected to override.
from driver.MongoDriver import MongoDriver
import util.Constants as const
from service.DataPreparationHandler import get_data
from util.PandasUtils import PandasUtils
import DisplayHelper
from DisplayHelper import *
from pprint import PrettyPrinter
from util.DataPreparationUtils import *
from stubutils.StubUtils import open_file
from datetime import datetime
from scipy import stats
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier as rfc
from sklearn.preprocessing import OneHotEncoder as ohe
import xgboost as xgb
import numpy
import pandas
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.wrappers.scikit_learn import KerasClassifier
from keras.utils import np_utils
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from keras import optimizers
# Explicit imports for names the notebook uses further down.
import warnings
from keras.callbacks import Callback
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
# Obtain the database handle configured in const.DB_INSTANCE, fetch the
# 'normalized_data' records through get_data, and build the working DataFrame
# using the layout described by const.JSON_STRUCTURE.
mongo_driver = MongoDriver.get_instance()
db_instance = mongo_driver.get_db_instance(const.DB_INSTANCE)
data = get_data(db_instance, 'normalized_data')
dataframe = PandasUtils.get_dataframe(data, const.JSON_STRUCTURE)

Constructor called


In [3]:
# Collapse each "<field>_from" / "<field>_to" pair into a single column holding
# the midpoint (arithmetic mean) of the two bounds.
for range_field in ('volume', 'milleage', 'price', 'year'):
    midpoint_column = 'search_' + range_field
    upper_bound = dataframe[midpoint_column + '_to']
    lower_bound = dataframe[midpoint_column + '_from']
    dataframe[midpoint_column] = (upper_bound + lower_bound)/2

In [4]:
# Replace every numeric column listed in const.NUMERIC_COLUMNS_REQUIRE_ANALYSIS
# by a bin index 0..3, using entries 1, 2 and 3 of quartiles['values'] as the
# cut points (presumably the 25th/50th/75th percentiles -- confirm against
# get_percentiles_for_numeric_column).
for column in const.NUMERIC_COLUMNS_REQUIRE_ANALYSIS:
    quartiles = get_percentiles_for_numeric_column(dataframe, column)
    lower_cut = quartiles['values'][1]
    middle_cut = quartiles['values'][2]
    upper_cut = quartiles['values'][3]

    # Build every mask from the ORIGINAL values before writing anything.
    # Writing the bin codes in place and then testing the next condition on
    # the already-modified column lets a freshly written 0/1/2 match a later
    # range (e.g. when upper_cut < 1) and get re-binned.
    values = dataframe[column]
    bin_masks = (
        values <= lower_cut,
        (values > lower_cut) & (values <= middle_cut),
        (values > middle_cut) & (values <= upper_cut),
        values > upper_cut,
    )
    for bin_code, bin_mask in enumerate(bin_masks):
        dataframe.loc[bin_mask, column] = bin_code

    # Rows matching no mask (e.g. NaN) keep their value, as before.
    dataframe[column] = dataframe[column].astype(float)

In [5]:
# Map every column named in const.DATE_COLUMNS through get_season_by_utcdate
# (applied row by row) and store the result as floats.
for date_column in const.DATE_COLUMNS:
    season_codes = dataframe.apply(
        lambda row: get_season_by_utcdate(row[date_column]),
        axis=1,
    )
    dataframe[date_column] = season_codes
    dataframe[date_column] = dataframe[date_column].astype(float)

In [6]:
# One-hot encode the categorical search fields. Each pair is
# (source column, prefix used for the generated dummy columns).
dummy_specs = [
    ("search_model", "model"),
    ("search_rigion", "rigion"),
    ("search_country", "country"),
    ("search_city", "city"),
    ("search_body", "body"),
    ("search_transmission", "transmission"),
    ("search_wheel", "wheel"),
]
dataframe = pandas.get_dummies(
    dataframe,
    columns=[source_column for source_column, _ in dummy_specs],
    prefix=[dummy_prefix for _, dummy_prefix in dummy_specs],
)

In [7]:
# Drop the columns listed in const.COLUMNS_TO_DROP in a single call: dropping
# them one at a time builds a new copy of the (very wide) frame per column.
# A missing column still raises KeyError, as it did in the per-column loop.
dataframe = dataframe.drop(list(const.COLUMNS_TO_DROP), axis=1)

In [8]:
# Pairwise ratio features between the search columns, each computed as
# (numerator + 1) / (denominator + 1).
#
# Computed with vectorised column arithmetic instead of DataFrame.apply(axis=1):
# row-wise apply materialises every (one-hot widened) row as a Series, nine
# times over, just to read two cells from it.
#
# Feature names and their creation order ("f" before "e") are kept exactly as
# before because the column order defines the feature order of the model input.
ratio_features = [
    ("a", "search_volume", "search_milleage"),
    ("b", "search_volume", "search_price"),
    ("c", "search_volume", "search_year"),
    ("d", "search_milleage", "search_volume"),
    ("f", "search_milleage", "search_price"),
    ("e", "search_milleage", "search_year"),
    ("g", "search_year", "search_volume"),
    ("h", "search_year", "search_price"),
    ("i", "search_year", "search_milleage"),
]
for feature_name, numerator_column, denominator_column in ratio_features:
    ratio = (dataframe[numerator_column] + 1) / (dataframe[denominator_column] + 1)
    dataframe[feature_name] = ratio.astype(float)

In [10]:
# Seed NumPy's global random generator so repeated runs draw the same numbers.
# `seed` is also passed to KFold(random_state=...) further down.
seed = 200
numpy.random.seed(seed=seed)

In [9]:
# Separate features from the target column.
X_data = dataframe.drop("search_marka", axis=1)          # data: Features
Y_data = dataframe["search_marka"]                       # data: Labels

# Hold out 20% of the rows; random_state pins the split.
X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, test_size=0.2, random_state=0)

# user_id is an identifier, not a feature: drop it and switch to plain arrays.
X = X_train.drop(['user_id'], axis=1).values
X_test = X_test.drop(['user_id'], axis=1).values
Y = Y_train.values
Y_test = Y_test.values

# Encode class values as integers with ONE encoder fitted on all labels.
# Encoding the train and test labels independently gives each split its own
# numbering, so whenever a class is missing from one split the same integer
# refers to different classes in dummy_train and dummy_test.
label_encoder = LabelEncoder().fit(Y_data)
ids_train = label_encoder.transform(Y)
ids_test = label_encoder.transform(Y_test)

# Kept under both historical names; both now hold the shared, sorted class list
# so that uniques_*[class_index] maps a network output index back to its label.
uniques_train = label_encoder.classes_
uniques_test = label_encoder.classes_

# Convert integers to dummy variables (i.e. one hot encoded).
num_classes = len(label_encoder.classes_)
dummy_train = np_utils.to_categorical(ids_train, num_classes)
dummy_test = np_utils.to_categorical(ids_test, num_classes)

In [10]:
class EarlyStoppingByLossVal(Callback):
    """Stop training as soon as a monitored metric crosses a fixed threshold.

    Args:
        monitor: key looked up in the ``logs`` dict passed to ``on_epoch_end``.
        value: threshold the monitored quantity is compared against.
        verbose: when greater than 0, print a message on the stopping epoch.
        mode: ``'max'`` stops once the metric rises above ``value``;
            ``'min'`` stops once it falls below ``value``; ``'auto'``
            (default) picks ``'max'`` when ``monitor`` contains ``'acc'`` and
            ``'min'`` otherwise. The comparison used to be hard-coded to
            "greater than", which ends training after the first epoch for any
            loss-style monitor such as the default ``'val_loss'``.

    Raises:
        ValueError: if ``mode`` is not one of ``'auto'``, ``'min'``, ``'max'``.
    """

    def __init__(self, monitor='val_loss', value=0.00001, verbose=0, mode='auto'):
        # super() must name this class; super(Callback, self) skips
        # Callback.__init__ entirely.
        super(EarlyStoppingByLossVal, self).__init__()
        if mode not in ('auto', 'min', 'max'):
            raise ValueError("mode must be 'auto', 'min' or 'max', got %r" % (mode,))
        if mode == 'auto':
            mode = 'max' if 'acc' in monitor else 'min'
        self.monitor = monitor
        self.value = value
        self.verbose = verbose
        self.mode = mode

    def on_epoch_end(self, epoch, logs=None):
        """Set ``self.model.stop_training`` once the threshold is crossed."""
        logs = logs or {}
        current = logs.get(self.monitor)
        if current is None:
            warnings.warn("Early stopping requires %s available!" % self.monitor, RuntimeWarning)
            # Nothing to compare against; comparing None would raise TypeError.
            return

        if self.mode == 'max':
            threshold_crossed = current > self.value
        else:
            threshold_crossed = current < self.value

        if threshold_crossed:
            if self.verbose > 0:
                print("Epoch %05d: early stopping THR" % epoch)
            self.model.stop_training = True


# Stop training once validation accuracy exceeds 0.92.
callbacks = [
    EarlyStoppingByLossVal(monitor='val_acc', value=0.92, verbose=1),
]

In [11]:
# define baseline model
def baseline_model(n_features=None, n_classes=None, dropout_rate=0.5):
    """Build and compile the fully connected softmax classifier.

    Layers: Dense(n_features - 1, relu) -> Dropout -> Dense(2.5 * n_features,
    relu) -> Dropout -> Dense(2.5 * n_features, relu) -> Dense(n_classes,
    softmax), compiled with categorical cross-entropy, Adam and accuracy.

    Args:
        n_features: input width. Defaults to ``X.shape[1]`` from the notebook
            namespace, so the no-argument call (as used by ``KerasClassifier``)
            behaves as before.
        n_classes: number of output units. Defaults to the number of distinct
            values in ``dataframe['search_marka']``.
        dropout_rate: rate of both Dropout layers.

    Returns:
        The compiled ``Sequential`` model.
    """
    if n_features is None:
        n_features = X.shape[1]
    if n_classes is None:
        n_classes = len(dataframe['search_marka'].unique())

    # int() keeps the unit count integral even where round() returns a float.
    hidden_units = int(round(n_features * 2.5))

    model = Sequential()
    # Only the first layer needs the input size; later layers infer theirs.
    model.add(Dense(n_features - 1, input_dim=n_features, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(hidden_units, activation='relu'))
    model.add(Dropout(dropout_rate))
    model.add(Dense(hidden_units, activation='relu'))
    model.add(Dense(n_classes, activation='softmax'))

    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [12]:
# Build a fresh network and train it; the threshold callback defined earlier
# ends training early, otherwise it runs for the full epoch budget.
# NOTE(review): validation_data is the held-out test split, so the stopping
# decision is made on the same data that is scored afterwards.
model = baseline_model()
model.fit(
    X,
    dummy_train,
    batch_size=50,
    epochs=3000,
    validation_data=(X_test, dummy_test),
    callbacks=callbacks,
)

Train on 7999 samples, validate on 2000 samples
Epoch 1/3000
Epoch 2/3000
Epoch 3/3000
Epoch 4/3000
Epoch 5/3000
Epoch 6/3000
Epoch 7/3000
Epoch 8/3000
Epoch 9/3000
Epoch 10/3000
Epoch 11/3000
Epoch 12/3000
Epoch 13/3000
Epoch 14/3000
Epoch 15/3000
Epoch 16/3000
Epoch 17/3000
Epoch 18/3000
Epoch 19/3000
Epoch 20/3000
Epoch 21/3000
Epoch 22/3000

KeyboardInterrupt: 

In [None]:
# Class probabilities for the held-out rows, one column per class.
pred = model.predict(X_test)
# Take the most probable class directly. Rounding first zeroes every row whose
# best probability is <= 0.5, and argmax of an all-zero row is always index 0.
# The network's output indices follow the encoding of the TRAINING targets, so
# labels are looked up in uniques_train.
corrected_pred = uniques_train[pred.argmax(axis=1)]
accuracy_score(Y_test, corrected_pred)

In [None]:
# Cross-validation building blocks: a shuffled 10-fold splitter seeded with
# `seed`, and a scikit-learn compatible wrapper around baseline_model.
kfold = KFold(
    n_splits=10,
    shuffle=True,
    random_state=seed,
)
estimator = KerasClassifier(
    build_fn=baseline_model,
    epochs=20,
    batch_size=200,
    verbose=1,
)

In [None]:
# Evaluate on the held-out split. The one-hot test targets are called
# `dummy_test`; `dummy_y_test` was never defined and raised NameError.
score = model.evaluate(X_test, dummy_test, batch_size=32)
# The model is compiled with metrics=['accuracy'], so evaluate() returns
# [loss, accuracy] rather than a single accuracy value.
loss, accuracy = score
print('Loss = {}, accuracy score = {}'.format(loss, accuracy))

In [44]:
# Shape of the training feature array X (built from X_train without user_id).
X.shape

(7999, 3643)