In [None]:
# basic
import pandas as pd
import numpy as np
import random
import pickle
# import json
# import re

# vizu
import matplotlib.pyplot as plt
# import seaborn as sns

# modelling
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

# from sklearn.preprocessing import label_binarize
# from sklearn.pipeline import make_pipeline
# from sklearn import metrics
# from sklearn.metrics import confusion_matrix

# from sklearn.metrics import mean_squared_error
# from sklearn.neighbors import KNeighborsRegressor
# from sklearn import svm
# import xgboost as xgb

In [None]:
# Fixed seed reused by the train/test split and by the model further down.
myrandom = 42

# Raw column labels of the listing attributes (kept for reference; this list
# is not read by any of the cells visible in this notebook).
details = [
    "Anyag",
    "Nyaklánc kapocs",
    "Motívum",
    "Célcsoport",
    "Ékszer súlya",
    "Tisztaság",
    "Arany színe",
    "Hossz",
    "Szélesség",
    "Magasság",
    "Kő",
]

# Load the dataset and show its (rows, columns) as the cell output.
df = pd.read_csv('nyaklanc.csv')
df.shape

check target - "price"

In [None]:
# Distribution of the target variable in 50 bins.
df.loc[:, 'price'].hist(bins=50)

check feature - "weight"

In [None]:
# Build the numeric 'weight' feature from the raw 'Ékszer súlya' text:
# drop the ' gramm' suffix on non-missing cells, then cast the column to float.
raw_weight = df['Ékszer súlya']
df['weight'] = raw_weight.apply(
    lambda x: x if pd.isnull(x) else x.replace(' gramm', '')
)
df['weight'] = df['weight'].astype(float)

# Distribution of the parsed weights in 50 bins.
df['weight'].hist(bins=50)

In [None]:
# Based on manual check the extreme_heavy_item is excluded
# (the first listing whose parsed weight is above 30).
extreme_heavy_item = df.loc[df['weight']>30, 'url'].tolist()
if extreme_heavy_item:
    extreme_heavy_item = extreme_heavy_item[0]
    # Population z-scores (ddof=0) of all weights. The maximum has to be taken
    # over the z-score Series as a whole, not over the scalar standard deviation.
    weight_z_scores = (df['weight'] - df['weight'].mean()) / df['weight'].std(ddof=0)
    z_score_of_extreme_heavy_item = weight_z_scores.max()
    print(f'dropped out URL with weight-zscore {round(z_score_of_extreme_heavy_item,2)}: {extreme_heavy_item}')
    # .copy() so that later column assignments operate on an independent frame.
    df = df.loc[df['url'] != extreme_heavy_item].copy()

check feature - "width"

In [None]:
def get_width(szelesseg):
    """Convert a raw width label such as ``'2,5 mm'`` into a float.

    The ``' mm'`` suffix is removed and a decimal comma is turned into a
    decimal point before the value is parsed.

    Parameters
    ----------
    szelesseg : str or missing value
        Raw cell content of the width column.

    Returns
    -------
    float or None
        The parsed number, or ``None`` when the input is missing or cannot be
        converted (in the latter case a diagnostic line is printed).
    """
    if pd.isnull(szelesseg):
        return None
    try:
        return float(szelesseg.replace(' mm', '').replace(',', '.'))
    except (AttributeError, TypeError, ValueError):
        # Only conversion problems are handled here (non-string input or
        # unparsable text); KeyboardInterrupt/SystemExit are no longer swallowed.
        print(f'failed to convert: {szelesseg}')
        return None

# Parse the raw 'Szélesség' text column into the numeric 'width' feature.
raw_width = df['Szélesség']
df['width'] = raw_width.apply(get_width)

In [None]:
# Distribution of the parsed widths in 50 bins.
df.loc[:, 'width'].hist(bins=50)

In [None]:
# Based on manual check the extreme_wide_item is excluded
# (the first listing whose parsed width is above 100).
extreme_wide_item = df.loc[df['width']>100, 'url'].tolist()
if extreme_wide_item:
    extreme_wide_item = extreme_wide_item[0]
    # Largest population z-score (ddof=0) among all widths.
    width_z_scores = (df['width'] - df['width'].mean()) / df['width'].std(ddof=0)
    z_score_of_extreme_wide_item = width_z_scores.max()
    # The message now names the feature that was actually checked (width, not weight).
    print(f'dropped out URL with width-zscore {round(z_score_of_extreme_wide_item,2)}: {extreme_wide_item}')
    # .copy() keeps the filtered frame independent, matching the weight outlier cell.
    df = df.loc[df['url'] != extreme_wide_item].copy()

Modelling

In [None]:
# Modelling setup: target column, feature columns, and an (as yet empty) summary dict.
target = 'price'
predictors = ['weight', 'width']
summary = {}

# Keep only rows where every predictor and the target are present.
required_columns = [*predictors, target]
df = df.dropna(subset=required_columns)
print(f'After dropping NANs sample size is: {len(df)}')

In [None]:
# 75/25 train/test split with a fixed seed so the partition is reproducible.
features = df[predictors]
labels = df[target]
data_train, data_test, target_train, target_test = train_test_split(
    features,
    labels,
    test_size=0.25,
    random_state=myrandom,
)

# Re-attach the target to each split (index-aligned) for plotting.
train_df = data_train.join([target_train])
test_df = data_test.join([target_test])

In [None]:
# Relationship between the two predictors on the training split.
data_train.plot(kind='scatter', x='width', y='weight')

In [None]:
# Training-split weight against price, drawn as unconnected markers.
weight_price_ax = train_df.plot(x='weight', y='price', style='o')
weight_price_ax.set_title('weight vs price')

In [None]:
# Shallow random forest (50 trees, depth 3) fitted on the training split.
model = RandomForestRegressor(
    n_estimators=50,
    max_depth=3,
    random_state=myrandom,
).fit(data_train, target_train)

# Predict the held-out split and plot predicted against actual values.
y_pred = model.predict(data_test)
error_df = pd.DataFrame({'Actual': target_test, 'Predicted': y_pred})
error_df.plot(x='Actual', y='Predicted', style='o')

In [None]:
# 2x2 Pearson correlation matrix between actual and predicted test values.
np.corrcoef(target_test, y_pred)

In [None]:
# Coefficient of determination on the held-out split.
r2_score(y_true=target_test, y_pred=y_pred)

In [None]:
# Absolute and squared error on the held-out split; RMSE is derived from the MSE.
mae = sklearn.metrics.mean_absolute_error(target_test, y_pred)
mse = sklearn.metrics.mean_squared_error(target_test, y_pred)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', np.sqrt(mse))

In [None]:
# Map each training column to the importance the fitted model assigns to it,
# then draw the importances as a horizontal bar chart in ascending order.
varimp = dict(zip(data_train.columns, model.feature_importances_))
importance_df = pd.DataFrame({'varimp': varimp})
importance_df.sort_values('varimp').plot.barh()

In [None]:
# Persist the fitted model. The context manager guarantees the file handle is
# flushed and closed even if pickling raises.
with open('nyaklanc_first_rf.pickle', 'wb') as model_file:
    pickle.dump(model, model_file)