In [None]:
import json
import pandas as pd
import numpy as np
import math as math
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.model_selection as model_selection
import sklearn.tree as tree
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
import time
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
import catboost
from catboost import CatBoostClassifier
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV


In [None]:
def describe_column(meta):
    """
    Build a per-column summariser suitable for ``DataFrame.apply``.

    ``meta`` is an iterable of dicts carrying the keys ``name``, ``type`` and
    ``desc`` (plus ``cats`` for categorical columns). The returned callable takes
    one column and returns an object Series with the entries ``Type``, ``#NaN``,
    ``Description`` and ``Statistics``.
    """
    def _counts_text(counts):
        # Render value counts as "label(count), label(count), ...".
        return ', '.join(f'{label}({count})' for label, count in counts.items())

    def summarise(column):
        # First metadata entry whose name matches this column.
        entry = next(item for item in meta if item['name'] == column.name)
        kind = entry['type']

        summary = pd.Series(name=column.name, dtype=object)
        summary['Type'] = kind
        summary['#NaN'] = column.isna().sum()
        summary['Description'] = entry['desc']

        if kind == 'categorical':
            # Stored values are positions into the category list.
            labels = dict(enumerate(entry['cats']))
            tally = column.dropna().map(labels).value_counts().sort_index()
            text = _counts_text(tally)
        elif kind in ('real', 'integer'):
            moments = column.dropna().agg(['mean', 'std', 'min', 'max'])
            text = ', '.join(f'{label}={value:.1f}' for label, value in moments.items())
        elif kind == 'boolean':
            tally = column.dropna().astype(bool).value_counts().sort_index()
            text = _counts_text(tally)
        else:
            # Any other type: only report the number of distinct values.
            text = f'#unique={column.nunique()}'

        summary['Statistics'] = text
        return summary

    return summarise

def describe_data(data, meta):
    """
    Summarise every column of ``data`` and return a left-aligned Styler.

    Each column is passed through ``describe_column(meta)``; the per-column
    summaries are transposed so that each dataset column becomes one row.
    """
    header_style = [dict(selector='th', props=[('text-align', 'left')])]
    summary = data.apply(describe_column(meta)).T
    return (
        summary.style
        .set_properties(**{'text-align': 'left'})
        .set_table_styles(header_style)
    )

In [None]:
# Training data; the relative path is resolved against the notebook's working directory.
df_train = pd.read_csv(filepath_or_buffer='Training_clean.csv')
def root_mean_squared_log_error(y_true, y_pred):
    """
    Root mean squared logarithmic error between targets and predictions.

    Parameters
    ----------
    y_true, y_pred : array-like of non-negative numbers
        Lists, tuples, NumPy arrays and pandas Series are accepted. Values are
        compared by position, never by pandas index label.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2))``.

    Raises
    ------
    AssertionError
        If any value in ``y_true`` or ``y_pred`` is negative (or NaN).
    """
    # Alternatively: sklearn.metrics.mean_squared_log_error(y_true, y_pred) ** 0.5
    # Coerce to plain arrays so that (a) Python sequences support the element-wise
    # comparisons below and (b) two Series with different indices are not aligned
    # by label, which would silently turn the whole result into NaN.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    assert (y_true >= 0).all(), "y_true must be non-negative"
    assert (y_pred >= 0).all(), "y_pred must be non-negative"
    log_error = np.log1p(y_pred) - np.log1p(y_true)  # Note: log1p(x) = log(1 + x)
    return np.mean(log_error ** 2) ** 0.5

# Test data (read from the same directory as the training file); predictions for it are written out at the end.
df_test = pd.read_csv(filepath_or_buffer='Test_clean.csv')

In [None]:
# Target vector and feature frames. Identifier and free-text address columns are
# dropped from both splits; the test split has no `price` column to drop.
y1 = df_train.price.values
X1 = df_train.drop(['id','price', "street", 'address'], axis=1)
X1_test = df_test.drop(['id', "street", 'address'], axis=1)

# Every column that is not float64 is handed to CatBoost as a categorical feature.
# The positions are derived from the TRAINING frame, because that is the frame the
# model is fitted on. (Previously this value was immediately overwritten by the
# positions computed from the test frame.)
categorical_features_indices = np.where(X1.dtypes != np.float64)[0]

# The fitted model is later applied to X1_test, so both frames must share the same
# column layout and the same categorical positions. Fail loudly on schema drift.
assert list(X1.columns) == list(X1_test.columns), \
    "train and test feature columns differ"
assert np.array_equal(
    categorical_features_indices,
    np.where(X1_test.dtypes != np.float64)[0],
), "train and test disagree on which columns are non-float (categorical)"

In [None]:
# Untuned CatBoost regressor; used only as the base estimator for the grid search.
model_CBR = catboost.CatBoostRegressor()

In [None]:
# Hyper-parameter grid for the CatBoost search: 4 x 4 x 4 = 64 combinations.
parameters = dict(
    depth=[6, 8, 10, 12],
    learning_rate=[0.005, 0.01, 0.05, 0.1],
    iterations=[3500, 6500, 8500, 10000],
)

In [None]:
# Exhaustive 5-fold cross-validated search over `parameters`.
# NOTE(review): the grid has 64 combinations, each fitted 5 times with up to 10000
# boosting iterations, so this cell is very expensive (the recorded run below was
# interrupted). Consider a smaller grid before re-running.
grid = GridSearchCV(estimator=model_CBR, param_grid = parameters, cv = 5, n_jobs=-1)

# Forward the categorical column positions to every CatBoost fit so the search
# evaluates the same kind of model that is trained later with `cat_features`.
grid.fit(X1, y1, cat_features=categorical_features_indices)

print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n", grid.best_estimator_)
print("\n The best score across ALL searched params:\n", grid.best_score_)
print("\n The best parameters across ALL searched params:\n", grid.best_params_)

KeyboardInterrupt: 

In [None]:
# Final model with hand-picked hyper-parameters; each value is one of the
# candidates listed in the `parameters` grid.
regressor = CatBoostRegressor(
    iterations=3500,
    learning_rate=0.05,
    depth=8,
)

In [None]:
# Hold out 20% of the training rows for validation; the seed is fixed for reproducibility.
X_train, X_val, y_train, y_val = train_test_split(
    X1,
    y1,
    test_size=0.2,
    shuffle=True,
    random_state=1,
)
# Shapes of all splits, plus the test features for comparison.
tuple(part.shape for part in (X_train, X_val, y_train, y_val, X1_test))

((18628, 23), (4657, 23), (18628,), (4657,), (9937, 23))

In [None]:
# Fit on the training split and track the metric on the validation split.
# `verbose=200` prints one log line every 200 iterations instead of one per
# iteration, which otherwise floods the notebook with thousands of lines.
# The trailing semicolon suppresses the estimator repr.
# NOTE(review): the validation split is also used for the scores reported below,
# so those scores are not an independent estimate if CatBoost selects its best
# iteration on this eval_set — confirm against the CatBoost `use_best_model` docs.
regressor.fit(
    X_train,
    y_train,
    cat_features=categorical_features_indices,
    eval_set=(X_val, y_val),
    verbose=200,
    plot=True,
);

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

2506:	learn: 2552849.3389378	test: 29047860.3387966	best: 29047860.3387966 (2506)	total: 5m 42s	remaining: 2m 15s
2507:	learn: 2552492.0951848	test: 29047981.6111975	best: 29047860.3387966 (2506)	total: 5m 42s	remaining: 2m 15s
2508:	learn: 2551627.9560319	test: 29048402.0378055	best: 29047860.3387966 (2506)	total: 5m 42s	remaining: 2m 15s
2509:	learn: 2551394.4364331	test: 29048382.0469043	best: 29047860.3387966 (2506)	total: 5m 42s	remaining: 2m 15s
2510:	learn: 2551064.4718219	test: 29048417.2851983	best: 29047860.3387966 (2506)	total: 5m 42s	remaining: 2m 14s
2511:	learn: 2549906.5134751	test: 29048441.0004498	best: 29047860.3387966 (2506)	total: 5m 42s	remaining: 2m 14s
2512:	learn: 2549643.6290848	test: 29048468.1285904	best: 29047860.3387966 (2506)	total: 5m 42s	remaining: 2m 14s
2513:	learn: 2548847.4125031	test: 29048458.6175022	best: 29047860.3387966 (2506)	total: 5m 43s	remaining: 2m 14s
2514:	learn: 2547485.2348311	test: 29048471.9364628	best: 29047860.3387966 (2506)	total:

<catboost.core.CatBoostRegressor at 0x7fc545b518e0>

In [None]:
from sklearn.metrics import r2_score

# Predictions on both splits, reused by every metric below.
y_pred_train = regressor.predict(X_train)
y_pred = regressor.predict(X_val)

# Score of the fitted model on the training and validation splits.
print(regressor.score(X_train, y_train))
print(regressor.score(X_val, y_val))
# Cross-check with scikit-learn: R^2 of the validation predictions against the
# true validation targets. (Previously the predictions were compared with
# themselves, which always printed 1.0.)
print(r2_score(y_val, y_pred))

# Root mean squared log error on the training (a) and validation (b) splits.
a = root_mean_squared_log_error(y_train, y_pred_train)
b = root_mean_squared_log_error(y_val, y_pred)
a, b

0.9805792102421432
0.6519450697577542
1.0


(0.12288471299689159, 0.17130079155218272)

In [None]:
# Price predictions for the test features.
y_pred1 = regressor.predict(data=X1_test)


In [None]:
# Two-column submission table: the test ids and the predicted prices, in that order.
submission = pd.DataFrame({
    'id': df_test.id,
    'price_prediction': y_pred1,
})
submission.to_csv(path_or_buf='sample_submission6.csv', index=False)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=99c1e030-46bc-40fa-835c-0661eae0d488' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>