In [None]:
import json
import pandas as pd
import numpy as np
import math as math
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.model_selection as model_selection
import sklearn.tree as tree
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
import time
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
import catboost
from catboost import CatBoostClassifier
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split


In [None]:
def describe_column(meta):
    """
    Create a column summariser meant to be passed to ``DataFrame.apply``.

    ``meta`` is an iterable of dicts. The entry for a column is found by its
    'name' key and must provide 'type' and 'desc' (plus 'cats' when the type
    is 'categorical'). The returned callable maps a column to a Series with
    the fields 'Type', '#NaN', 'Description' and 'Statistics'.
    """
    def summarise(column):
        # First metadata entry whose name matches the column being described.
        entry = next(item for item in meta if item['name'] == column.name)
        kind = entry['type']

        summary = pd.Series(name=column.name, dtype=object)
        summary['Type'] = kind
        summary['#NaN'] = column.isna().sum()
        summary['Description'] = entry['desc']

        if kind == 'categorical':
            # Translate the stored codes into category labels before counting.
            labels = dict(enumerate(entry['cats']))
            tally = column.dropna().map(labels).value_counts().sort_index()
            summary['Statistics'] = ', '.join(f'{c}({n})' for c, n in tally.items())
        elif kind in ('real', 'integer'):
            moments = column.dropna().agg(['mean', 'std', 'min', 'max'])
            summary['Statistics'] = ', '.join(f'{s}={v:.1f}' for s, v in moments.items())
        elif kind == 'boolean':
            tally = column.dropna().astype(bool).value_counts().sort_index()
            summary['Statistics'] = ', '.join(f'{c}({n})' for c, n in tally.items())
        else:
            # Any other type: only report how many distinct values occur.
            summary['Statistics'] = f'#unique={column.nunique()}'
        return summary
    return summarise

def describe_data(data, meta):
    """
    Summarise every column of ``data`` using the matching entries in ``meta``.

    Returns a pandas Styler with one row per column of ``data``, with both
    cells and header cells left-aligned.
    """
    summarise = describe_column(meta)
    # apply() produces one summary Series per column; transpose so that each
    # original column becomes a row of the resulting table.
    table = data.apply(summarise).T
    cell_rule = {'text-align': 'left'}
    header_rule = dict(selector='th', props=[('text-align', 'left')])
    return table.style.set_properties(**cell_rule).set_table_styles([header_rule])

In [None]:
# Apartment listings of the training split, plus their column metadata.
apartments = pd.read_csv('apartments_train.csv')
with open('apartments_meta.json') as meta_file:
    apartments_meta = json.load(meta_file)

# Building records of the training split, plus their column metadata.
buildings = pd.read_csv('buildings_train.csv')
with open('buildings_meta.json') as meta_file:
    buildings_meta = json.load(meta_file)

# Optional diagnostics (left disabled):
#   describe_data(apartments, apartments_meta)
#   apartments.building_id.isin(buildings.id).all()

# Left join: every apartment row is kept and receives the columns of the
# building whose id equals the apartment's building_id.
buildings_by_id = buildings.set_index('id')
data = apartments.merge(
    buildings_by_id,
    how='left',
    left_on='building_id',
    right_index=True,
)
data.head()


Unnamed: 0,id,seller,price,area_total,area_kitchen,area_living,floor,rooms,layout,ceiling,...,address,constructed,material,stories,elevator_without,elevator_passenger,elevator_service,parking,garbage_chute,heating
0,0,3.0,7139520.0,59.2,12.5,31.0,2.0,2.0,,2.65,...,к2.5/2,2021.0,3.0,9.0,0.0,1.0,1.0,1.0,,
1,1,,10500000.0,88.0,14.2,48.0,18.0,3.0,1.0,,...,14к3,2010.0,3.0,25.0,0.0,1.0,1.0,1.0,,0.0
2,2,3.0,9019650.0,78.5,22.5,40.8,12.0,3.0,,2.65,...,38,2021.0,3.0,15.0,0.0,1.0,1.0,1.0,,
3,3,,10500000.0,88.0,14.0,48.0,18.0,3.0,,,...,14к3,2010.0,3.0,25.0,0.0,1.0,1.0,1.0,,0.0
4,4,,13900000.0,78.0,17.0,35.0,7.0,2.0,1.0,2.9,...,1к3,2017.0,2.0,15.0,0.0,1.0,1.0,1.0,0.0,0.0


In [None]:
def root_mean_squared_log_error(y_true, y_pred):
    """
    Root mean squared logarithmic error between targets and predictions.

    Computes ``sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2))``.

    Parameters
    ----------
    y_true, y_pred : array-like of non-negative numbers
        Lists, tuples, NumPy arrays and pandas Series are accepted. Values are
        compared by position, never by index label.

    Returns
    -------
    numpy.float64
        The RMSLE of the two inputs.

    Raises
    ------
    AssertionError
        If either input contains a negative value.
    """
    # Alternatively: sklearn.metrics.mean_squared_log_error(y_true, y_pred) ** 0.5
    # Convert to plain arrays first: two pandas Series with different indexes
    # would otherwise be aligned by label during the subtraction and yield NaN,
    # and plain Python lists do not support the element-wise comparisons below.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    assert (y_true >= 0).all(), 'y_true must be non-negative'
    assert (y_pred >= 0).all(), 'y_pred must be non-negative'
    log_error = np.log1p(y_pred) - np.log1p(y_true)  # Note: log1p(x) = log(1 + x)
    return np.mean(log_error ** 2) ** 0.5
    
# Test split: apartments and buildings, joined the same way as the training data.
apartments_test = pd.read_csv('apartments_test.csv')
buildings_test = pd.read_csv('buildings_test.csv')

# Optional diagnostics (left disabled):
#   apartments_test.building_id.isin(buildings_test.id).all()
#   len(data), len(data_test)
#   data_test.head()

# Left join keeps every test apartment and attaches its building's columns.
data_test = apartments_test.merge(
    buildings_test.set_index('id'),
    how='left',
    left_on='building_id',
    right_index=True,
)

In [None]:

# Disabled baseline experiment: the whole snippet below is wrapped in a string
# literal, so none of it is executed -- the cell only evaluates the string and
# echoes it as its output. The snippet would fit a DecisionTreeRegressor on
# latitude/longitude and print the train and validation RMSLE.
"""data_train, data_valid = model_selection.train_test_split(data, test_size=0.33, stratify=np.log(data.price).round())
X_train = data_train[['latitude', 'longitude']]
y_train = data_train.loc[X_train.index].price
X_valid = data_valid[['latitude', 'longitude']]
y_valid = data_valid.loc[X_valid.index].price
model = tree.DecisionTreeRegressor().fit(X_train, y_train)

y_train_hat = model.predict(X_train)
y_valid_hat = model.predict(X_valid)
print(f'Train RMSLE: {root_mean_squared_log_error(y_true=y_train, y_pred=y_train_hat) :.4f}')
print(f'Valid RMSLE: {root_mean_squared_log_error(y_true=y_valid, y_pred=y_valid_hat) :.4f}')
"""

"data_train, data_valid = model_selection.train_test_split(data, test_size=0.33, stratify=np.log(data.price).round())\nX_train = data_train[['latitude', 'longitude']]\ny_train = data_train.loc[X_train.index].price\nX_valid = data_valid[['latitude', 'longitude']]\ny_valid = data_valid.loc[X_valid.index].price\nmodel = tree.DecisionTreeRegressor().fit(X_train, y_train)\n\ny_train_hat = model.predict(X_train)\ny_valid_hat = model.predict(X_valid)\nprint(f'Train RMSLE: {root_mean_squared_log_error(y_true=y_train, y_pred=y_train_hat) :.4f}')\nprint(f'Valid RMSLE: {root_mean_squared_log_error(y_true=y_valid, y_pred=y_valid_hat) :.4f}')\n"

In [None]:
# Missing values per training column: absolute count and share of rows,
# most affected columns first.
train_nulls = data.isnull()
total = train_nulls.sum().sort_values(ascending=False)
percent = (train_nulls.sum() / train_nulls.count()).sort_values(ascending=False)
missing_data = pd.concat(
    [total, percent],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_data.head(20)


Unnamed: 0,Total,Percent
layout,17642,0.757655
ceiling,11093,0.476401
balconies,10520,0.451793
loggias,10520,0.451793
condition,9810,0.421301
seller,8830,0.379214
garbage_chute,8811,0.378398
heating,8161,0.350483
windows_court,8072,0.346661
windows_street,8072,0.346661


In [None]:
# Missing values per test column: absolute count and share of rows,
# most affected columns first.
#
# The share is computed against the test frame's own row count. Dividing by
# the training frame's count (as this cell did before) understates every
# percentage, and the label mismatch on the train-only `price` column adds a
# NaN row that turns the integer counts into floats.
test_nulls = data_test.isnull()
total = test_nulls.sum().sort_values(ascending=False)
percent = (test_nulls.sum() / test_nulls.count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

Unnamed: 0,Total,Percent
layout,7368.0,0.316427
ceiling,5018.0,0.215504
balconies,4835.0,0.207644
loggias,4835.0,0.207644
condition,3959.0,0.170024
seller,3942.0,0.169294
windows_court,3488.0,0.149796
windows_street,3488.0,0.149796
garbage_chute,3426.0,0.147133
heating,3230.0,0.138716


In [None]:
# Replace every missing value in both frames with one sentinel value, then
# show how many nulls remain in each frame.
MISSING_SENTINEL = -999
for frame in (data, data_test):
    frame.fillna(MISSING_SENTINEL, inplace=True)
(data.isnull().sum().sum(), data_test.isnull().sum().sum())

(0, 0)

In [None]:
# Target vector and feature matrices. The id, street and address columns are
# dropped from both frames; the target column is also dropped from the
# training features.
y1 = data.price.values
X1 = data.drop(['price', 'id', 'street', 'address'], axis=1)
X1_test = data_test.drop(['id', 'street', 'address'], axis=1)

# The categorical indices computed below are positional, so they are only
# meaningful if both frames list the same columns in the same order.
assert list(X1.columns) == list(X1_test.columns), (
    'train/test feature columns differ; positional categorical indices would be misaligned'
)

# Previously the indices were derived from the training frame and then
# immediately overwritten by the ones derived from the test frame. A column is
# now treated as categorical only when it is non-float in BOTH frames, so a
# dtype that differs between the splits can never mark float values as
# categorical. When the dtypes agree, the result matches the old one.
# NOTE(review): CatBoost is documented to reject float values in categorical
# features -- confirm against the CatBoost docs.
non_float_train = (X1.dtypes != np.float64).to_numpy()
non_float_test = (X1_test.dtypes != np.float64).to_numpy()
categorical_features_indices = np.where(non_float_train & non_float_test)[0]


In [None]:
# Hold out 10% of the training rows for validation; the fixed random_state
# makes the shuffled split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X1,
    y1,
    test_size=0.1,
    random_state=1,
    shuffle=True,
)
# Shapes of every split plus the test feature matrix.
tuple(part.shape for part in (X_train, X_val, y_train, y_val, X1_test))

((20956, 30), (2329, 30), (20956,), (2329,), (9937, 30))

In [None]:
# CatBoostRegressor is already imported in the notebook's first cell.
catboost_params = dict(
    random_state=1,
    iterations=1500,
    depth=8,
    learning_rate=.1,
    loss_function='RMSE',
)
model = CatBoostRegressor(**catboost_params)

# The validation split is passed as eval_set, so the training log reports a
# `test` metric next to the `learn` metric for every iteration.
model.fit(
    X_train,
    y_train,
    cat_features=categorical_features_indices,
    eval_set=(X_val, y_val),
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

506:	learn: 4444027.9440108	test: 36931373.2005097	best: 36921747.8680604 (493)	total: 32.5s	remaining: 1m 3s
507:	learn: 4439898.6970788	test: 36929545.2574382	best: 36921747.8680604 (493)	total: 32.5s	remaining: 1m 3s
508:	learn: 4433796.8284691	test: 36930261.4866240	best: 36921747.8680604 (493)	total: 32.6s	remaining: 1m 3s
509:	learn: 4429970.6275065	test: 36929571.4933852	best: 36921747.8680604 (493)	total: 32.7s	remaining: 1m 3s
510:	learn: 4428172.3373464	test: 36929554.9357216	best: 36921747.8680604 (493)	total: 32.7s	remaining: 1m 3s
511:	learn: 4422971.9180357	test: 36929219.2061029	best: 36921747.8680604 (493)	total: 32.8s	remaining: 1m 3s
512:	learn: 4419561.8150071	test: 36931876.2615807	best: 36921747.8680604 (493)	total: 32.8s	remaining: 1m 3s
513:	learn: 4415267.1391049	test: 36931520.0585589	best: 36921747.8680604 (493)	total: 32.8s	remaining: 1m 2s
514:	learn: 4410991.7711207	test: 36932146.0133229	best: 36921747.8680604 (493)	total: 32.9s	remaining: 1m 2s
515:	learn

<catboost.core.CatBoostRegressor at 0x7f84891333d0>

In [None]:
# Model score on the rows it was trained on (an optimistic figure by construction).
train_score = model.score(X_train, y_train)
print(train_score)

0.9973409897009676


In [None]:
from sklearn.metrics import r2_score

# Predictions on the validation split, truncated to whole numbers.
y_pred = model.predict(X_val)
y_pred = y_pred.astype(int)

# Validation score as reported by the model itself.
print(model.score(X_val, y_val))
# R^2 of the truncated predictions against the TRUE validation targets.
# The previous call compared the predictions with themselves, which is
# ~1.0 by construction and says nothing about model quality.
print(r2_score(y_val, y_pred))

0.6106217712634119
0.9999999999999998


(None, None)

In [None]:
# Side-by-side view of the true validation prices and the model's predictions.
df = pd.DataFrame(dict(Actual=y_val, Predicted=y_pred))
df

Unnamed: 0,Actual,Predicted
0,50000000.0,49310794
1,39990000.0,48971165
2,11270000.0,11623935
3,11950000.0,12933913
4,53000000.0,104858087
...,...,...
2324,15830630.0,15551844
2325,30000000.0,17887773
2326,19990000.0,17892658
2327,7170000.0,6842541


In [None]:
# Predicted prices for the test feature matrix, as a float array.
final_labels = model.predict(X1_test).astype(float)
final_labels

array([22145534.57367854, 12735162.37680138,  6059337.51226902, ...,
        9436800.82150435,  8494396.91410112,  5042042.45122851])

In [None]:
# Predictions for both splits; only the validation RMSLE is reported.
y_train_hat, y_valid_hat = (model.predict(split) for split in (X_train, X_val))
#print(f'Train RMSLE: {root_mean_squared_log_error(y_true=y_train, y_pred=y_train_hat) :.4f}')
valid_rmsle = root_mean_squared_log_error(y_true=y_val, y_pred=y_valid_hat)
print(f'Valid RMSLE: {valid_rmsle:.4f}')

Valid RMSLE: 0.1653


In [None]:
# Pair every test apartment id with its predicted price.
final_result = data_test[['id']].assign(price=final_labels)

In [None]:
# Write the id/price table to disk; index=False keeps the DataFrame index out of the file.
final_result.to_csv('submission.csv', index=False)

In [None]:
# Read the file that was just written back in and display it as a check of the export.
submission = pd.read_csv("submission.csv")
submission

Unnamed: 0,id,price
0,23285,2.214553e+07
1,23286,1.273516e+07
2,23287,6.059338e+06
3,23288,1.336547e+07
4,23289,5.072882e+06
...,...,...
9932,33217,3.032930e+07
9933,33218,2.187822e+07
9934,33219,9.436801e+06
9935,33220,8.494397e+06


In [None]:
submission = pd.DataFrame()
submission['id'] = data_test.id
submission['price_prediction'] = final_labels # Predict on non-nan entries
 # Fill missing entries with mean predictor
print(f'Generated {len(submission)} predictions')

# Export submission to csv with headers
#submission.to_csv('sample_submission.csv', index=False)

submission.to_csv('sample_submission.csv', index=False)

# Look at submitted csv
print('\nLine count of submission')
!wc -l sample_submission.csv

print('\nFirst 5 rows of submission')
!head -n 5 sample_submission.csv

Generated 9937 predictions

Line count of submission
9938 sample_submission.csv

First 5 rows of submission
id,price_prediction
23285,22145534.573678542
23286,12735162.37680138
23287,6059337.512269016
23288,13365466.83050742


In [None]:
import xgboost

# TODO: this cell originally ended with the unfinished statement
# `classifier=xgboost.`, which is a SyntaxError and stops Restart & Run All.
# It is left as a note until the intended xgboost estimator is chosen.

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=99c1e030-46bc-40fa-835c-0661eae0d488' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>