In [None]:
import json
import pandas as pd
import numpy as np
import math as math
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.model_selection as model_selection
import sklearn.tree as tree
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
import time
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor


In [None]:
# Notebook-wide configuration.
# Seed NumPy's global random generator so repeated runs draw the same random numbers.
np.random.seed(123)
sns.set_style('darkgrid')
# None removes the column-width limit so long text cells are shown in full.
pd.set_option('display.max_colwidth', None)

def describe_column(meta):
    """
    Build a column summariser suitable for ``DataFrame.apply``.

    Parameters
    ----------
    meta : iterable of dict
        One entry per column. Each entry is read for the keys ``'name'``,
        ``'type'`` and ``'desc'``; entries of type ``'categorical'`` are also
        read for ``'cats'`` (category labels, indexed by the integer code
        stored in the column).

    Returns
    -------
    callable
        ``f(x)`` taking a ``pd.Series`` and returning an object-dtype
        ``pd.Series`` named after the column with the fields ``'Type'``,
        ``'#NaN'``, ``'Description'`` and ``'Statistics'``.

    Raises
    ------
    KeyError
        Raised by ``f`` when ``meta`` has no entry whose ``'name'`` matches the
        column name.
    """
    def f(x):
        # First metadata entry for this column. The explicit default avoids a
        # bare StopIteration escaping through DataFrame.apply when the column
        # is not described in the metadata.
        m = next((entry for entry in meta if entry['name'] == x.name), None)
        if m is None:
            raise KeyError(f'No metadata entry for column {x.name!r}')

        d = pd.Series(name=x.name, dtype=object)
        d['Type'] = m['type']
        d['#NaN'] = x.isna().sum()
        d['Description'] = m['desc']
        if m['type'] == 'categorical':
            # Translate integer codes to their labels before counting.
            counts = x.dropna().map(dict(enumerate(m['cats']))).value_counts().sort_index()
            d['Statistics'] = ', '.join(f'{c}({n})' for c, n in counts.items())
        elif m['type'] == 'real' or m['type'] == 'integer':
            stats = x.dropna().agg(['mean', 'std', 'min', 'max'])
            d['Statistics'] = ', '.join(f'{s}={v :.1f}' for s, v in stats.items())
        elif m['type'] == 'boolean':
            counts = x.dropna().astype(bool).value_counts().sort_index()
            d['Statistics'] = ', '.join(f'{c}({n})' for c, n in counts.items())
        else:
            # Any other type: only report the number of distinct values.
            d['Statistics'] = f'#unique={x.nunique()}'
        return d
    return f


def describe_data(data, meta):
    """
    Summarise every column of `data` as a left-aligned styled table.

    Each column is passed through ``describe_column(meta)``; the result is
    transposed so that there is one row per column of `data`, and returned as
    a pandas Styler with left-aligned cells and headers.
    """
    summary = data.apply(describe_column(meta)).T
    header_rule = dict(selector='th', props=[('text-align', 'left')])
    styled = summary.style.set_properties(**{'text-align': 'left'})
    return styled.set_table_styles([header_rule])
    

# Load the apartment listings and the JSON metadata that describes their columns.
apartments = pd.read_csv('apartments_train.csv')
#print(f'Loaded {len(apartments)} apartments')
with open('apartments_meta.json') as f:
    apartments_meta = json.load(f)
# NOTE(review): bare expression in the middle of the cell, so the summary table
# is built and then discarded; wrap it in display(...) if it should be shown.
(describe_data(apartments, apartments_meta))

# Load the buildings table and its column metadata the same way.
buildings = pd.read_csv('buildings_train.csv')
#print(f'Loaded {len(buildings)} buildings')
with open('buildings_meta.json') as f:
    buildings_meta = json.load(f)
# NOTE(review): neither of the next two expressions is the last statement of the
# cell, so their results are not rendered either.
buildings.head()
describe_data(buildings, buildings_meta)

#print(f'All apartments have an associated building: {apartments.building_id.isin(buildings.id).all()}')
# Left join: every apartment row is kept and receives the columns of the building
# whose `id` equals the apartment's `building_id`. An apartment with no matching
# building gets NaN in the building columns.
data = pd.merge(apartments, buildings.set_index('id'), how='left', left_on='building_id', right_index=True)
#print(data.head())


def root_mean_squared_log_error(y_true, y_pred):
    """
    Root mean squared logarithmic error (RMSLE).

    Computes ``sqrt(mean((log1p(y_pred) - log1p(y_true)) ** 2))``.

    Parameters
    ----------
    y_true, y_pred : array-like of non-negative numbers
        Lists, tuples, NumPy arrays and pandas Series are all accepted. The
        two inputs are compared element by element in positional order.

    Returns
    -------
    numpy.float64
        The RMSLE; 0.0 for a perfect prediction.

    Raises
    ------
    AssertionError
        If either input contains a negative value (or NaN, which fails the
        ``>= 0`` comparison).
    """
    # Alternatively: sklearn.metrics.mean_squared_log_error(y_true, y_pred) ** 0.5
    # Coerce up front so plain Python sequences support the vectorised checks below.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    assert (y_true >= 0).all(), 'y_true must be non-negative'
    assert (y_pred >= 0).all(), 'y_pred must be non-negative'
    log_error = np.log1p(y_pred) - np.log1p(y_true)  # Note: log1p(x) = log(1 + x)
    return np.mean(log_error ** 2) ** 0.5


In [None]:
def clean_data(data):
    """
    Drop the columns that are not used for modelling and fill missing values.

    The input frame is left untouched; a new frame is returned in which the
    listed columns are removed and NaNs in the remaining listed columns are
    replaced by fixed defaults (for 'ceiling': the mean of the input column).
    """
    # Columns removed from the frame.
    unused_columns = [
        "id", "area_total", "building_id",
        "street", "address", "stories",
        'bathrooms_shared', 'bathrooms_private', 'constructed',
        'windows_street', 'windows_court', 'layout',
        'area_kitchen', 'area_living',
    ]

    # NaN replacement per column. Original author's notes: 'floor' and 'rooms'
    # have no NaNs, so they are not listed; the 'district' default is a
    # placeholder to revisit.
    fill_values = {
        'ceiling': data['ceiling'].mean(),
        'condition': 3,
        'phones': 1,
        'new': 0,
        'material': 2,
        'elevator_without': 0,
        'elevator_passenger': 1,
        'elevator_service': 0,
        'parking': 1,
        'garbage_chute': 0,
        'heating': 0,
        'balconies': 0,
        'loggias': 0,
        'seller': 2,
        'district': 2,
    }

    trimmed = data.drop(columns=unused_columns)
    imputed = {
        column: trimmed[column].fillna(default)
        for column, default in fill_values.items()
    }
    # assign() overwrites existing columns in place, so column order is kept.
    return trimmed.assign(**imputed)

# Clean the merged frame, then hold out 33% of the rows for validation.
data_ = clean_data(data)
print(data_.head())
# Stratify on the rounded natural log of the price so both splits contain a
# similar mix of price magnitudes.
price_strata = np.log(data_.price).round()
data_train, data_valid = model_selection.train_test_split(
    data_,
    test_size=0.33,
    stratify=price_strata,
)


   seller       price  floor  rooms   ceiling  balconies  loggias  condition  \
0     3.0   7139520.0    2.0    2.0  2.650000        0.0      0.0        3.0   
1     2.0  10500000.0   18.0    3.0  3.268374        1.0      0.0        3.0   
2     3.0   9019650.0   12.0    3.0  2.650000        0.0      0.0        3.0   
3     2.0  10500000.0   18.0    3.0  3.268374        0.0      1.0        2.0   
4     2.0  13900000.0    7.0    2.0  2.900000        0.0      1.0        3.0   

   phones  new   latitude  longitude  district  material  elevator_without  \
0     1.0  1.0  55.544046  37.478055      11.0       3.0               0.0   
1     1.0  0.0  55.861282  37.666647       2.0       3.0               0.0   
2     1.0  1.0  55.663299  37.515335       6.0       3.0               0.0   
3     1.0  0.0  55.861282  37.666647       2.0       3.0               0.0   
4     1.0  0.0  55.590785  37.451438      11.0       2.0               0.0   

   elevator_passenger  elevator_service  parking  

In [None]:
"""
#data.describe()
X_train = data_train.drop(columns=["id", "building_id", "street", "address"])
X_valid = data_valid.drop(columns=["id", "building_id", "street", "address"])
y_train = data_train.price
y_valid = data_valid.price
model1 = DecisionTreeRegressor().fit(X_train, y_train)
y_train_hat = model1.predict(X_train)
y_valid_hat = model1.predict(X_valid)
print(f'Train RMSLE: {root_mean_squared_log_error(y_true=y_train, y_pred=y_train_hat) :.4f}')
print(f'Valid RMSLE: {root_mean_squared_log_error(y_true=y_valid, y_pred=y_valid_hat) :.4f}')
"""


'\n#data.describe()\nX_train = data_train.drop(columns=["id", "building_id", "street", "address"])\nX_valid = data_valid.drop(columns=["id", "building_id", "street", "address"])\ny_train = data_train.price\ny_valid = data_valid.price\nmodel1 = DecisionTreeRegressor().fit(X_train, y_train)\ny_train_hat = model1.predict(X_train)\ny_valid_hat = model1.predict(X_valid)\nprint(f\'Train RMSLE: {root_mean_squared_log_error(y_true=y_train, y_pred=y_train_hat) :.4f}\')\nprint(f\'Valid RMSLE: {root_mean_squared_log_error(y_true=y_valid, y_pred=y_valid_hat) :.4f}\')\n'

In [None]:
#Random forest
from sklearn.ensemble import RandomForestRegressor

# Features are every column except the target. Only 'price' is dropped here:
# clean_data() has already removed 'id', and asking drop() for a column that
# no longer exists raises KeyError.
X_train = data_train.drop(columns=['price'])
X_valid = data_valid.drop(columns=['price'])
print(len(X_train))
print(len(X_valid))
y_train = data_train.price
y_valid = data_valid.price

model = RandomForestRegressor(n_estimators=50).fit(X_train, y_train)

y_train_hat = model.predict(X_train)
y_valid_hat = model.predict(X_valid)
# TODO: predict on the test set once a cleaned test feature frame is available;
# predict() needs a feature matrix and cannot be called without arguments.
print(f'Train RMSLE: {root_mean_squared_log_error(y_true=y_train, y_pred=y_train_hat) :.4f}')
print(f'Valid RMSLE: {root_mean_squared_log_error(y_true=y_valid, y_pred=y_valid_hat) :.4f}')

15600
7685
Train RMSLE: 0.0031
Valid RMSLE: 0.0048


In [None]:
# Build the submission table: ids from `data_test`, predicted prices from `final_labels`.
# NOTE(review): neither `data_test` nor `final_labels` is defined in any cell
# visible in this notebook, so this cell fails on a fresh kernel — confirm where
# they are supposed to come from.
submission = pd.DataFrame()
submission['id'] = data_test.id
submission['price_prediction'] = final_labels
print(f'Generated {len(submission)} predictions')

# Export submission to csv with a header row and without the index column
submission.to_csv('sample_submission.csv', index=False)

# Look at submitted csv (shell commands run through IPython's `!` escape)
print('\nLine count of submission')
!wc -l sample_submission.csv

print('\nFirst 5 rows of submission')
!head -n 5 sample_submission.csv

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=99c1e030-46bc-40fa-835c-0661eae0d488' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>