# Sberbank Russian Housing Market

In [1]:
import pandas as pd
from pandas.io.json import build_table_schema
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import missingno as msno
import numpy as np
import scipy as sp
import os
import xgboost as xgb
import visualization as vis
from pandas.plotting import scatter_matrix
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas warnings (e.g. SettingWithCopyWarning)
# that the cleaning cells further down may raise.
warnings.filterwarnings("ignore")
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Directory holding the input CSVs; joined with each file name when loading.
DIR_PATH = './'

In [2]:
# Environment check: show the install location of the imported pandas package.
pd.__path__

['/usr/local/lib/python2.7/site-packages/pandas']

In [3]:
## Read the three input CSVs into pandas DataFrames.
# train.csv and test.csv share the same read options: the `id` column becomes
# the index and `timestamp` is parsed as a date.
train_raw, test_raw = (
    pd.read_csv(os.path.join(DIR_PATH, csv_name),
                header='infer',
                index_col='id',
                parse_dates=['timestamp'])
    for csv_name in ('train.csv', 'test.csv')
)

# macro.csv is read as-is (default integer index, no date parsing).
macro_csv_path = os.path.join(DIR_PATH, 'macro.csv')
macro = pd.read_csv(macro_csv_path, header='infer')

### Data Understanding & Data Preparation

In [4]:
## Table-schema summaries: one DataFrame per source, built from the 'fields'
## list that build_table_schema returns for that frame.
train_schema, macro_schema = (
    pd.DataFrame(build_table_schema(source_frame)['fields'])
    for source_frame in (train_raw, macro)
)

In [5]:
## start from property features first
property_features = ['timestamp', 'full_sq', 'life_sq', 'floor', 
                     'max_floor', 'material', 'build_year', 'num_room',
                     'kitch_sq', 'state', 'product_type', 'sub_area', 
                     'price_doc']

# Take explicit copies: the cleaning cells below assign into these frames with
# .loc[...] = value. Without .copy() pandas treats them as selections of the raw
# frames and raises SettingWithCopyWarning on every assignment; copying also
# guarantees train_raw / test_raw are never modified by the cleaning steps.
train = train_raw[property_features].copy()
# Every feature except the last list entry ('price_doc') is selected for test.
test = test_raw[property_features[:-1]].copy()

## Cleaning the property features

## full_sq:

In [19]:
# full_sq cleaning, applied identically to both frames:
# values below 10 or above 300 are replaced with NA.
for _frame in (train, test):
    # <10 -> NA
    _frame.loc[_frame['full_sq'].lt(10), 'full_sq'] = np.nan

    # >300 -> NA
    _frame.loc[_frame['full_sq'].gt(300), 'full_sq'] = np.nan

## life_sq:

In [34]:
# life_sq cleaning, applied identically to both frames, in three steps.
for _frame in (train, test):
    # 0 or 1 -> NA
    _frame.loc[_frame['life_sq'].isin([0, 1]), 'life_sq'] = np.nan

    # > 300 -> NA
    _frame.loc[_frame['life_sq'].gt(300), 'life_sq'] = np.nan

    # life_sq exceeds full_sq AND life_sq > 100 -> NA
    _frame.loc[_frame['life_sq'].gt(_frame['full_sq']) & _frame['life_sq'].gt(100),
               'life_sq'] = np.nan

## max_floor:

In [33]:
# max_floor cleaning, applied identically to both frames.
# The order matters: the 117 correction runs first so the corrected value takes
# part in the floor / range checks that follow.
for _frame in (train, test):
    # 117 -> 17
    _frame.loc[_frame['max_floor'] == 117, 'max_floor'] = 17

    # floor > max_floor -> NA
    _frame.loc[_frame['floor'] > _frame['max_floor'], 'max_floor'] = np.nan

    # >60 -> NA
    _frame.loc[_frame['max_floor'] > 60, 'max_floor'] = np.nan

    # 0 -> NA
    # (The comparison used to be `== 60`, which left the 0 values untouched and
    # instead blanked 60, a value the `> 60` rule above deliberately keeps.)
    _frame.loc[_frame['max_floor'] == 0, 'max_floor'] = np.nan

## material:

In [22]:
# material: the code 3 is replaced with NA in both frames.
col = 'material'
for _frame in (train, test):
    _frame.loc[_frame[col].eq(3), col] = np.nan

## build_year:

In [23]:
# build_year cleaning, applied identically to both frames.
for _frame in (train, test):
    # Specific value corrections, in order:
    #   20052009 -> 2009
    #   4965     -> 1965
    #   71       -> 1971
    for _wrong_year, _fixed_year in ((20052009, 2009), (4965, 1965), (71, 1971)):
        _frame.loc[_frame['build_year'].eq(_wrong_year), 'build_year'] = _fixed_year

    # <1800 -> NA (runs after the corrections, so the corrected 71 -> 1971 survives)
    _frame.loc[_frame['build_year'].lt(1800), 'build_year'] = np.nan

## num_room:

In [24]:
# >9 -> NA
# Same threshold for both frames: room counts above 9 become NA.
for _frame in (train, test):
    _frame.loc[_frame['num_room'].gt(9), 'num_room'] = np.nan

## kitch_sq:

In [32]:
# kitch_sq larger than full_sq AND kitch_sq above 100 -> NA (both frames)
for _frame in (train, test):
    _frame.loc[_frame['kitch_sq'].gt(_frame['full_sq']) & _frame['kitch_sq'].gt(100),
               'kitch_sq'] = np.nan

## state:

In [25]:
## state: the value 33 is rewritten to 3 in both frames
for _frame in (train, test):
    _frame.loc[_frame['state'].eq(33), 'state'] = 3