In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import math
import copy

from datetime import datetime

from sklearn.preprocessing import minmax_scale
from sklearn.metrics import mean_absolute_error

from sklearn.feature_selection import RFE

from sklearn.svm import SVR
# Use the public import path: the private ``sklearn.ensemble.gradient_boosting``
# module was deprecated in scikit-learn 0.22 and removed in 0.24.
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression

%matplotlib inline
import matplotlib.pyplot as plt

# Show which files are available in the input directory.
print(os.listdir("../input"))

In [None]:
# Read the training frame and the test frame from the input directory
# (training file first, then the test file).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/Train.csv", "../input/Test.csv")
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# (number of rows, number of columns) of the training frame.
train.shape

In [None]:
# Histogram of the target column `price`, using 1000 bins.
plt.figure()
plt.hist(train['price'], bins = 1000)

In [None]:
# How often each distinct price occurs in the training frame.
train['price'].value_counts()

In [None]:
# Number of missing values in each training column.
train.isnull().sum()

In [None]:
# Number of missing values in each test column.
test.isnull().sum()

In [None]:
# Express every `date` as its age in whole seconds relative to ONE shared
# reference instant. Calling datetime.now() separately for each row would give
# every row a slightly different reference, and truncating only the training
# values to int would leave train and test on different representations.
date_reference = datetime.now()


def date_to_age_seconds(value, reference=date_reference):
    """Return the whole seconds elapsed between a 'YYYY-MM-DD' date and ``reference``.

    ``value`` is converted with ``str`` and parsed with the '%Y-%m-%d' format,
    so this cell expects the raw date strings as read from the CSV files.
    """
    parsed = datetime.strptime(str(value), '%Y-%m-%d')
    return int((reference - parsed).total_seconds())


train['date'] = train['date'].map(date_to_age_seconds)
test['date'] = test['date'].map(date_to_age_seconds)

In [None]:
# Replace missing values in the three columns below with the sentinel -1,
# first in the training frame and then in the test frame.
for frame in (train, test):
    for column_name in ('build_tech', 'g_lift', 'metro_dist'):
        frame[column_name] = frame[column_name].fillna(-1)
train.head()

In [None]:
# One histogram (1000 bins) per feature column, i.e. every training column
# except the target `price` and the `id`, stacked vertically in one figure.
# The grid is sized from the actual number of feature columns instead of a
# fixed 25 rows, so the cell keeps working when the column set changes.
feature_names = [name for name in train.columns if name not in ('price', 'id')]
n_panels = max(len(feature_names), 1)

# 6 inches of height per panel (the same ratio as a 150-inch figure with 25 rows).
plt.figure(figsize=(10, 6 * n_panels))
for position, name in enumerate(feature_names, start=1):
    plt.subplot(n_panels, 1, position)
    plt.hist(train[name], bins=1000)
    plt.title(name)
plt.tight_layout()

In [None]:
# Summary statistics of the training frame.
train.describe()

In [None]:
# Feature columns: every training column except the target `price` and `id`,
# kept in the order in which they appear in the training frame. Building this
# list through a set difference would make the order depend on string hashing,
# which differs between interpreter runs, so the column order of X / X_t (and
# anything that depends on it) would not be reproducible.
columns = [name for name in train.columns if name not in ('price', 'id')]
print(columns)

# Feature matrices for train and test, with the same column order in both.
X = train[columns].values
X_t = test[columns].values

# Regression target.
y = train['price']
print(y.head())
print(X.shape, X_t.shape)

In [None]:
# How often each distinct value of the `kw12` column occurs in the training frame.
print(train['kw12'].value_counts())

In [None]:
# Training frame restricted to the feature columns listed in `columns`.
C = train[columns]

In [None]:
# Report the feature columns in which a single value occurs in more than
# 60000 rows, together with the distinct values of that column.
for feature_name in columns:
    feature_values = C[feature_name]
    most_common_count = feature_values.value_counts().max()
    if most_common_count > 60000:
        print(feature_name, feature_values.unique())

In [None]:
# Same report with a stricter threshold: feature columns in which a single
# value occurs in more than 90000 rows.
for feature_name in columns:
    feature_values = C[feature_name]
    most_common_count = feature_values.value_counts().max()
    if most_common_count > 90000:
        print(feature_name, feature_values.unique())

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Learn the per-column minimum and range from the training features only and
# apply that same transformation to both matrices. Scaling the test matrix with
# its own minimum/maximum would map identical raw values to different scaled
# values in train and test.
feature_scaler = MinMaxScaler().fit(X)
X = feature_scaler.transform(X)
X_t = feature_scaler.transform(X_t)

In [None]:
def calc(model, data, price = None):
    """Return the mean absolute error of ``model`` averaged over a 7-fold split.

    Parameters
    ----------
    model : object with ``fit(X, y)`` and ``predict(X)``
        A shallow copy (``copy.copy``) is fitted for every fold; ``fit`` is
        never called on the object passed in.
    data : array-like of shape (n_samples, n_features)
        Feature values. They are min-max scaled as a whole before splitting.
    price : array-like of length n_samples, optional
        Target values. When omitted, the notebook-level ``y`` is used.

    Returns
    -------
    The mean of the per-fold mean absolute errors.
    """
    features = minmax_scale(data)
    # Convert the targets to a plain array so the fold indices are applied by
    # POSITION. Indexing a pandas Series with an integer array is label-based
    # (wrong or failing for a non-default index) and a plain list cannot be
    # indexed with an array at all.
    targets = np.asarray(y if price is None else price)

    # Fixed seed: every call evaluates on the same shuffled folds.
    splitter = KFold(n_splits=7, shuffle=True, random_state=7)
    fold_errors = []
    for train_index, test_index in splitter.split(features):
        fold_model = copy.copy(model)
        fold_model.fit(features[train_index], targets[train_index])
        predictions = fold_model.predict(features[test_index])
        fold_errors.append(mean_absolute_error(targets[test_index], predictions))
    return np.mean(fold_errors)

In [None]:
def find_feat(_model, _columns, _data, _price = None):
    """Greedy forward feature selection scored with ``calc``.

    Starting from an empty selection, each round scores every remaining column
    added to the current selection (``calc(_model, _data[selection + [column]],
    _price)``) and keeps the column with the lowest score. The search stops as
    soon as the best candidate does not strictly improve on the current score,
    or when no candidate columns are left.

    Parameters
    ----------
    _model : passed through to ``calc`` unchanged.
    _columns : iterable of column names to choose from. Candidates are tried in
        this order and, on equal scores, the earlier one wins, so the result
        does not depend on set/hash ordering.
    _data : object indexable with a list of column names (``_data[[...]]``).
    _price : passed through to ``calc`` unchanged.

    Returns
    -------
    (quality, selected_columns)
        The best score reached and the chosen columns in selection order.
        ``(float('inf'), [])`` when ``_columns`` is empty.
    """
    selected = []
    remaining = list(_columns)
    quality = float('inf')

    # Stop once the candidates run out; otherwise a round without candidates
    # would have no score to compare and nothing valid to append.
    while remaining:
        best_score = None
        best_column = None
        for candidate in remaining:
            score = calc(_model, _data[selected + [candidate]], _price)
            if best_score is None or score < best_score:
                best_score, best_column = score, candidate

        # Only a strict improvement extends the selection.
        if not best_score < quality:
            break
        quality = best_score
        selected.append(best_column)
        # Order-preserving removal keeps the candidate order deterministic.
        remaining = [name for name in remaining if name != best_column]

    return quality, selected

In [None]:
#tree_quality, tree_columns = find_feat(RandomForestRegressor(random_state=7,n_estimators=10), columns, train)
#boost_quality, boost_columns = find_feat(GradientBoostingRegressor(random_state=7,loss='lad'), columns, train)
#regr_quality, regr_columns = find_feat(LinearRegression(), columns, train)

In [None]:
#print(tree_quality, tree_columns)
#print(boost_quality, boost_columns)
#print(regr_quality, regr_columns)

In [None]:
# Hard-coded score and column list for the random-forest model.
# NOTE(review): presumably the recorded output of the commented-out
# `find_feat(RandomForestRegressor(...), columns, train)` call, kept so the
# search does not have to be re-run — TODO confirm.
tree_quality = 885688.3352337575
tree_columns = ['area', 'street_id', 'rooms', 'kw11']

In [None]:
#boost_quality = 1551065.1552422452
#boost_columns = ['area', 'street_id', 'rooms', 'n_photos', 'metro_dist', 'balcon', 'build_tech', 'kw3']

In [None]:
#regr_quality = 1690957.184157483
#regr_columns = ['area', 'rooms', 'n_photos', 'metro_dist', 'balcon', 'floor', 'build_tech', 'street_id', 'date', 'kw1', 'kw3', 'kw11', 'kw2', 'kw7', 'kw10', 'kw4', 'kw9', 'kw12', 'g_lift']

In [None]:
# Fit the final random forest on the selected feature columns and predict the
# test prices.
tree_data_train = train[tree_columns]
tree_data_test = test[tree_columns]
model_t = RandomForestRegressor(random_state=7, n_estimators=100)
model_t.fit(tree_data_train, y)
y_ans = model_t.predict(tree_data_test)

# Write the submission file with the columns `id,price`. Ids and predictions
# are paired by POSITION; looking predictions up by the DataFrame index label
# is only correct when the test frame has a default 0..n-1 index.
submission = pd.DataFrame({
    'id': test['id'].astype(int).values,
    'price': y_ans,
})
submission.to_csv("TREE_ANS.csv", index=False)

In [None]:
#samp = pd.read_csv("../input/SampleSubmission.csv")
#samp.head()

In [None]:
#ans = pd.read_csv("ANS.csv")
#ans.head()