In [2]:
from matplotlib import pyplot as plt
from scipy import stats
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

In [4]:
trdf = pd.read_csv('data/Train.csv', sep=',', index_col=0)
tedf = pd.read_csv('data/Test.csv', sep=',', index_col=0)

In [5]:
df = trdf.fillna(trdf.mean(axis=0))

In [6]:
X_train = df.iloc[:, :-1]
Y_train = df.iloc[:, -1]


In [48]:
class MyTransofrmer:
    def __init__(self, categ_indexes,num_indexes):
        self.categ_indexes = categ_indexes
        self.num_indexes = num_indexes

    def fit_transform(self, X):
        return self.transform(X)
    
    def transform(self, X):
        X = np.array(X)
        date = pd.DatetimeIndex(X[:, self.categ_indexes].flatten()).year.values.reshape(-1,1)
        nums = X[:, self.num_indexes]
        return np.concatenate((date, nums), axis=1)

In [49]:
categ_indexes = [0]
num_indexes = np.arange(1,X_train.shape[1])
feature_transformer = MyTransofrmer(categ_indexes,num_indexes)

In [50]:
N = 100000
x = feature_transformer.fit_transform(X_train)
x_train, x_test, y_train, y_test = train_test_split(x[:N, :], Y_train.values[:N], test_size=.2, random_state=123)

In [51]:
model = Pipeline([
    ('pre', StandardScaler()),
    ('forest', RandomForestRegressor(
        n_estimators=200, 
        max_features=x.shape[1]//3))
])

In [52]:
%%time
model.fit(x, Y_train.values)

Wall time: 30.8 s


Pipeline(steps=[('pre', StandardScaler()),
                ('forest',
                 RandomForestRegressor(max_features=7, n_estimators=150))])

In [53]:
preds = model.predict(x_test)
r2 = r2_score(y_test, preds)
mae = mean_absolute_error(y_test, preds)
print(f"r^2 = {r2}; mae = {mae}")

r^2 = 0.9473951368429007; mae = 566884.4730289682


In [54]:
X = tedf.fillna(tedf.mean(axis=0))

In [55]:
X_test = feature_transformer.transform(X)

In [59]:
test_preds = model.predict(X_test)
pd_test_preds = pd.DataFrame(test_preds)
pd_test_preds.head()

Unnamed: 0,0
0,3133553.0
1,2729753.0
2,5584553.0
3,2121393.0
4,3328387.0


In [67]:
pd_test_preds.index = tedf.index
pd_test_preds.columns = ['price']
pd_test_preds.head()

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
100000,3133553.0
100001,2729753.0
100002,5584553.0
100003,2121393.0
100004,3328387.0


In [68]:
pd_test_preds.to_csv('sampleSubmission.csv',sep=',')