In [1]:
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.base import TransformerMixin
# import lightgbm as lgb
from sklearn.decomposition import PCA

In [2]:
# Input CSV locations, given relative to the notebook's working directory.
train_path, test_path = 'train.csv', 'test.csv'

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)

# The last training column is the target; every column before it is a feature.
x_train, y_train = train_data.iloc[:, :-1], train_data.iloc[:, -1]

# The test frame is used as-is: x_test is the same object as test_data, not a copy.
x_test = test_data

FileNotFoundError: File b'train.csv' does not exist

In [None]:
class Data_Transform(TransformerMixin):
    """Select a fixed set of columns and make them numeric for a regressor.

    ``timestamp`` is converted to whole seconds since the Unix epoch and
    ``sub_area`` is one-hot encoded against the categories recorded by
    ``fit``, so ``transform`` yields the same dummy columns, in the same
    order, for every input it is given after fitting.

    Parameters
    ----------
    columns : list of str
        Names of the columns to keep. ``transform`` reads ``'timestamp'`` and
        ``'sub_area'`` from this selection, so both must be included.
    """

    def __init__(self, columns):
        self._columns = columns
        # Distinct non-null sub_area values seen by fit(); None until fitted.
        self._categories = None

    def _select(self, X):
        """Return a new DataFrame holding only the configured columns."""
        if isinstance(X, list):
            return pd.DataFrame.from_records(X, columns=self._columns)
        # Copy so the caller's frame is never modified and the column
        # assignments in transform() do not act on a view of it.
        return X.loc[:, self._columns].copy()

    def transform(self, X):
        """Return the selected columns with timestamp and sub_area encoded.

        Parameters
        ----------
        X : pandas.DataFrame or list of records
            Input rows; a list is converted with ``DataFrame.from_records``.

        Returns
        -------
        pandas.DataFrame
            The ``sub_area_*`` indicator columns (including a NaN indicator)
            followed by the remaining selected columns; ``sub_area`` itself
            is dropped.
        """
        filtered = self._select(X)

        # Epoch seconds computed by timedelta division, which does not depend
        # on the resolution pandas chose for the parsed datetimes.
        moments = pd.to_datetime(filtered['timestamp'], utc=True)
        epoch = pd.Timestamp('1970-01-01', tz='UTC')
        filtered['timestamp'] = (moments - epoch) // pd.Timedelta(seconds=1)

        # Values outside the fitted categories become NaN here and are then
        # counted in the NaN indicator column produced by dummy_na=True.
        filtered['sub_area'] = pd.Categorical(filtered['sub_area'],
                                              categories=self._categories)
        one_hots = pd.get_dummies(filtered['sub_area'], prefix='sub_area',
                                  dummy_na=True)

        return pd.concat((one_hots, filtered.drop('sub_area', axis=1)), axis=1)

    def fit(self, X, y=None, *_):
        """Record the distinct non-null ``sub_area`` values found in ``X``.

        Accepts the same inputs as ``transform``. ``y`` and any further
        positional arguments are ignored. Returns ``self``.
        """
        if isinstance(X, list):
            X = pd.DataFrame.from_records(X, columns=self._columns)
        # NaN cannot be a category; missing values are represented by the
        # dedicated NaN indicator column instead.
        observed = pd.Series(X['sub_area']).dropna().unique()
        self._categories = list(observed)
        return self


class XGRegr(XGBRegressor):
    """XGBRegressor that additionally offers a ``transform`` method.

    The constructor takes the hyper-parameters listed in its signature and
    forwards each of them, unchanged, to ``XGBRegressor.__init__``.
    """

    # NOTE(review): `silent`, `nthread`, `seed` and objective='reg:linear' look
    # like legacy xgboost names - confirm the installed version accepts them.
    def __init__(self, max_depth=3, learning_rate=0.1, n_estimators=100, silent=True,
                 objective='reg:linear', booster='gbtree', n_jobs=1, nthread=None,
                 gamma=0, min_child_weight=1, max_delta_step=0, subsample=1,
                 colsample_bytree=1, colsample_bylevel=1, reg_alpha=0,
                 reg_lambda=1, scale_pos_weight=1, base_score=0.5, random_state=0,
                 seed=None, missing=None, **kwargs):
        # Collect the named hyper-parameters once, then pass them on together
        # with any extra keyword arguments.
        forwarded = dict(
            max_depth=max_depth,
            learning_rate=learning_rate,
            n_estimators=n_estimators,
            silent=silent,
            objective=objective,
            booster=booster,
            n_jobs=n_jobs,
            nthread=nthread,
            gamma=gamma,
            min_child_weight=min_child_weight,
            max_delta_step=max_delta_step,
            subsample=subsample,
            colsample_bytree=colsample_bytree,
            colsample_bylevel=colsample_bylevel,
            reg_alpha=reg_alpha,
            reg_lambda=reg_lambda,
            scale_pos_weight=scale_pos_weight,
            base_score=base_score,
            random_state=random_state,
            seed=seed,
            missing=missing,
        )
        super().__init__(**forwarded, **kwargs)

    def transform(self, *args):
        """Predict for the given inputs and return the result as one column.

        All positional arguments are passed straight to ``predict``; the
        predictions are reshaped to two dimensions with a single column.
        """
        predictions = self.predict(*args)
        return predictions.reshape(-1, 1)

    
def new_model():
    """Build an unfitted pipeline: column preparation followed by XGBoost.

    Returns
    -------
    Pipeline
        Two steps: ``'trans'`` (a ``Data_Transform`` over a fixed list of
        columns) and ``'xgb'`` (an ``XGRegr`` with fixed hyper-parameters).
    """
    feature_columns = [
        'full_sq', 'life_sq', 'floor', 'max_floor',
        'num_room', 'build_year', 'timestamp', 'sub_area',
    ]

    regressor = XGRegr(
        n_estimators=100,
        learning_rate=0.08,
        gamma=0,
        subsample=0.75,
        colsample_bytree=1,
        max_depth=7,
    )
    preprocessor = Data_Transform(feature_columns)

    return Pipeline(steps=[('trans', preprocessor), ('xgb', regressor)])

# Create a fresh pipeline, then train it on the training features and target.
model = new_model()
model.fit(X=x_train, y=y_train)


In [None]:
# There is no real point to this structure, of course; it is used only to practice FeatureUnion and Pipeline,
# as the best result is obtained by simply removing all columns with NaN data and using XGBoost.
# The Kaggle score for it is 0.338.

# In this homework I haven't written the transformer function for the data, but I will do that for the Flask homework.

In [None]:
# Predict on the raw test features. Missing values are left as NaN, exactly
# as they were when the pipeline was fitted on x_train. Replacing them with 0
# only at prediction time would present the model with real-looking zeros
# where it was trained to see missing values.
y_test = model.predict(x_test)

In [None]:
# Attach the predictions as a 'price_doc' column. This modifies x_test in
# place, and therefore test_data too, because both names refer to one frame.
x_test['price_doc'] = y_test


In [None]:
# Write the submission file: the 'id' and 'price_doc' columns, without the index.
x_test.loc[:, ['id', 'price_doc']].to_csv('soln.csv', index=False)
