In [21]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold


In [22]:
def get_df(filename, train=True):
    '''
    read data and pre-process into dataframe
    '''
    # read train data
    X = pd.read_csv(filename)

    # select numeric columns and fill NA values
    numeric_columns = X.select_dtypes(include=['number']).columns
    X[numeric_columns] = X[numeric_columns].fillna(method="ffill")

    ids, Y = X["Id"], []
    if train:
        Y = X["SalePrice"]
        X = X.drop(["Id", "SalePrice"], axis=1)
    else:
        X = X.drop(["Id"], axis=1)

    # encode categorical data
    to_encode = []
    for name, values in X.iteritems():
        try:
            c = float(values[0])
            if np.isnan(c):
                raise Exception
        except:
            to_encode.append(name)

    label_encode = X[to_encode].apply(LabelEncoder().fit_transform)
    X = pd.concat([X, label_encode], axis=1)

    # drop encoded columns
    X = X.drop(label_encode, axis=1)

    return X, Y, ids


In [23]:
X, y, _ = get_df("data/train.csv")

# f, ax = plt.subplots(figsize=(12, 9))
# sns.heatmap((pd.concat([X_train, y_train], axis=1).corr()))


In [26]:
model = xgb.XGBRegressor()

# do grid search and cross validation
xgb_hyperparams = {'n_estimators': [100, 150],
                   'learning_rate': [0.01, 0.1],
                   'max_depth': [3, 4]}

opt_model = GridSearchCV(model, xgb_hyperparams, n_jobs=4,
                   cv=StratifiedKFold(n_splits=4, shuffle=True),
                   scoring='neg_mean_squared_log_error')

opt_model.fit(X, y)

# print evaluation
print(opt_model.best_score_)
print(opt_model.best_params_)




-0.01815764352797398
{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 150}


In [27]:
# predicting test set and create submission file
with open("submission.txt", 'w') as outfile:
    outfile.write("Id,SalePrice\n")

    X_test, _, ids = get_df("data/test.csv", train=False)

    preds = opt_model.predict(X_test)
    for pred, id in zip(preds, ids):
        outfile.write(str(id) + ',' + str(pred) + '\n')
