In [46]:
import numpy as np
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
from scipy.stats import skew

from sklearn.preprocessing import scale



df_train = pd.read_csv("train.csv", parse_dates=['timestamp'])
df_test = pd.read_csv("test.csv", parse_dates=['timestamp'])
df_macro = pd.read_csv("macro.csv", parse_dates=['timestamp'])

df_train.head()

# =============================
# =============================
# cleanup
# brings error down a lot by removing extreme price per sqm
print(df_train.shape)
df_train.loc[df_train.full_sq == 0, 'full_sq'] = 30
df_train = df_train[df_train.price_doc/df_train.full_sq <= 600000]
df_train = df_train[df_train.price_doc/df_train.full_sq >= 10000]
print(df_train.shape)
# =============================
# =============================

(30471, 292)
(30404, 292)


In [21]:
df_train = df_train[(df_train['timestamp'] >= '2013-7-1')]

In [20]:
y_train = df_train['price_doc'].values
id_test = df_test['id']

df_train.drop(['id', 'price_doc'], axis=1, inplace=True)
df_test.drop(['id'], axis=1, inplace=True)

df_all = (df_train+df_test).join(df_macro)
num_train = len(df_train)    # save so can separate later
df_all = pd.concat([df_train, df_test])
df_all = df_all.join(df_macro, on='timestamp', rsuffix='_macro')
print(df_all.shape)


# ==============================
# Add month-year
month_year = (df_all.timestamp.dt.month + df_all.timestamp.dt.year * 100)
month_year_cnt_map = month_year.value_counts().to_dict()
df_all['month_year_cnt'] = month_year.map(month_year_cnt_map)

# Add week-year count
week_year = (df_all.timestamp.dt.weekofyear + df_all.timestamp.dt.year * 100)
week_year_cnt_map = week_year.value_counts().to_dict()
df_all['week_year_cnt'] = week_year.map(week_year_cnt_map)

# Add month and day-of-week
df_all['month'] = df_all.timestamp.dt.month
df_all['dow'] = df_all.timestamp.dt.dayofweek

# Other feature engineering
df_all['rel_floor'] = df_all['floor'] / df_all['max_floor'].astype(float)
df_all['rel_kitch_sq'] = df_all['kitch_sq'] / df_all['full_sq'].astype(float)

# Remove timestamp column (may overfit the model in train)
df_all.drop(['timestamp', 'timestamp_macro'], axis=1, inplace=True)







# Separate dtypes
df_numeric = df_all.select_dtypes(exclude=['object'])    
df_obj = df_all.select_dtypes(include=['object']).copy()


# Deal with categorical values
for c in df_obj:
    df_obj[c] = pd.factorize(df_obj[c])[0]    # how is this different from above????


# Log transform skewed numeric features 
get_col = df_numeric.dtypes[(df_numeric.dtypes == "int64") | (df_numeric.dtypes == "float64")].index
get_skews = df_numeric[get_col].apply(lambda x: skew(x.dropna()))
get_skews = get_skews[get_skews>0.5]
get_skews = get_skews.index
df_numeric[get_skews] = np.log1p(df_numeric[get_skews])       




# concatenate back    
df_values = pd.concat([df_numeric, df_obj], axis=1)









 


# # select continuous numeric columns
# numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
# num_df = df.select_dtypes(include=numerics)
# num_df.shape # 364 features


# # impute missing values with mean
# df = num_df.apply(lambda x: x.fillna(x.mean()),axis=0) # newdf is the numeric columns


# # scale the data frame
# df = (df - df.mean()) / (df.max() - df.min())


ValueError: columns overlap but no suffix specified: Index([u'timestamp'], dtype='object')

In [49]:

# Convert to numpy values
X_all = df_values.values
print(X_all.shape)


# Set the training and test sets
X_train = X_all[:num_train]
X_test = X_all[num_train:]


# Save the column names for features names
df_columns = df_values.columns


# Set the parameters
xgb_params = {
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 1
}

# Train the set against the actual prices and then mak predictions
dtrain = xgb.DMatrix(X_train, y_train, feature_names=df_columns)
dtest = xgb.DMatrix(X_test, feature_names=df_columns)


# Tune XGB `num_boost_rounds`
#cv_result = xgb.cv(xgb_params, dtrain, num_boost_round=1000, early_stopping_rounds=20,
#    verbose_eval=True, show_stdv=False)
#cv_result[['train-rmse-mean', 'test-rmse-mean']].plot()
#num_boost_rounds = len(cv_result)

num_boost_round = 489


# Run the model
model = xgb.train(dict(xgb_params, silent=0), dtrain, num_boost_round=num_boost_round)


# Plot the feature importance
# fig, ax = plt.subplots(1, 1, figsize=(8, 16))
# xgb.plot_importance(model, max_num_features=50, height=0.5, ax=ax)


# Make the predictions
y_pred = model.predict(dtest)
y_pred = np.round(y_pred * 0.99)
df_sub = pd.DataFrame({'id': id_test, 'price_doc': y_pred})


# Save the csv
df_sub.to_csv('sub2.csv', index=False)

(29301L, 394L)


In [53]:
# Inspect results
df_sub_orig = pd.read_csv('sub_orig.csv')
print(df_sub.mean())
print(df_sub_orig.mean())

id           3.430450e+04
price_doc    7.423775e+06
dtype: float64
id           3.430450e+04
price_doc    7.362361e+06
dtype: float64
