In [None]:
import numpy as np
import pandas as pd

from sklearn import cross_validation
from sklearn import preprocessing
import xgboost as xgb

In [None]:
# Read the training and test CSV files into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)

In [None]:
# Display summary statistics of the training frame as the cell output.
train_df.describe()

In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Bar chart of how many training rows fall into each Response class (1..8).
response_classes = list(range(1, 9))
fig, axis1 = plt.subplots(nrows=1, ncols=1, figsize=(15, 5))

sns.countplot(x=train_df["Response"], order=response_classes, ax=axis1)

# Encode non-numeric columns

In [None]:
# Replace every object-typed column with integer labels. The encoder is fitted
# on the train and test values together so both frames share one mapping.
for col in train_df.columns:
    if train_df[col].dtype != 'object':
        continue
    train_values = list(train_df[col].values)
    test_values = list(test_df[col].values)
    encoder = preprocessing.LabelEncoder()
    encoder.fit(train_values + test_values)
    train_df[col] = encoder.transform(train_values)
    test_df[col] = encoder.transform(test_values)

In [None]:
# Preview the first rows of the training frame after the encoding step.
train_df.head()

# Fill missing values (NA)

In [None]:
# Impute missing feature values; the target column ('Response') is skipped.
#
# float64 columns are filled with the column mean, every other column with the
# column median. The fill value is computed on the training data only and then
# applied to BOTH frames, so train and test are imputed consistently and no
# test-set statistics leak into preprocessing.
#
# The result is assigned back to the column instead of calling
# `df[col].fillna(..., inplace=True)`: that form mutates a temporary Series,
# raises a chained-assignment warning in pandas 2.x and is a silent no-op when
# Copy-on-Write is enabled.
for col in train_df.columns:
    if col == 'Response':
        continue
    if train_df[col].dtype == 'float64':
        fill_value = train_df[col].mean()
    else:
        fill_value = train_df[col].median()
    train_df[col] = train_df[col].fillna(fill_value)
    test_df[col] = test_df[col].fillna(fill_value)

In [None]:
# Preview the first rows of the test frame after the NA-filling step.
test_df.head()

In [None]:
# Separate the model inputs from the identifier and target columns.
train_target = train_df['Response']
train_features = train_df.drop(['Id', 'Response'], axis='columns')

# The test ids are kept aside so they can be attached to the predictions later.
test_index = test_df['Id']
test_features = test_df.drop('Id', axis='columns')

# Get Validation Dataset

In [None]:
# Draw a random validation sample of (up to) 2000 training rows.
#
# - A dedicated seeded generator makes the sample reproducible across
#   Restart & Run All.
# - Rows are drawn without replacement so no row appears twice in the sample.
# - `.iloc` replaces `.ix`, which was removed in pandas 1.0; `val_index` holds
#   row positions in [0, n_rows), so positional indexing is the right accessor.
VAL_SIZE = 2000
VAL_SEED = 0

n_rows = train_df.shape[0]
val_rng = np.random.RandomState(VAL_SEED)
val_index = val_rng.choice(n_rows, size=min(VAL_SIZE, n_rows), replace=False)
val_data = train_df.iloc[val_index]
val_data.shape

In [None]:
# Split the validation sample into model inputs and the target array,
# dropping the same non-feature columns as for the training features.
val_features = val_data.drop(['Id', 'Response'], axis='columns')
val_target = val_data['Response'].values

# XGBoost parameters and training

In [None]:
# Build the XGBoost input matrices.
#
# The validation rows were sampled from train_df, so they are excluded from the
# training matrix here. Otherwise the validation score is measured on rows the
# model was fitted on and cannot be used to judge generalisation or to drive
# early stopping.
is_validation_row = train_features.index.isin(val_data.index)
dtrain = xgb.DMatrix(
    train_features[~is_validation_row],
    label=train_target[~is_validation_row],
)
dval = xgb.DMatrix(val_features, label=val_target)

In [None]:
# Booster parameters.
# - 'max_depth' is the XGBoost name for the tree depth limit; the previous
#   'depth' key is not a recognised parameter and was ignored (the default
#   depth was used instead).
# - Early stopping is NOT a booster parameter: it has to be passed to
#   xgb.train() as a keyword, otherwise all `num_round` rounds are run.
params = {
    'eta': 0.025,
    'objective': 'count:poisson',
    'max_depth': 20,
    'eval_metric': 'rmse',
    'nthread': 2,
    'subsample': 0.7,
    'colsample_bytree': 0.65,
    'min_child_weight': 3,
}

num_round = 20000  # upper bound on boosting rounds; early stopping usually ends sooner
early_stopping_rounds = 10  # stop after this many rounds without validation improvement

# Early stopping monitors the LAST entry of the evaluation list, so the
# validation set must stay last.
watchlist = [(dtrain, 'train'), (dval, 'validation')]

gbm = xgb.train(
    params,
    dtrain,
    num_round,
    evals=watchlist,
    early_stopping_rounds=early_stopping_rounds,
)

# Predict

In [None]:
# Wrap the test features for XGBoost and score them with the trained booster.
# `preds` holds the raw (continuous) model outputs, one per test row.
dtest = xgb.DMatrix(data=test_features)
preds = gbm.predict(dtest)

def output(x):
    """Map a raw prediction onto an integer class in the range 1..8.

    Values below 1 become 1 and values above 8 become 8. A value that is
    exactly 3 is mapped to 2 (the original author's "trick"). Everything else
    is rounded with the built-in ``round`` and returned as an ``int``.
    """
    # Clamp anything outside the valid label range first.
    if x < 1:
        return 1
    if x > 8:
        return 8
    # trick: an exact 3 is deliberately reported as 2.
    return 2 if x == 3 else int(round(x))

# Convert every raw prediction into its final integer class label.
result = list(map(output, preds))

In [None]:
# Pair each test Id with its predicted class and write the submission file
# (without the DataFrame index column).
submission_columns = {'Id': test_index, 'Response': result}
submit_df = pd.DataFrame(submission_columns)
submit_df.to_csv('prudential_xgboost_poissoncount.csv', index=False)