In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_log_error
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import preprocessing
from sklearn.preprocessing import  StandardScaler, PolynomialFeatures
from sklearn.pipeline import make_pipeline
from xgboost import XGBRegressor
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
import seaborn as sns
import eli5
import xgboost

In [2]:
train_df = pd.read_csv('train_bFQbE3f/train.csv')
test_df = pd.read_csv('test_9K3DBWQ.csv')

In [3]:
id_list = test_df['id'].values.tolist()

In [4]:
train_df['cc_cons'].describe()

count     32820.000000
mean       6824.631840
std       12459.140593
min           0.000000
25%        1285.000000
50%        3141.000000
75%        7349.250000
max      408630.000000
Name: cc_cons, dtype: float64

In [5]:
#skewness and kurtosis
print("Skewness: %f" % train_df['cc_cons'].skew())
print("Kurtosis: %f" % train_df['cc_cons'].kurt())

Skewness: 7.518036
Kurtosis: 109.136639


In [6]:
df_na = (train_df.isnull().sum() / len(train_df)) * 100
df_na = df_na.sort_values(ascending=False)[:43]
missing_data = pd.DataFrame({'Missing Ratio' : df_na})
#missing_data.head(50)

In [7]:
#Impute null values
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

In [8]:
#correlation matrix
# features = ['debit_count_may',  'debit_count_apr' , 'debit_count_jun' , 'credit_count_apr', 'credit_count_may',
#             'credit_count_jun', 'cc_count_jun' , 'cc_count_may', 'cc_count_apr', 'dc_count_apr', 'dc_count_may', 'dc_count_jun',
#             'age', 'cc_cons_may', 'cc_cons_apr', 'cc_cons_jun', 'cc_cons', 'emi_active']
# corrmat = train_df.loc[:,features].corr()
# f, ax = plt.subplots(figsize=(12, 9))
#sns.heatmap(corrmat, vmax=.8, square=True);

In [9]:
#Fix outliers in train data
train_df = train_df.drop(train_df[train_df['id'] == 10083].index)
train_df = train_df.drop(train_df[train_df['id'] == 23211].index)
train_df = train_df.drop(train_df[train_df['id'] == 16198].index)
train_df = train_df.drop(train_df[train_df['age'] > 200].index)

In [10]:
print(len(train_df))
print(len(test_df))

32805
14067


In [11]:
#Define function for creating age bins
def label_age(row):
    age = row['age']
    if age>0 and age <30:
        return '1'
    if age >=30 and age<60 :
        return '2'
    if age >=60 :
        return '3'
    return 'Other'

In [12]:
#Derive features from columns for train data
train_df['cc_cons_avg'] = train_df[['cc_cons_apr', 'cc_cons_may', 'cc_cons_jun']].mean(axis=1)
train_df['cc_cons_var'] = train_df[['cc_cons_apr', 'cc_cons_may', 'cc_cons_jun']].var(axis=1)
train_df['age_group'] = train_df.apply(lambda row: label_age(row), axis=1)

#Derive features from columns for test data
test_df['cc_cons_avg'] = test_df[['cc_cons_apr', 'cc_cons_may', 'cc_cons_jun']].mean(axis=1)
test_df['age_group'] = test_df.apply(lambda row: label_age(row), axis=1)
test_df['cc_cons_var'] = test_df[['cc_cons_apr', 'cc_cons_may', 'cc_cons_jun']].var(axis=1)

In [13]:
#get dummies
train_df = pd.get_dummies(train_df, columns=['gender', 'age_group'], prefix = ['gender', 'age_group'])
test_df = pd.get_dummies(test_df, columns=['gender', 'age_group'], prefix = ['gender', 'age_group'])

In [14]:
features = ['cc_cons_var','age_group_1','age_group_2', 'age_group_3', 'gender_F', 'gender_M','card_lim', 'emi_active', 'cc_cons_apr', 'cc_cons_may', 'cc_cons_jun']
features_log_transform = ['cc_cons_var','emi_active', 'cc_cons_apr', 'cc_cons_may', 'cc_cons_jun', 'card_lim']
features_scale = ['cc_cons_var','emi_active', 'cc_cons_apr', 'cc_cons_may', 'cc_cons_jun', 'card_lim']

In [15]:
#Transform features
train_df[features_log_transform] = np.log1p(train_df[features_log_transform])
train_df['cc_cons'] = np.log1p(train_df['cc_cons'])
test_df[features_log_transform] = np.log1p(test_df[features_log_transform])

In [16]:
#Scale features
scaler = StandardScaler()
train_df[features_scale] = scaler.fit_transform(train_df[features_scale])
test_df[features_scale] = scaler.transform(test_df[features_scale])

In [17]:
#Create input features and target variable
X = train_df[features]
X.reset_index(drop=True, inplace=True)
y = train_df['cc_cons']
y.reset_index(drop=True, inplace=True)

In [18]:
X_test = test_df[features]
X_test.reset_index(drop=True, inplace=True)

In [19]:
# params = {'min_child_weight':[4,5,6], 'gamma':[i/10.0 for i in range(3,6)],  'subsample':[i/10.0 for i in range(6,11)],
# 'colsample_bytree':[i/10.0 for i in range(6,11)], 'max_depth': [2,3,4]}
# xgb = XGBRegressor(nthread=-1) 
# grid = GridSearchCV(xgb, params)
# grid.fit(X_train, y_train)

In [20]:
# print("best_params_",grid.best_params_)
# best_params_ {'colsample_bytree': 0.7, 'gamma': 0.4, 'max_depth': 3, 'min_child_weight': 4, 'subsample': 0.8}

In [21]:
X_train, X_cv, y_train, y_cv = train_test_split(X, y, test_size=0.2, random_state=52)

In [22]:
#Train using XGBoost using best parameters obtained from GridSearch
xgb = xgboost.XGBRegressor(colsample_bytree=0.7, gamma=0.4,
       importance_type='gain', learning_rate=0.1, max_depth=3, min_child_weight=4, subsample=0.8, objective='reg:squarederror')
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_cv)
eli5.show_weights(xgb, feature_names=list(X.columns),
                     show_feature_values=True, top=45)

Weight,Feature
0.2037,card_lim
0.1916,cc_cons_may
0.1537,cc_cons_jun
0.1181,cc_cons_apr
0.08,cc_cons_var
0.0783,gender_M
0.0696,gender_F
0.0353,emi_active
0.0288,age_group_3
0.0283,age_group_2


In [23]:
print('score : {}'.format(100*mean_squared_log_error(y_cv, y_pred_xgb)))

score : 2.0819577329508467


In [24]:
# from sklearn.model_selection import learning_curve
# train_sizes, train_scores, valid_scores = learning_curve(xgb, X, y, train_sizes=[0.6, 0.7, 0.8], cv=3, scoring='neg_mean_squared_log_error')

In [25]:
# train_scores_mean = np.mean(train_scores, axis=1)
# train_scores_std = np.std(train_scores, axis=1)
# test_scores_mean = np.mean(valid_scores, axis=1)
# test_scores_std = np.std(valid_scores, axis=1)
# plt.grid()

# plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
#                  train_scores_mean + train_scores_std, alpha=0.1,
#                  color="r")
# plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
#                  test_scores_mean + test_scores_std, alpha=0.1, color="g")
# plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
#          label="Training score")
# plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
#          label="Cross-validation score")

# plt.legend(loc="best")

In [26]:
#Predict on test data
xgb = xgboost.XGBRegressor(colsample_bytree=0.7, gamma=0.4,
       importance_type='gain', learning_rate=0.1, max_depth=3, min_child_weight=4, subsample=0.8, objective='reg:squarederror')
xgb.fit(X, y)
y_test = xgb.predict(X_test)
cc_cons_list = [np.expm1(item) for item in y_test.tolist()]
submission_data = {'id': id_list, 'cc_cons': cc_cons_list}
submission = pd.DataFrame(data=submission_data)
submission.to_csv('submission.csv', index=False)