## Import Data

In [1]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns

  import pandas.util.testing as tm


In [2]:
print(os.listdir('./data'))

['train_new.csv', 'test_new.csv', 'feature_x.csv']


## Feature Engineering

In [3]:
app_train = pd.read_csv('./data/train_new.csv')
train_labels = app_train['Y']
app_train.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X65,X66,X67,X68,X69,X70,X71,X72,Y,id
0,9.0,1458.0,17147.0,10.0,0.0,800.0,0.0,,0.0,679.0,...,7.0,581.0,2449.0,93.0,498.0,6.0,0.0,0.0,1,0
1,2.0,250.0,38.0,6.0,,10000.0,0.0,,1.0,12990.0,...,31.0,796.0,7.0,122.0,406.0,5.0,,,1,1
2,2.0,1054.0,178.0,1.0,0.0,1000.0,0.0,,1.0,18710.0,...,230.0,732.0,29.0,78.0,10.0,6.0,0.0,0.0,0,2
3,10.0,1398.0,679.0,7.0,0.0,10000.0,0.0,,1.0,19010.0,...,11.0,36.0,113.0,82.0,35.0,6.0,0.0,0.0,1,3
4,2.0,1095.0,305.0,11.0,0.0,10000.0,0.0,,2.0,16410.0,...,93.0,395.0,50.0,48.0,491.0,5.0,0.0,0.0,0,4


In [4]:
app_test = pd.read_csv('./data/test_new.csv')
final_test = app_test
app_test.head()

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,...,X64,X65,X66,X67,X68,X69,X70,X71,X72,id
0,3.0,853.0,208.0,15.0,0.0,1000.0,0.0,,2.0,0.0,...,4355.0,168.0,238.0,34.0,78.0,4083.0,6.0,0.0,0.0,50000
1,2.0,1637.0,84.0,22.0,10000000000.0,10000.0,0.0,3072000.0,4.0,14330.0,...,1974.0,476.0,766.0,14.0,60.0,1252.0,6.0,34571438.0,8333504000.0,50001
2,8.0,3120.0,1218.0,30.0,0.0,30000.0,0.0,,0.0,25350.0,...,-9054.0,293.0,135.0,203.0,64.0,231.0,6.0,0.0,0.0,50002
3,6.0,1938.0,48.0,14.0,,1000.0,0.0,,0.0,32126.0,...,2038.0,388.0,155.0,9.0,100.0,317.0,5.0,,,50003
4,,,,,,,,,,,...,,,,,,,,,,50004


In [5]:
# choose most important features for feature construction
new_corrs = []
columns = app_train.columns.values.tolist()
# Iterate through the columns 
for col in columns:
    # Calculate correlation with the target
    corr = app_train['Y'].corr(app_train[col])

    # Append the list as a tuple
    new_corrs.append((col, corr))
new_corrs = sorted(new_corrs, key = lambda x: abs(x[1]), reverse = True)
new_corrs[:10]

[('Y', 1.0),
 ('X27', -0.2441447948441628),
 ('X30', -0.1644313669527459),
 ('X42', -0.1515420651706445),
 ('X25', -0.14620849738853522),
 ('X22', 0.14520811328899136),
 ('X34', 0.1412437064881161),
 ('X16', -0.14030661126550095),
 ('X14', -0.1289838932933782),
 ('X69', -0.11988376089532285)]

In [6]:
from sklearn.impute import SimpleImputer as Imputer
from sklearn.preprocessing import PolynomialFeatures  
# do poly feature engineering using combination of 5 features
poly_features_train = app_train[['X27', 'X30', 'X42', 'X25', 'Y']]
poly_features_test = app_test[['X27', 'X30', 'X42', 'X25']]

poly_target = poly_features_train['Y']
poly_features_train = poly_features_train.drop(columns = ['Y'])

# fill NaN in the table
imputer = Imputer(strategy = 'median')
poly_features_train = imputer.fit_transform(poly_features_train)
poly_features_test = imputer.transform(poly_features_test)
                        
poly_transformer = PolynomialFeatures(degree = 8)

# Train the polynomial features
poly_transformer.fit(poly_features_train)

# Transform the features
poly_features_train = poly_transformer.transform(poly_features_train)
poly_features_test = poly_transformer.transform(poly_features_test)
print('Polynomial Features shape: ', poly_features_train.shape)

poly_transformer.get_feature_names(input_features = ['X27', 'X30', 'X42', 'X25'])[:15]

Polynomial Features shape:  (50000, 495)


['1',
 'X27',
 'X30',
 'X42',
 'X25',
 'X27^2',
 'X27 X30',
 'X27 X42',
 'X27 X25',
 'X30^2',
 'X30 X42',
 'X30 X25',
 'X42^2',
 'X42 X25',
 'X25^2']

In [7]:
poly_features_train = pd.DataFrame(poly_features_train, 
                             columns = poly_transformer.get_feature_names(['X27', 'X30', 'X42', 'X25']))
poly_features_test = pd.DataFrame(poly_features_test, 
                                  columns = poly_transformer.get_feature_names(['X27', 'X30', 'X42', 'X25']))

poly_features_train['Y'] = poly_target

# Merge polynomial features into training dataframe
poly_features_train['id'] = app_train['id']
app_train_poly = app_train.merge(poly_features_train, how = 'left')

# Merge polnomial features into testing dataframe
poly_features_test['id'] = app_test['id']
app_test_poly = app_test.merge(poly_features_test, how = 'left')

# Align the dataframes
app_train_poly, app_test_poly = app_train_poly.align(app_test_poly, join = 'inner', axis = 1)

# Print out the new shapes
print('Training data with polynomial features shape: ', app_train_poly.shape)
print('Testing data with polynomial features shape:  ', app_test_poly.shape)


Training data with polynomial features shape:  (50000, 564)
Testing data with polynomial features shape:   (10000, 564)


In [8]:
# choose most important features after feature construction
new_corrs = []
columns = app_train_poly.columns.values.tolist()
# Iterate through the columns 
for col in columns:
    # Calculate correlation with the target
    corr = poly_target.corr(app_train_poly[col])

    # Append the list as a tuple
    new_corrs.append((col, corr))
new_corrs = sorted(new_corrs, key = lambda x: abs(x[1]), reverse = False)
new_corrs[:20]

[('X10', 0.0004681548170859026),
 ('X64', -0.0005019570755814007),
 ('X56', -0.0009662858716957663),
 ('X35', -0.0010872158022036766),
 ('X31', 0.0011067876254034932),
 ('X36', 0.0013384845715644952),
 ('X18', 0.003370143152067094),
 ('X48', -0.0045215086031005),
 ('X72', -0.004945275360387715),
 ('X71', -0.0050328634531733646),
 ('id', -0.005236023054915896),
 ('X11', 0.005342887167242225),
 ('X51', -0.006143902052987893),
 ('X60', -0.006717669802731037),
 ('X23', -0.006749875178364244),
 ('X47', 0.007373489907406655),
 ('X1', -0.007396490969991991),
 ('X5', -0.008345795692111219),
 ('X6', -0.009835351281528345),
 ('X68', 0.010414693741793378)]

In [9]:
for i in range(100):
    app_train_poly = app_train_poly.drop(columns = [new_corrs[i][0]])
    app_test_poly = app_test_poly.drop(columns = [new_corrs[i][0]])

app_train = app_train_poly
app_test = app_test_poly
app_train.shape,app_test.shape

((50000, 464), (10000, 464))

In [10]:
from sklearn.impute import SimpleImputer as Imputer
from sklearn.preprocessing import MinMaxScaler

train = app_train
test  = app_test
scaler = MinMaxScaler(feature_range = (0, 1))
imputer = Imputer(strategy = 'median')
imputer.fit(train)
train = imputer.transform(train)
test = imputer.transform(test)
scaler.fit(train)
train = scaler.transform(train)
test = scaler.transform(test)

In [11]:
train = np.array(train)
test  = np.array(test)
train_labels = np.array(train_labels)
print(train.shape)
print(test.shape)

(50000, 464)
(10000, 464)


## Model Part

In [12]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

def evaluate_AUC(y_pred,y_true):
    print(roc_auc_score(y_true,y_pred))

def get_train_test_dataset(total_train,total_train_labels):
    X_train, X_test, y_train, y_test = train_test_split(total_train, total_train_labels, test_size=0.3, random_state=31)
    return X_train, X_test, y_train, y_test

In [13]:
X_train, X_test, y_train, y_test = get_train_test_dataset(train,train_labels)

In [14]:
import lightgbm as lgb
param = dict()
param['objective'] = 'binary'
param['boosting_type'] = 'gbdt'
param['metric'] = 'auc'
param['verbose'] = 0
param['learning_rate'] = 0.1
param['max_depth'] = -1
param['feature_fraction'] = 0.8
param['bagging_fraction'] = 0.8
param['bagging_freq'] = 1
param['num_leaves'] = 15
param['min_data_in_leaf'] = 64
param['is_unbalance'] = False
param['verbose'] = -1

lgb_train = lgb.Dataset(data=X_train,
                        label=y_train,
                        )
lgb_test = lgb.Dataset(data=X_test,
                       label=y_test,
                       )
model = lgb.train(param,
                  lgb_train,
                  early_stopping_rounds=10,
                  num_boost_round=1000,
                  valid_sets=[lgb_train, lgb_test],
                  verbose_eval=1)
y_pred = model.predict(X_test)
print('lightgbm  train  auc:',roc_auc_score(y_test,y_pred))

[1]	training's auc: 0.676129	valid_1's auc: 0.658628
Training until validation scores don't improve for 10 rounds
[2]	training's auc: 0.69063	valid_1's auc: 0.667332
[3]	training's auc: 0.695916	valid_1's auc: 0.672061
[4]	training's auc: 0.699986	valid_1's auc: 0.674542
[5]	training's auc: 0.703838	valid_1's auc: 0.677911
[6]	training's auc: 0.705333	valid_1's auc: 0.679055
[7]	training's auc: 0.706751	valid_1's auc: 0.679477
[8]	training's auc: 0.708203	valid_1's auc: 0.681024
[9]	training's auc: 0.709686	valid_1's auc: 0.682295
[10]	training's auc: 0.710587	valid_1's auc: 0.683096
[11]	training's auc: 0.712287	valid_1's auc: 0.684251
[12]	training's auc: 0.713648	valid_1's auc: 0.685594
[13]	training's auc: 0.715225	valid_1's auc: 0.686699
[14]	training's auc: 0.717338	valid_1's auc: 0.68852
[15]	training's auc: 0.718598	valid_1's auc: 0.689471
[16]	training's auc: 0.720085	valid_1's auc: 0.690117
[17]	training's auc: 0.721395	valid_1's auc: 0.691375
[18]	training's auc: 0.722539	va

[155]	training's auc: 0.800305	valid_1's auc: 0.72456
[156]	training's auc: 0.800593	valid_1's auc: 0.724649
[157]	training's auc: 0.800893	valid_1's auc: 0.724637
[158]	training's auc: 0.801226	valid_1's auc: 0.724615
[159]	training's auc: 0.8014	valid_1's auc: 0.724526
[160]	training's auc: 0.801612	valid_1's auc: 0.724511
[161]	training's auc: 0.802142	valid_1's auc: 0.724308
[162]	training's auc: 0.802432	valid_1's auc: 0.724306
[163]	training's auc: 0.80267	valid_1's auc: 0.724327
[164]	training's auc: 0.803117	valid_1's auc: 0.724537
[165]	training's auc: 0.8034	valid_1's auc: 0.7246
[166]	training's auc: 0.803743	valid_1's auc: 0.724534
Early stopping, best iteration is:
[156]	training's auc: 0.800593	valid_1's auc: 0.724649
lightgbm  train  auc: 0.7246489160243519


In [15]:
# GBDT model
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier(learning_rate=0.3,n_estimators=40,max_depth=4)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)[:, 1]
print('gbdt  train  auc:',roc_auc_score(y_test,y_pred))

gbdt  train  auc: 0.7148537897162462


In [16]:
# LR model
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(C = 0.1)
model.fit(X_train, y_train)
y_pred = model.predict_proba(X_test)[:, 1]
print('logistic regression  train  auc:',roc_auc_score(y_test,y_pred))

logistic regression  train  auc: 0.6800681508856634


In [17]:
def sigmoid(x):
    return 1.0 / (1 + np.exp(-x))

def func_logisticregression(X, y):
    '''
    Classification algorithm.
    Input:  X: Training sample features, P-by-N
            y: Training sample labels, 1-by-N
    Output: w: learned logistic regression parameters, (P+1)-by-1
    '''
    P, N = X.shape
    w = np.ones((P+1, 1))
    # generate matrix
    iter_num = 400
    alpha = 0.001
    X = np.row_stack((X,np.ones((1, N))))
    for iter in range(iter_num):
        for i in range(N):
            x_i = X[:, i].reshape(P+1, 1)
            sig = sigmoid(np.dot(w.T,x_i))
            error = y[i] - sig
            w +=  alpha * x_i * error
    return w

# w0 + w1*y1 + w2*y2 + w464*y464 = y
w = func_logisticregression(X_train.T,y_train.T)
test_label = np.ones((15000,1))
test_data = np.column_stack((X_test,test_label)).T
y_pred = np.sign(np.dot(w.T, test_data)).reshape(-1,1)

print('logistic regression  train  auc:',roc_auc_score(y_test,y_pred))


logistic regression  train  auc: 0.601178235428526


In [None]:
# xgboost
from xgboost import XGBClassifier
model = GradientBoostingClassifier(learning_rate=0.3,n_estimators=50,max_depth=5)
model.fit(X_train,y_train)
y_pred = model.predict_proba(X_test)[:, 1]
print('xgboost regression  train  auc:',roc_auc_score(y_test,y_pred))

In [None]:
final_predict = final_test.loc[:,final_test.columns =='id']
y_evaluate_label = model.predict_proba(test)[:, 1]
final_predict['Y'] = y_evaluate_label
final_predict.head()

In [None]:
final_predict.to_csv("3180102099_pre.csv",index=False)