1. Loading packages

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
import matplotlib.pyplot as plt
import missingno

from sklearn import model_selection, tree, preprocessing, metrics, linear_model
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import QuantileTransformer
from sklearn.pipeline import make_pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC

2. Loading dataset

In [None]:
%%time
# Read the train and test CSV files from the relative ../input directory.
train = pd.read_csv("../input/train.csv")
test = pd.read_csv("../input/test.csv")
# Last expression of the cell: shows the first rows of the train frame.
train.head()

In [None]:
train.head()

In [None]:
train.tail()

3. Data Exploration and Preparation

In [None]:
# Report the dimensions of both frames; DataFrame.shape is (rows, columns).
n_train_rows, n_train_cols = train.shape
n_test_rows, n_test_cols = test.shape
print("train data consist of {} rows and {} columns".format(n_train_rows, n_train_cols))
print("test data consist of {} rows and {} columns".format(n_test_rows, n_test_cols))

3.1 Univariate analysis

3.1.1 Descriptive statistics

In [None]:
train.describe()

* The plotted distribution of means for the **train** set

In [None]:
# Feature columns: every train column from position 2 onward
# (assumes the first two columns are a row id and `target` — TODO confirm).
variables = train.columns[2:]

In [None]:
# One mean per feature column of the train set.
train_feature_means = train[variables].mean(axis=0)

plt.figure(figsize=(18, 7))
sns.set(style="darkgrid")
sns.distplot(train_feature_means, color="magenta", kde=True, bins=80, label='train')
plt.legend()
plt.show()


The plotted distribution of minimum and maximum values

In [None]:
# Per-feature extremes of the train set.
train_feature_min = train[variables].min(axis=0)
train_feature_max = train[variables].max(axis=0)

plt.figure(figsize=(18, 7))
sns.set(style="darkgrid")

min_axes = sns.distplot(train_feature_min, color="r", kde=True, bins=80, label='minimum')
min_axes.set_title("Plotted distribution of maximum and minimum values on train set")

sns.distplot(train_feature_max, color="g", kde=True, bins=80, label='maximum')
plt.legend()
plt.show()

In [None]:
test.describe()

* The plotted distribution of means for the **test** set

In [None]:
# Distribution of the per-feature means of the TEST set.
plt.figure(figsize=(18,7))
sns.set(style="darkgrid")
sns.distplot(test[variables].mean(axis=0),
             color="purple",
             kde=True,
             bins=80,
             # The plotted data comes from `test`, so the legend must say so
             # (it previously read 'train').
             label='test')
plt.legend()
plt.show()

In [None]:
# Per-feature extremes of the test set.
test_feature_min = test[variables].min(axis=0)
test_feature_max = test[variables].max(axis=0)

plt.figure(figsize=(18, 7))
sns.set(style="darkgrid")

min_axes = sns.distplot(test_feature_min, color="orange", kde=True, bins=80, label='minimum')
min_axes.set_title("Plotted distribution of maximum and minimum values on the test set")

sns.distplot(test_feature_max, color="violet", kde=True, bins=80, label='maximum')
plt.legend()
plt.show()

Comparison of the train and test datasets

* means

In [None]:
# Per-feature means of each data set, overlaid on one figure.
train_means = train[variables].mean(axis=0)
test_means = test[variables].mean(axis=0)

plt.figure(figsize=(18, 7))
sns.set(style="darkgrid")

train_axes = sns.distplot(train_means, color="magenta", kde=True, bins=80, label='train')
train_axes.set_title("Plotted distribution of mean values for test and train sets")

sns.distplot(test_means, color="purple", kde=True, bins=80, label='test')
plt.legend()
plt.show()

* Minimum values

In [None]:
# Per-feature minimums of each data set, overlaid on one figure.
train_mins = train[variables].min(axis=0)
test_mins = test[variables].min(axis=0)

plt.figure(figsize=(18, 7))
sns.set(style="darkgrid")

train_axes = sns.distplot(train_mins, color="r", kde=True, bins=80, label='train')
train_axes.set_title("Plotted distribution of minimum values for test and train sets")

sns.distplot(test_mins, color="orange", kde=True, bins=80, label='test')
plt.legend()
plt.show()

* Maximum values

In [None]:
# Per-feature maximums of each data set, overlaid on one figure.
train_maxs = train[variables].max(axis=0)
test_maxs = test[variables].max(axis=0)

plt.figure(figsize=(18, 7))
sns.set(style="darkgrid")

train_axes = sns.distplot(train_maxs, color="g", kde=True, bins=80, label='train')
train_axes.set_title("Plotted distribution of maximum values for test and train sets")

sns.distplot(test_maxs, color="violet", kde=True, bins=80, label='test')
plt.legend()
plt.show()

* Standard deviations

In [None]:
# Per-feature standard deviations of each data set, overlaid on one figure.
train_stds = train[variables].std(axis=0)
test_stds = test[variables].std(axis=0)

plt.figure(figsize=(18, 7))
sns.set(style="darkgrid")

train_axes = sns.distplot(train_stds, color="#808000", kde=True, bins=80, label='train')
train_axes.set_title("Plotted distribution of standart deviations for test and train sets")

sns.distplot(test_stds, color="#CD5C5C", kde=True, bins=80, label='test')
plt.legend()
plt.show()

* Outliers

In [None]:
# Box plot of the first feature column (column position 2) to eyeball outliers.
# Passed as `x=` explicitly: older seaborn treats the first positional argument
# as `x`, newer releases treat it as `data`, so the keyword keeps the plot the same.
sns.boxplot(x=train.iloc[:, 2])

In [None]:
# Figure parameters
# Default size (in inches) for figures created after this cell.
plt.rcParams['figure.figsize'] = (8, 6)
# Title styling options; NOTE(review): not referenced by any cell visible here.
title_config = {'fontsize': 20, 'y': 1.05}

In [None]:
# Class balance of the label column. `x=` is given explicitly because the
# meaning of the first positional argument changed between seaborn versions
# (`x` in older releases, `data` from 0.12 on).
sns.countplot(x=train['target'])


The classes are imbalanced: target = 1 occurs less than one-seventh as often as target = 0.

3.2 Bivariate analysis

In [None]:
%%time
# Pairwise correlations between the numeric columns only. Recent pandas no
# longer drops non-numeric columns silently in DataFrame.corr() and raises
# instead, so they are filtered out explicitly (a no-op if every column is numeric).
correlation = train.select_dtypes(include="number").corr()
sns.heatmap(correlation, cmap="YlGnBu")
plt.show()

3.3 Check for missing values 

In [None]:
%%time
# Nullity matrix of the train frame.
missingno.matrix(train, figsize = (18,7))
plt.show()

# Same check on the test frame, so that statements about missing values
# cover both data sets rather than only `train`.
missingno.matrix(test, figsize = (18,7))
plt.show()

Conclusion: It looks like both datasets have no missing values!

4. Model selection

* Variables declaration

In [None]:
# Training inputs: the feature columns selected in `variables` and the `target` label column.
X_train = train[variables]
y_train = train["target"]

# The same feature columns taken from the test frame.
X_test = test[variables]

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import cross_val_score

def fit_ml_algo(algo, X_train, y_train, X_test, cv):
    """Fit ``algo`` behind a quantile-transform pipeline and score it.

    Parameters
    ----------
    algo : estimator
        Classifier used as the last pipeline step. It must expose
        ``predict_proba``: the ROC curve is built from the second probability column.
    X_train, y_train : array-like
        Training features and labels.
    X_test : array-like
        Accepted so existing call sites keep working; not used here.
    cv : int or cross-validation splitter
        Forwarded to ``cross_val_score``.

    Returns
    -------
    acc : float
        ``pipeline.score`` on the training data, in percent, rounded to 2 decimals.
    auc_score : float
        Area under the ROC curve computed on the training data.
    auc_score_cv : float
        Mean of the cross-validated ``roc_auc`` scores.
    fpr, tpr : ndarray
        ROC curve coordinates on the training data.
    """
    pipeline = make_pipeline(QuantileTransformer(output_distribution='normal'), algo)
    model = pipeline.fit(X_train, y_train)

    # In-sample metrics: computed on the same rows the pipeline was fitted on.
    train_proba = model.predict_proba(X_train)[:, 1]
    fpr, tpr, _ = roc_curve(y_train, train_proba)
    auc_score = auc(fpr, tpr)
    acc = round(model.score(X_train, y_train) * 100, 2)

    # Use the caller's `cv` (it used to be ignored in favour of a hard-coded 10).
    auc_score_cv = cross_val_score(pipeline, X_train, y_train, scoring='roc_auc', cv=cv).mean()

    return acc, auc_score, auc_score_cv, fpr, tpr

### Logistic Regression

In [None]:
%%time
import warnings

# Silence library warnings before fitting.
warnings.filterwarnings("ignore")

# Baseline logistic regression with an L2 penalty.
acc_log, auc_log, auc_log_cv, fpr_log, tpr_log = fit_ml_algo(
    LogisticRegression(penalty="l2"), X_train, y_train, X_test, cv=10
)

print("Accuracy: {}".format(acc_log))
print("AUC score: {}".format(auc_log))
print("AUC score CV 10-Fold: {}".format(auc_log_cv))

### Gaussian Naive Bayes

In [None]:
%%time
# Baseline Gaussian Naive Bayes.
acc_gaussian, auc_gaussian, auc_gaussian_cv, fpr_gaussian, tpr_gaussian = fit_ml_algo(
    GaussianNB(), X_train, y_train, X_test, cv=10
)

print("Accuracy: {}".format(acc_gaussian))
print("AUC score: {}".format(auc_gaussian))
print("AUC CV 10-Fold: {}".format(auc_gaussian_cv))

### KNN Classifier

In [None]:
%%time
import warnings
from sklearn.neighbors import KNeighborsClassifier

# Silence library warnings before fitting.
warnings.filterwarnings("ignore")

# Baseline k-nearest-neighbours classifier with default settings.
acc_knn, auc_knn, auc_knn_cv, fpr_knn, tpr_knn = fit_ml_algo(
    KNeighborsClassifier(), X_train, y_train, X_test, cv=10
)

print("Accuracy: {}".format(acc_knn))
print("AUC score: {}".format(auc_knn))
print("AUC score CV 10-Fold: {}".format(auc_knn_cv))

### Decision Tree Classifier

In [None]:
%%time
from sklearn.tree import DecisionTreeClassifier

# Baseline decision tree with default settings.
acc_dt, auc_dt, auc_dt_cv, fpr_dt, tpr_dt = fit_ml_algo(
    DecisionTreeClassifier(), X_train, y_train, X_test, cv=10
)

print("Accuracy: {}".format(acc_dt))
print("AUC score: {}".format(auc_dt))
print("AUC score CV 10-Fold: {}".format(auc_dt_cv))

In [None]:
# (false-positive rates, true-positive rates, legend label) for each plotted model.
roc_curves = [
    (fpr_log, tpr_log, "Logistic regression"),
    (fpr_gaussian, tpr_gaussian, "Gaussian NB"),
    (fpr_dt, tpr_dt, "Tree Classifier"),
]

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic Plot for Selected Models')
for curve_fpr, curve_tpr, curve_label in roc_curves:
    plt.plot(curve_fpr, curve_tpr, label=curve_label)
plt.legend()
plt.show()

Feature creation

1) data augmentation

In [None]:
def augment(x, y, t=5, seed=None):
    """Append column-shuffled copies of the positive and negative rows to ``x``.

    Each synthetic copy takes the rows of one class and permutes every column
    independently, so each new row mixes feature values from different
    original rows of that class.

    Parameters
    ----------
    x : ndarray, 2-D
        Feature matrix (rows are samples).
    y : ndarray, 1-D
        Labels; rows with ``y > 0`` count as positive, rows with ``y == 0``
        as negative.
    t : int, default 5
        Number of shuffled copies of the positive rows. The negative rows are
        copied ``t // 2`` times. ``t < 2`` (no negative copies) and ``t == 0``
        (no copies at all) are supported.
    seed : int or None, default None
        ``None`` draws from the global ``np.random`` state, as before.
        An integer makes the shuffles reproducible without touching the global state.

    Returns
    -------
    (x_out, y_out)
        ``x`` followed by the positive copies and then the negative copies,
        with labels ``1.0`` and ``0.0`` appended to ``y`` for the new rows.
    """
    # The module-level generator is the default so np.random.seed() still applies.
    rng = np.random if seed is None else np.random.RandomState(seed)

    def _shuffled_copies(mask, n_copies):
        """Return ``n_copies`` arrays of ``x[mask]`` with every column permuted independently."""
        copies = []
        for _ in range(n_copies):
            x1 = x[mask].copy()
            ids = np.arange(x1.shape[0])
            for c in range(x1.shape[1]):
                rng.shuffle(ids)
                # Index row and column in one step: same values as x1[ids][:, c]
                # without building the whole permuted matrix for a single column.
                x1[:, c] = x1[ids, c]
            copies.append(x1)
        return copies

    # Positives first, then negatives: keeps the random draws in the original order.
    xs = _shuffled_copies(y > 0, t)
    xn = _shuffled_copies(y == 0, t // 2)

    n_pos_rows = sum(part.shape[0] for part in xs)
    n_neg_rows = sum(part.shape[0] for part in xn)

    # Stacking one flat list avoids np.vstack([]) failing when a class gets no copies.
    x_out = np.vstack([x] + xs + xn)
    y_out = np.concatenate([y, np.ones(n_pos_rows), np.zeros(n_neg_rows)])
    return x_out, y_out

In [None]:
%%time
# Build the augmented training set from the raw NumPy values of the features and labels.
X_tr, y_tr = augment(X_train.values, y_train.values)

In [None]:
# Wrap the augmented arrays in DataFrames (columns get default integer labels).
X_tr = pd.DataFrame(X_tr)
# NOTE: y_tr becomes a one-column DataFrame here, not a 1-D vector.
y_tr = pd.DataFrame(y_tr)
# Show the last rows, i.e. rows appended by the augmentation.
X_tr.tail()

* Gaussian Naive Bayes with data augmentation

In [None]:
%%time
# fit_ml_algo returns five values (accuracy, AUC, CV AUC, fpr, tpr); unpacking
# four raised a ValueError. The *_aug names keep the baseline Gaussian NB
# results available under their original names.
# np.ravel gives a 1-D label vector whether y_tr is an array or a one-column frame.
(acc_gaussian_aug, auc_gaussian_aug, auc_gaussian_aug_cv,
 fpr_gaussian_aug, tpr_gaussian_aug) = fit_ml_algo(GaussianNB(),
                                                   X_tr,
                                                   np.ravel(y_tr),
                                                   X_test,
                                                   10)

print("Accuracy: {}".format(acc_gaussian_aug))
print("AUC score: {}".format(auc_gaussian_aug))
print("AUC CV 10-Fold: {}".format(auc_gaussian_aug_cv))

* Logistic Regression with data augmentation

In [None]:
%%time
# Fit on the augmented data (X_tr / y_tr). The previously referenced
# X_train_new / y_train_new are not defined in any cell visible here.
# fit_ml_algo returns five values (accuracy, AUC, CV AUC, fpr, tpr); the *_aug
# names keep the baseline logistic-regression results intact.
(acc_log_aug, auc_log_aug, auc_log_aug_cv,
 fpr_log_aug, tpr_log_aug) = fit_ml_algo(LogisticRegression(penalty="l2"),
                                         X_tr,
                                         np.ravel(y_tr),  # 1-D labels
                                         X_test,
                                         cv=10)
print("Accuracy: {}".format(acc_log_aug))
print("AUC score: {}".format(auc_log_aug))
print("AUC score CV 10-Fold: {}".format(auc_log_aug_cv))

2) Feature creation

In [None]:
%%time
# Original feature columns (positions 2..201 of the train frame); `features`
# is kept as a second name for the same array.
idx = features = train.columns.values[2:202]

# (new column name, DataFrame method computing it row-wise), in insertion order.
row_stats = [
    ('sum', 'sum'),
    ('min', 'min'),
    ('max', 'max'),
    ('mean', 'mean'),
    ('std', 'std'),
    ('skew', 'skew'),
    ('kurt', 'kurtosis'),
    ('med', 'median'),
]

for df in [test, train]:
    # Select the base columns once; the new columns never feed into each other.
    base = df[idx]
    for stat_column, stat_method in row_stats:
        df[stat_column] = getattr(base, stat_method)(axis=1)

In [None]:
# declare new dataset
# `variables` now also covers the row-statistic columns added to `train`.
variables = train.columns[2:]
X_train_nf = train[variables]
# expand dataset
# `feature_creation` is not defined in any cell visible here; `augment` is the
# helper with the matching (features, labels) -> (features, labels) signature.
# NOTE(review): confirm that augmentation is what was intended at this step.
X_tr, y_tr = augment(X_train_nf.values, y_train.values)
X_tr = pd.DataFrame(X_tr)
y_tr = pd.DataFrame(y_tr)
X_tr.tail()

In [None]:
%%time
# fit_ml_algo returns five values (accuracy, AUC, CV AUC, fpr, tpr); unpacking
# four raised a ValueError. The *_nf names ("new features") keep the baseline
# Gaussian NB results available under their original names.
(acc_gaussian_nf, auc_gaussian_nf, auc_gaussian_nf_cv,
 fpr_gaussian_nf, tpr_gaussian_nf) = fit_ml_algo(GaussianNB(),
                                                 X_tr,
                                                 np.ravel(y_tr),  # 1-D labels
                                                 X_test,
                                                 10)

print("Accuracy: {}".format(acc_gaussian_nf))
print("AUC score: {}".format(auc_gaussian_nf))
print("AUC CV 10-Fold: {}".format(auc_gaussian_nf_cv))

Accuracy scores

In [None]:
# One row per model: training accuracy and mean cross-validated AUC.
model_names = ['Decision Tree', 'Logistic Regression', 'Naive Bayes']
train_accuracies = [acc_dt, acc_log, acc_gaussian]
cv_aucs = [auc_dt_cv, auc_log_cv, auc_gaussian_cv]

models = pd.DataFrame({
    'Model': model_names,
    'Score': train_accuracies,
    'AUC_CV': cv_aucs,
})
print("---Reuglar Accuracy Scores---")
# Best training accuracy first; displayed as the cell output.
models.sort_values(by='Score', ascending=False)

* Light GBM

In [None]:
# Parameter dictionary handed to lgb.train in the LightGBM cross-validation cell.
# Keys and values are passed through as written; note that
# 'boost_from_average' is given as the string 'false', not a Python bool.
param = {
    'bagging_freq': 5,
    'bagging_fraction': 0.335,
    'boost_from_average':'false',
    'boost': 'gbdt',
    'feature_fraction': 0.041,
    'learning_rate': 0.0083,
    'max_depth': -1,
    'metric':'auc',
    'min_data_in_leaf': 80,
    'min_sum_hessian_in_leaf': 10.0,
    'num_leaves': 13,
    'num_threads': 8,
    'tree_learner': 'serial',
    'objective': 'binary', 
    'verbosity': -1
}

In [None]:
num_folds = 12

# NOTE(review): `lgb` (presumably `import lightgbm as lgb`) is not imported in
# any cell visible here; it has to be in scope before this cell runs.

# StratifiedKFold and roc_auc_score are reached through the already imported
# `model_selection` / `metrics` modules. `random_state` is dropped because it
# has no effect without shuffling and current scikit-learn rejects the
# combination; the folds stay unshuffled and deterministic.
folds = model_selection.StratifiedKFold(n_splits=num_folds, shuffle=False)
oof = np.zeros(len(train))
getVal = np.zeros(len(train))
# Test-set predictions need one slot per TEST row (was sized by the train labels).
predictions = np.zeros(len(test))

print('Light GBM Model')
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train.values, y_train.values)):
    print("Fold idx:{}".format(fold_ + 1))
    val_features = train.iloc[val_idx][features]
    trn_data = lgb.Dataset(train.iloc[trn_idx][features], label=y_train.iloc[trn_idx])
    val_data = lgb.Dataset(val_features, label=y_train.iloc[val_idx])

    clf = lgb.train(param, trn_data, 1000000, valid_sets = [trn_data, val_data], verbose_eval=5000, early_stopping_rounds = 4000)

    # Predict the held-out fold once and reuse the result for both accumulators.
    val_pred = clf.predict(val_features, num_iteration=clf.best_iteration)
    oof[val_idx] = val_pred
    getVal[val_idx] += val_pred / folds.n_splits
    # Average the test predictions over the folds.
    predictions += clf.predict(test[features], num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(metrics.roc_auc_score(y_train, oof)))

Feature creation

* Light GBM