In [None]:
# Change the working directory (the path looks like a Colab-mounted Google Drive folder).
%cd /content/drive/My Drive/Colab Notebooks

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.model_selection import train_test_split
!pip install impyute
from impyute.imputation.cs import mice
import seaborn as sns
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from sklearn.feature_selection import mutual_info_classif, GenericUnivariateSelect

In [None]:

# Read the three raw CSVs; the *_og frames are kept as loaded and the copies
# are what the cells below work on.
df_X_og = pd.read_csv('/content/drive/My Drive/Colab Notebooks/X_train.csv')
df_Y_og = pd.read_csv('/content/drive/My Drive/Colab Notebooks/y_train.csv')
df_test_og = pd.read_csv('/content/drive/My Drive/Colab Notebooks/X_test.csv')

df_X, df_Y, df_test = (raw.copy() for raw in (df_X_og, df_Y_og, df_test_og))

# Shapes are reported in the order: train features, train target, test features.
for frame in (df_X, df_Y, df_test):
  print(frame.shape)

## Greeting the data

In [None]:
# Preview the first rows of the training features.
df_X.head()
#df_Y.head()

A quick look at the gap between the 75th-percentile and max values shows that some columns are highly skewed.

In [None]:
# Summary statistics of the training features.
df_X.describe()

Transforming the categorical int column types to object and bool to int

In [None]:
# Train and test features get identical preprocessing; later cells iterate
# over `dfs` and modify these two frames in place.
dfs = [df_X, df_test]

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() only added a stray "None" line.
df_X.info()
df_test.info()

for df in dfs:
  # Columns whose name starts with 'C' are treated as categorical.
  categorical_col = df.columns[df.columns.str.startswith('C')]
  for col in categorical_col:
    if df[col].dtypes == bool:
      # Boolean flags become 0/1 integers.
      df[col] = df[col].astype(int)
    else:
      # Everything else becomes a string label.
      # NOTE: astype(str) also turns NaN into the literal string 'nan', so
      # these columns no longer register as missing in isnull() afterwards.
      df[col] = df[col].astype(str)

## Imputing

The strategy I used for imputing data is MICE (Multivariate Imputation by Chained Equations) via the impyute module. I tried applying it to the whole data frame, but it was very slow. A few columns have more than 80% missing values; I am aware I could replace their missing values with a placeholder string, bin the remaining values and one-hot encode those columns, but here I preferred to simply drop them. After dropping them I tried MICE on the rest of the dataframe, but it was still slow, so I decided to use the median to impute the missing values in columns with less than 13% missing. Then I applied MICE to the dataframe.

In [None]:
# Everything that is "learned" here comes from the training frame only and is
# then applied to both frames:
#   * the fill value for lightly-missing columns is the TRAINING median, so the
#     test set is not imputed from its own statistics;
#   * the list of columns to drop (> 80% missing) is decided on the training
#     frame, so train and test always keep the same columns.
# Both are computed before the loop, i.e. before any value is filled.
train_medians = df_X.median(numeric_only = True)
train_missing_pct = df_X.isnull().mean() * 100
cols_emp = train_missing_pct[train_missing_pct > 80].index

for df in dfs:
  # Percentage of missing values per column, restricted to columns that
  # actually have gaps, largest first.
  all_missing_data = df.isnull().mean() * 100
  all_missing_data = all_missing_data[all_missing_data > 0].sort_values(ascending = False)
  print(all_missing_data)

  # Columns with fewer than 13% missing values in this frame get the median;
  # only columns with a numeric training median can be filled this way.
  cols = all_missing_data[all_missing_data < 13].index.intersection(train_medians.index)
  for col in cols:
    df[col] = df[col].fillna(train_medians[col])

  # errors='ignore' keeps the cell re-runnable after the columns are gone.
  df.drop(columns = cols_emp, errors = 'ignore', inplace = True)

In [None]:
for frame in dfs:
  # MICE is run on the non-object columns only, as a plain NumPy matrix.
  numeric_names = frame.select_dtypes(exclude=['object']).columns
  imputed_values = mice(frame[numeric_names].values, seed = 103)
  # Re-attach the column labels (default RangeIndex rows) and write the
  # imputed values back into the original frame via update().
  frame.update(pd.DataFrame(data = imputed_values, columns = numeric_names))

In [None]:
# Join the target onto the training features by Unique_ID; the inner join
# keeps only IDs present in both frames.
df_train = pd.merge(df_X, df_Y, on = 'Unique_ID', how = 'inner')

I tried a Box-Cox transform to tackle the skewness, but there are a lot of negative values and boxcox1p returns NaN for them. An alternative is to use log(x - min value of the column), but I skipped that for now.

In [None]:
# Disabled experiment, kept as a bare string literal so it never executes:
# a boxcox1p transform (lam = 0.3) of the 'N*' columns whose |skew| exceeds 0.75.
'''dfs = [df_train, df_test]
for df in dfs:
  non_obj_col = df.columns[df.columns.str.startswith('N')]
  skew_feat_val = df[non_obj_col].apply(lambda x : skew(x)).sort_values(ascending = False)
  print(skew_feat_val)
  skew_feat_val = skew_feat_val[abs(skew_feat_val) > 0.75]
  lam = 0.3
  for feature in skew_feat_val.index:
    df[feature] = boxcox1p(df[feature], lam)'''

# Feature Selection

In [None]:
# Both frames still contain string-typed categorical columns. pandas >= 2.0
# raises on those in DataFrame.corr() (older versions skipped them silently),
# so the numeric/bool columns are selected explicitly before correlating.
for frame, label in ((df_train, 'train'), (df_test, 'test')):
  plt.figure(figsize = (20, 20))
  numeric_part = frame.select_dtypes(include = ['number', 'bool'])
  sns.heatmap(numeric_part.corr(), annot = True, square = True)
  # Title each figure so the two heatmaps can be told apart.
  plt.title('Feature correlation (' + label + ')')

Removing the features with correlation more than 0.85

In [None]:
dfs = [df_train, df_test]

# Absolute correlation above which the later column of a pair is discarded.
CORR_THRESHOLD = 0.85
# The row identifier and the target are not features: they must neither be
# dropped nor cause a feature to be dropped.
NON_FEATURE_COLS = ['Unique_ID', 'Dependent_Variable']

# The set of redundant columns is decided on the TRAINING features only and
# then removed from both frames, so train and test keep identical columns.
# Only numeric/bool columns are correlated (string columns make corr() raise
# on pandas >= 2.0).
train_features = df_train.drop(columns = NON_FEATURE_COLS, errors = 'ignore')
corr_matrix = train_features.select_dtypes(include = ['number', 'bool']).corr().abs()

# Look strictly below the diagonal: column i is dropped when it is too
# correlated with any earlier column j < i (NaN correlations compare False).
below_diagonal = np.tril(np.ones(corr_matrix.shape, dtype = bool), k = -1)
exceeds = (corr_matrix > CORR_THRESHOLD) & below_diagonal
correlated_features = set(corr_matrix.index[exceeds.any(axis = 1).values])
print(sorted(correlated_features))

for df in dfs:
  # errors='ignore' keeps the cell re-runnable once the columns are gone.
  df.drop(columns = list(correlated_features), errors = 'ignore', inplace = True)

Getting the mutual information of the columns with dependent variable 

In [None]:
# Features are every column between the first one and the last one; the last
# column is used as the target.
feature_frame = df_train.iloc[:, 1:-1]
target = df_train.iloc[:, -1]

# mutual_info_classif is randomised; a fixed random_state makes the scores
# (and this figure) reproducible between runs.
mutual_info = mutual_info_classif(feature_frame, target, random_state = 0)

fig, ax = plt.subplots(1, figsize=(26, 1))
# Passing the labels to heatmap() guarantees one label per cell; assigning
# them afterwards with set_xticklabels() can mismatch the tick count when
# seaborn decides to thin the ticks.
sns.heatmap(mutual_info[np.newaxis, :], cmap='Blues', cbar=False, linewidths=1, annot=True,
            xticklabels=list(feature_frame.columns), yticklabels=False, ax=ax)
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right', fontsize=12)
fig.suptitle('Variable Importance (mutual_info_classif)', fontsize=18, y=1.2)
fig.subplots_adjust(wspace=0.2)

Keeping only the top 80 percentile of the columns using mutual information

In [None]:
def _seeded_mutual_info(X, y):
  """Mutual information with a fixed seed so the selection is reproducible."""
  return mutual_info_classif(X, y, random_state = 0)

# Features are every column between the first one and the last one; the last
# column is used as the target.
selection_features = df_train.iloc[:, 1:-1]
selection_target = df_train.iloc[:, -1]

# Keep the top 80 percent of features ranked by mutual information. Without a
# seed the score function is random and the retained set can change per run.
trans = GenericUnivariateSelect(score_func=_seeded_mutual_info, mode='percentile', param=80)
train_trans = trans.fit_transform(selection_features, selection_target)
columns_retained_Select = selection_features.columns[trans.get_support()].values
print(columns_retained_Select)

# Restrict both frames to the retained columns (this also leaves out the
# first and last columns of df_train).
df_train_trans = df_train[columns_retained_Select]
df_test_noID = df_test[df_train_trans.columns]

In [None]:
# Rows x columns after feature selection.
df_train_trans.shape

One hot encoding the categorical columns

In [None]:
# The categorical columns are the object-typed ones of the training frame; the
# same list is used for the test frame so both are encoded identically.
cat_cols = list(df_train_trans.select_dtypes(include = 'object').columns)

# The encoder's default sparse output is densified with toarray(). The old
# `sparse=False` keyword was renamed to `sparse_output` in scikit-learn 1.2
# and later removed, so not passing it works on every version.
encoder = OneHotEncoder(handle_unknown = 'ignore')
encoder.fit(df_train_trans[cat_cols])
X_trans = encoder.transform(df_train_trans[cat_cols]).toarray()
test_trans = encoder.transform(df_test_noID[cat_cols]).toarray()

# Give the indicator columns string names ("<column>_<category>"). Leaving the
# default integer names produces a frame with mixed int/str column names,
# which newer scikit-learn estimators reject.
onehot_names = [str(col) + '_' + str(cat)
                for col, cats in zip(cat_cols, encoder.categories_)
                for cat in cats]

# Indicator columns first, then the remaining (non-object) columns; the
# explicit index keeps the rows aligned in the concat.
X = pd.concat([pd.DataFrame(X_trans, columns = onehot_names, index = df_train_trans.index),
               df_train_trans.drop(columns = cat_cols)], axis = 1)
test = pd.concat([pd.DataFrame(test_trans, columns = onehot_names, index = df_test_noID.index),
                  df_test_noID.drop(columns = cat_cols)], axis = 1)

In [None]:
# Rows x columns after one-hot encoding.
X.shape

Since there are outliers in the data it is scaled with Robust Scaler

In [None]:
# Fit the robust scaler on the training matrix and reuse the fitted scaler
# for the test matrix.
# NOTE(review): the scaler is fitted on all of X, i.e. before the train/CV
# split in the next cell.
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)
test_scaled = scaler.transform(test)

In [None]:
# Split the raw matrix, the scaled matrix and the target in ONE call so all
# three share the same row partition. Two separate unseeded calls shuffled
# differently, and the second call overwrote train_y / cv_y, leaving
# train_X_scaled paired with labels that belonged to other rows.
SPLIT_SEED = 0  # fixed so the hold-out set is the same on every run
(train_X, cv_X,
 train_X_scaled, cv_X_scaled,
 train_y, cv_y) = train_test_split(X, X_scaled, df_train['Dependent_Variable'],
                                   test_size = 0.2, random_state = SPLIT_SEED)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, ShuffleSplit, RandomizedSearchCV, cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score, auc
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from scipy.stats import randint, uniform

Logistic regression gives out a very bad score

In [None]:
# Single-candidate grid (C = 100) scored by ROC AUC over 5 shuffled splits,
# fitted on the scaled training matrix.
param_grid = [{'C' : [100]}]
cv_split = ShuffleSplit(n_splits = 5, test_size = 0.2, train_size = 0.75, random_state = 0)
logistic_model = LogisticRegression(max_iter = 10000)
best_search = GridSearchCV(
    estimator = logistic_model,
    param_grid = param_grid,
    cv = cv_split,
    scoring = 'roc_auc',
)
best_search.fit(train_X_scaled, train_y)
print(best_search.best_score_, best_search.best_params_, sep = '\n')

Since there are too many rows, SVC takes impractically long and can't be used. There is no point in using LinearSVC either, since a linear function won't fit this data.

In [None]:
# Disabled experiment, kept as a bare string literal so it never executes:
# an SVC grid search over C in {0.1, 1, 10}, scored by ROC AUC.
'''param_grid = [{'C' : [0.1, 1, 10]}]
cv_split = ShuffleSplit(n_splits = 5, test_size = 0.2, train_size = 0.75, random_state = 0)
best_search = GridSearchCV(estimator = SVC(), param_grid = param_grid, cv = cv_split, scoring = 'roc_auc')
best_search.fit(train_X_scaled, train_y)
print(best_search.best_score_)
print(best_search.best_params_)'''

Defining scoring function

In [None]:
def scoring(score, y_true=None):
    """Return the area under the ROC curve for predicted class probabilities.

    Parameters
    ----------
    score : 2-D array
        Output of ``predict_proba``; column 1 is used as the score of the
        positive class.
    y_true : array-like, optional
        True labels for the rows of ``score``. Defaults to the notebook-level
        hold-out labels ``cv_y``, which is what every existing call relies on.

    Returns
    -------
    float
        Area under the ROC curve.
    """
    if y_true is None:
        # Backward-compatible default: score against the hold-out labels.
        y_true = cv_y
    fpr, tpr, _ = roc_curve(y_true, score[:, 1])
    return auc(fpr, tpr)

Using XGB with randomized search

In [None]:
# Candidate values sampled by the randomized search for XGBClassifier.
random_grid_xgb = {
        'silent': [False],
        'max_depth': [6, 10, 15, 20],
        # The last entry was written "0,3", which put the two values 0 and 3
        # into the list; the intended learning rate is 0.3.
        'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
        'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bytree': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'colsample_bylevel': [0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
        'min_child_weight': [0.5, 1.0, 3.0, 5.0, 7.0, 10.0],
        'gamma': [0, 0.25, 0.5, 1.0],
        'reg_lambda': [0.1, 1.0, 5.0, 10.0, 50.0, 100.0],
        'n_estimators': [100, 300]}

In [None]:
# 60 random draws from random_grid_xgb, each scored by ROC AUC over 5 shuffled
# splits of the unscaled training matrix.
cv_split = ShuffleSplit(n_splits = 5, test_size = 0.2, train_size = 0.75, random_state = 0)
xgb_random = RandomizedSearchCV(
    estimator = XGBClassifier(random_state = 38),
    param_distributions = random_grid_xgb,
    scoring = 'roc_auc',
    n_iter = 60,
    cv = cv_split,
    verbose = 3,
    random_state = 38,
    n_jobs = -1,
)
xgb_random.fit(train_X, train_y)
print(xgb_random.best_score_, xgb_random.best_params_, sep = '\n')

In [None]:
# Hold-out ROC AUC of the refitted best XGBoost model.
score_xgb_random = xgb_random.predict_proba(cv_X)
xgb_cv_auc = scoring(score_xgb_random)
print(xgb_cv_auc)

Randomforestclassifier with randomized search

In [None]:
# Candidate values sampled by the randomized search for RandomForestClassifier.

# 10 evenly spaced forest sizes from 200 to 2000 trees.
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# 'auto' was removed from newer scikit-learn releases and, for classifiers,
# meant exactly 'sqrt', so the old ['auto', 'sqrt'] list was one option twice.
max_features = ['sqrt']
# Depth limits 10, 20, ..., 110 plus None (no limit).
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
# 100 random draws from random_grid over 5 shuffled splits of the unscaled
# training matrix.
cv_split = ShuffleSplit(n_splits = 5, test_size = 0.2, train_size = 0.75, random_state = 0)
# scoring='roc_auc' matches the XGBoost and LightGBM searches and the hold-out
# metric used to compare models; without it the search optimised the
# classifier's default score (accuracy), so best_score_ was not comparable.
rf_random = RandomizedSearchCV(estimator = RandomForestClassifier(random_state = 42), param_distributions = random_grid,
                               scoring = 'roc_auc', n_iter = 100, cv = cv_split, verbose=2, random_state=42, n_jobs = -1)
rf_random.fit(train_X, train_y)
print(rf_random.best_score_)
print(rf_random.best_params_)

In [None]:
# Hold-out ROC AUC of the refitted best random forest.
score_rf_random = rf_random.predict_proba(cv_X)
rf_cv_auc = scoring(score_rf_random)
print(rf_cv_auc)

LGBM classifier with randomized search

In [None]:
# Distributions / candidate values sampled by the randomized search for
# LGBMClassifier. scipy's uniform(loc, scale) samples from [loc, loc + scale].
random_grid_lgbm ={'num_leaves': randint(6, 50),
                   'learning_rate' : [0.001, 0.01, 0.1],
             'min_child_samples': randint(20, 500),
             'min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             # Was uniform(0.4, 0.8), i.e. [0.4, 1.2]; a row-sampling fraction
             # cannot exceed 1, so the scale is 0.6 to cover [0.4, 1.0].
             'subsample': uniform(0.4, 0.6),
             'colsample_bytree': uniform(0.4, 0.6),
             'reg_alpha': [0, 1e-1, 1, 2, 5, 7, 10, 50, 100],
             'reg_lambda': [0, 1e-1, 1, 5, 10, 20, 50, 100]}

In [None]:
# 60 random draws from random_grid_lgbm, each scored by ROC AUC over 5
# shuffled splits of the unscaled training matrix.
cv_split = ShuffleSplit(n_splits = 5, test_size = 0.2, train_size = 0.75, random_state = 0)
# NOTE(review): bagging_fraction looks like LightGBM's own name for the
# `subsample` parameter that the search also samples; confirm which of the
# two takes effect when both are set.
lgbm_base = LGBMClassifier(
    max_depth = -1,
    random_state = 50,
    silent = False,
    n_estimators = 5000,
    bagging_fraction = 1,
)
lgbm_random = RandomizedSearchCV(
    estimator = lgbm_base,
    param_distributions = random_grid_lgbm,
    scoring = 'roc_auc',
    n_iter = 60,
    cv = cv_split,
    verbose = 2,
    random_state = 50,
    n_jobs = -1,
)
lgbm_random.fit(train_X, train_y)
print(lgbm_random.best_score_, lgbm_random.best_params_, sep = '\n')

In [None]:
# Hold-out ROC AUC of the refitted best LightGBM model.
score_lgbm = lgbm_random.predict_proba(cv_X)
lgbm_cv_auc = scoring(score_lgbm)
print(lgbm_cv_auc)

Neural Network with keras

In [None]:
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
import tensorflow as tf
from tensorflow.keras.metrics import AUC

In [None]:
# Number of input features; neural_net() reads the same value for input_dim.
train_X.shape[1]

In [None]:
def neural_net():
  """Build and compile the feed-forward binary classifier.

  Two hidden ReLU layers (180 and 100 units), each followed by batch
  normalisation and 30% dropout, feed a single sigmoid output. The input
  width is read from the notebook-level ``train_X``. The model is compiled
  with binary cross-entropy, the Adam optimizer and an AUC metric.
  """
  layer_stack = [
      Dense(180, input_dim=train_X.shape[1], activation='relu'),
      BatchNormalization(),
      Dropout(0.3),
      Dense(100, activation='relu'),
      BatchNormalization(),
      Dropout(0.3),
      Dense(1, activation='sigmoid'),
  ]
  model = Sequential(layer_stack)
  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=[AUC()])
  return model
# Wrap the network builder in the scikit-learn style classifier and train it
# on the unscaled training matrix.
estimator = KerasClassifier(
    build_fn = neural_net,
    epochs = 10000,
    batch_size = 30000,
    verbose = 1,
)
history = estimator.fit(train_X, train_y)

In [None]:
# Hold-out ROC AUC of the trained network.
score_neural_net = estimator.predict_proba(cv_X)
nn_cv_auc = scoring(score_neural_net)
print(nn_cv_auc)

The best CV roc_auc score was from LGBM

In [None]:
# Predict class probabilities for the test matrix with the tuned LightGBM
# model and write the IDs plus the second probability column to the
# submission CSV.
out_score = lgbm_random.predict_proba(test)
df = pd.DataFrame({
    'Unique_ID': df_test['Unique_ID'],
    'Class_1_Probability': out_score[:,1],
})
df.to_csv('submission_file.csv', index = False)