### Data Importing

In [None]:
import sys
import gc
import os
import numpy as np
import pandas as pd
import lightgbm as lgb
from scipy.stats import rankdata
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from multiprocessing import cpu_count
from tqdm import tqdm
import seaborn as sns

In [None]:
# Mount Google Drive at /content/drive. A later cell of this notebook adds
# "/content/drive/MyDrive/Colab Notebooks" to sys.path, so this mount has to
# succeed before that cell runs.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Upload the Kaggle authentication file (kaggle.json) via google.colab's
# files.upload(). The result is kept in `uploaded`; the next cell iterates
# its keys (file names) and takes len() of each value.
from google.colab import files

uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [None]:
#Verifying file upload
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))
  
#Moving kaggle.json into directory from where data is read
!mkdir -p ~/.kaggle/ && mv kaggle.json ~/.kaggle/ && chmod 600 ~/.kaggle/kaggle.json

User uploaded file "kaggle.json" with length 71 bytes
mv: cannot stat 'kaggle.json': No such file or directory


In [None]:
#getting files from kaggle
!kaggle competitions download -c 'santander-customery-transaction-prediction'

sample_submission.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
train.csv.zip: Skipping, found more recently modified local copy (use --force to force download)
test.csv.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
#unzipping files
!unzip train.csv.zip
!unzip test.csv.zip

Archive:  train.csv.zip
  inflating: train.csv               
Archive:  test.csv.zip
  inflating: test.csv                


In [None]:
# Load the two competition CSVs and keep an independent copy of each frame,
# so the raw `train` / `test` objects stay untouched by later cells.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))
train_cp, test_cp = train.copy(), test.copy()

In [None]:
# Split the copied frames into plain numpy arrays:
#   train_y             - the `target` column
#   train_X             - every column except `target` and `ID_code`
#   train_X_column_name - the column labels that belong to train_X
#   test_X              - every test column except `ID_code`
_train_feature_frame = train_cp.drop(columns=['target', 'ID_code'])

train_y = train_cp['target'].values
train_X_column_name = _train_feature_frame.columns
train_X = _train_feature_frame.values
test_X = test_cp.drop(columns=['ID_code']).values

In [None]:
# Working copies used by the feature-engineering and modelling cells below;
# new columns are added to these frames, not to the raw `train` / `test`.
train_df, test_df = train.copy(), test.copy()


### Feature Engineering

In [None]:
# Directory on the mounted Google Drive that is added to the import path.
# NOTE(review): presumably this folder holds `augmentation.py`; the path is
# hard-coded, so the notebook only runs where Drive is mounted at
# /content/drive with this exact folder layout.
py_file_location = "/content/drive/MyDrive/Colab Notebooks"
sys.path.append(os.path.abspath(py_file_location))

# Import `augment`, which the training loop calls as
# augment(X_train.values, y_train.values) and unpacks into two return values.
from augmentation import augment

In [None]:
# Separate "real" from "synthetic" test rows.
# A test row is treated as real when at least one of its feature values occurs
# exactly once in that feature's column; rows without any such value are
# treated as synthetic.
tmp_test = test_df.drop(columns=['ID_code']).values

# unique_count[r, c] becomes 1 when the value at row r is the only occurrence
# of that value in column c.
unique_count = np.zeros_like(tmp_test)
for col_idx in range(tmp_test.shape[1]):
    _, first_position, occurrences = np.unique(
        tmp_test[:, col_idx], return_index=True, return_counts=True)
    singleton_rows = first_position[occurrences == 1]
    unique_count[singleton_rows, col_idx] += 1

# Number of column-unique values per row, computed once and reused.
unique_values_per_row = unique_count.sum(axis=1)
real_samples = np.argwhere(unique_values_per_row > 0)[:, 0]
synthetic_samples = np.argwhere(unique_values_per_row == 0)[:, 0]


In [None]:
# Report how many test rows fell into each group.
print(f'real test: {len(real_samples)}')
print(f'syn test: {len(synthetic_samples)}')

real test: 100000
syn test: 100000


In [None]:
# Shapes of the working frames before any engineered columns are added.
for frame_label, frame in (('train', train_df), ('test', test_df)):
    print(frame_label, frame.shape)

train (200000, 202)
test (200000, 201)


In [None]:
# Increase the number of features to improve the model.
#
# For every raw `var*` column four derived columns are added to both train_df
# and test_df:
#   <feat>vc   - how often the value occurs in train + real test rows, capped at 10
#   <feat>sum  - (value - column mean) where the capped count is > 1, else 0
#   <feat>sum2 - value where the capped count is > 2, else 0
#   <feat>sum3 - value where the capped count is > 4, else 0
#
# Fixes relative to the previous version: the stray backslash inside
# `lambda  \x: ...` (a SyntaxError) is gone, the per-feature mean is computed
# once instead of twice, and the train/test duplication is a loop.
#
# NOTE: the derived column names also start with 'var', so running this cell a
# second time on the same train_df/test_df would derive features from the
# derived columns as well. Re-create train_df/test_df before re-running.
features = [col for col in train_df.columns if col.startswith('var')]

# Counts and means are taken over train plus the real test rows only, so the
# synthetic test rows do not influence the statistics.
df_all = pd.concat([train_df, test_df.iloc[real_samples]])

for feat in features:
  value_counts = df_all[feat].value_counts(dropna=True)
  feat_mean = df_all[feat].mean()

  for frame in (train_df, test_df):
    # Values that never occur in df_all map to NaN; min(10, NaN) evaluates to
    # 10, which keeps the uint8 cast valid for those rows.
    capped_count = frame[feat].map(value_counts) \
                              .map(lambda x: min(10, x)).astype(np.uint8)

    frame[feat + 'vc'] = capped_count
    frame[feat + 'sum'] = ((frame[feat] - feat_mean) *
                           (capped_count > 1).astype(int)).astype(np.float32)
    frame[feat + 'sum2'] = (frame[feat] *
                            (capped_count > 2).astype(int)).astype(np.float32)
    frame[feat + 'sum3'] = (frame[feat] *
                            (capped_count > 4).astype(int)).astype(np.float32)

In [None]:
# Shapes of the working frames after the engineered columns were added.
for frame_label, frame in (('train', train_df), ('test', test_df)):
    print(frame_label, frame.shape)

train (200000, 1002)
test (200000, 1001)


### Finding Best Parameters with GridSearch

In [None]:
# Training and testing data for the parameter search.
# Every column from the third one onwards is used as a predictor.
# NOTE(review): assumes the first two columns of train_df are `ID_code` and
# `target` -- confirm against train.csv.
predictors = train_df.columns.to_list()[2:]
X_test = test_df[predictors]

# n_splits is now actually passed to the splitter instead of a second literal.
n_splits = 5
k_fold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# .copy() makes these independent frames; adding columns to a bare
# train_df[[...]] selection is what raised the SettingWithCopyWarning.
object_of = train_df[['ID_code', 'target']].copy()
object_of['predict'] = 0
predictions = test_df[['ID_code']].copy()
val_aucs = []
feature_importance_df = pd.DataFrame()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':


In [None]:
# Materialise the stratified splits and keep only the final one: after this
# cell `fold`, `trn_ind`, `val_ind` and the X_/y_ frames describe the last
# fold, and that training part is what the grid search below is fitted on.
all_splits = list(k_fold.split(train_df, train_df.target.values))
fold = len(all_splits) - 1
trn_ind, val_ind = all_splits[fold]

X_train = train_df.iloc[trn_ind][predictors]
y_train = train_df.iloc[trn_ind]['target']
X_valid = train_df.iloc[val_ind][predictors]
y_valid = train_df.iloc[val_ind]['target']

In [None]:
# Candidate values for the grid search. Every entry is a list; the search
# tries the cross product of all lists, so only the entries with more than
# one value widen the grid (boosting, learning_rate, num_leaves,
# bagging_fraction, bagging_freq, feature_fraction).
SEED = [50]

gbdt_param = dict(
    # core setup
    objective=['binary'],
    boosting=['gbdt', 'dart'],
    learning_rate=[0.01, 0.05, 0.1],
    num_leaves=[15, 30, 45],
    tree_learner=['serial'],
    num_threads=[-1],
    seed=SEED,
    # tree shape, sampling and regularisation
    max_depth=[-1],
    min_data_in_leaf=[50],
    min_sum_hessian_in_leaf=[10],
    bagging_fraction=[0.4, 0.6, 0.8, 0.5],
    bagging_freq=[2, 5, 10, 15],
    feature_fraction=[0.05, 0.1, 0.5],
    lambda_l1=[1],
    bagging_seed=SEED,
    # logging and evaluation
    verbosity=[1],
    boost_from_average=[False],
    metric=['auc'],
)

# Name of the scorer intended for ranking the candidates.
scoring = 'roc_auc'

In [None]:
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold

# 5-fold shuffled cross-validation used inside the grid search.
gkf = KFold(n_splits=5, shuffle=True, random_state=42)

# Base estimator; the grid in `gbdt_param` overrides individual parameters
# for every candidate.
lgb_estimator = lgb.LGBMClassifier(objective='binary',
                                   num_boost_round=2000,
                                   learning_rate=0.01, metric='auc')

# `scoring` has to be passed explicitly. Without it GridSearchCV ranks the
# candidates with the estimator's default score (accuracy for a classifier)
# and the 'roc_auc' choice made in the parameter cell is silently ignored.
gsearch = GridSearchCV(estimator=lgb_estimator,
                       param_grid=gbdt_param, cv=gkf, refit=True,
                       scoring=scoring)

# fit() returns the fitted search object itself; with refit=True the best
# candidate is retrained on all of X_train / y_train.
lgb_model = gsearch.fit(X_train, y_train)



### Final Model Training

In [None]:
# Parameter set used for the final model (the values chosen after the grid
# search). Note that SEED is rebound here from the one-element list used for
# the search to a plain integer.
SEED = 40
gbdt_param = dict(
    # core setup
    objective='binary',
    boosting='gbdt',
    learning_rate=0.01,
    num_leaves=15,
    tree_learner='serial',
    num_threads=8,
    seed=SEED,
    # tree shape, sampling and regularisation
    max_depth=-1,
    min_data_in_leaf=50,
    min_sum_hessian_in_leaf=10,
    bagging_fraction=0.6,
    bagging_freq=5,
    feature_fraction=0.05,
    lambda_l1=1.,
    bagging_seed=SEED,
    # logging and evaluation
    verbosity=1,
    boost_from_average=False,
    metric='auc',
)

In [None]:
# Predictors and containers for the final cross-validated training run.
# Every column from the third one onwards is used as a predictor.
# NOTE(review): assumes the first two columns of train_df are `ID_code` and
# `target` -- confirm against train.csv.
predictors = train_df.columns.to_list()[2:]
X_test = test_df[predictors]

# n_splits is now actually passed to the splitter instead of a second literal.
n_splits = 5
k_fold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)

# .copy() makes these independent frames; adding columns to a bare
# train_df[[...]] selection is what raised the SettingWithCopyWarning.
object_of = train_df[['ID_code', 'target']].copy()
# Float placeholder: this column receives the out-of-fold probabilities.
object_of['predict'] = 0.0
predictions = test_df[['ID_code']].copy()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# Train one LightGBM model per stratified fold on augmented training data.
#
# Results produced by this cell:
#   auc_scores  - validation ROC-AUC of every fold
#   df_feat_imp - feature importances of all folds stacked in one frame
#   object_of   - gets a `predict` column with the out-of-fold predictions
#   predictions - gets one `fold<k>` column of test predictions per fold
#
# Fixes relative to the previous version:
#   * undefined names (X_t, Xaug_x_t_t, y_t, fold_importance_df) replaced by
#     the variables that are actually created in the loop;
#   * auc_scores / df_feat_imp were re-created inside the loop, so only the
#     last fold survived; they are now collected across all folds;
#   * results are no longer written through chained indexing on a slice
#     (source of the SettingWithCopyWarning); they are gathered in local
#     containers and attached once after the loop;
#   * augmented training columns are named after `predictors`, so training,
#     validation and test frames share the same feature names;
#   * the fold banner printed `1+1`; it now prints 1, matching the single
#     augmentation round that is run per fold.
auc_scores = []
fold_importances = []
oof_pred = np.zeros(len(train_df))   # out-of-fold predictions, one per train row
test_fold_preds = {}                 # 'fold<k>' -> predictions for X_test

for fold, (trn_ind, val_ind) in enumerate(k_fold.split(train_df, train_df.target.values)):
    X_train, y_train = train_df.iloc[trn_ind][predictors], train_df.iloc[trn_ind]['target']
    X_valid, y_valid = train_df.iloc[val_ind][predictors], train_df.iloc[val_ind]['target']

    print('\nFold {} - N {}'.format(fold + 1, 1))
    aug_x_t, aug_y_t = augment(X_train.values, y_train.values)

    # The first X_train.shape[0] rows get weight 1.0, all further rows 0.8.
    # NOTE(review): assumes augment() returns the original rows first and
    # appends the generated rows after them -- confirm in augmentation.py.
    weights = np.full(aug_x_t.shape[0], 0.8)
    weights[:X_train.shape[0]] = 1.0

    # NOTE(review): assumes augment() keeps the number and order of columns;
    # if it does not, this constructor raises instead of silently mislabelling.
    aug_x_t = pd.DataFrame(aug_x_t, columns=predictors)

    data_training = lgb.Dataset(aug_x_t, label=aug_y_t, weight=weights)
    data_validation = lgb.Dataset(X_valid, label=y_valid)
    eval_results = {}
    # NOTE: early_stopping_rounds / verbose_eval / evals_result are keyword
    # arguments of older LightGBM releases; LightGBM >= 4.0 expects the
    # callbacks lgb.early_stopping, lgb.log_evaluation and
    # lgb.record_evaluation instead.
    lgb_clf = lgb.train(gbdt_param,
                        data_training,
                        10000,
                        valid_sets=[data_training, data_validation],
                        early_stopping_rounds=3000,
                        verbose_eval=500,
                        evals_result=eval_results
                       )
    pred_valid = lgb_clf.predict(X_valid)
    pred_y = lgb_clf.predict(X_test)

    # Per-fold feature importances, tagged with the 1-based fold number.
    df_fld_imp = pd.DataFrame()
    df_fld_imp["feature"] = predictors
    df_fld_imp["importance"] = lgb_clf.feature_importance()
    df_fld_imp["fold"] = fold + 1
    fold_importances.append(df_fld_imp)

    # val_ind are positional indices, so they index the numpy buffer directly.
    oof_pred[val_ind] = pred_valid
    val_score = roc_auc_score(y_valid, pred_valid)
    auc_scores.append(val_score)

    test_fold_preds['fold{}'.format(fold + 1)] = pred_y

# Attach the collected results in one step (no chained assignment).
df_feat_imp = pd.concat(fold_importances, axis=0)
object_of = object_of.assign(predict=oof_pred)
predictions = predictions.assign(**test_fold_preds)


Fold 1 - N 1
Training until validation scores don't improve for 3000 rounds.
[500]	training's auc: 0.951015	valid_1's auc: 0.874454
[1000]	training's auc: 0.9597	valid_1's auc: 0.890503
[1500]	training's auc: 0.965021	valid_1's auc: 0.90089
[2000]	training's auc: 0.968826	valid_1's auc: 0.908236
[2500]	training's auc: 0.971596	valid_1's auc: 0.912964
[3000]	training's auc: 0.973783	valid_1's auc: 0.916534
[3500]	training's auc: 0.975523	valid_1's auc: 0.919115
[4000]	training's auc: 0.976946	valid_1's auc: 0.920994
[4500]	training's auc: 0.97818	valid_1's auc: 0.922301
[5000]	training's auc: 0.979264	valid_1's auc: 0.923363
[5500]	training's auc: 0.980275	valid_1's auc: 0.924236
[6000]	training's auc: 0.981166	valid_1's auc: 0.924881
[6500]	training's auc: 0.982009	valid_1's auc: 0.925218
[7000]	training's auc: 0.982798	valid_1's auc: 0.925688
[7500]	training's auc: 0.983555	valid_1's auc: 0.925946
[8000]	training's auc: 0.984292	valid_1's auc: 0.926143
[8500]	training's auc: 0.985008

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_labels(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Fold 2 - N 1
Training until validation scores don't improve for 3000 rounds.
[500]	training's auc: 0.951548	valid_1's auc: 0.859882
[1000]	training's auc: 0.960394	valid_1's auc: 0.877487
[1500]	training's auc: 0.965628	valid_1's auc: 0.888678
[2000]	training's auc: 0.969445	valid_1's auc: 0.896966
[2500]	training's auc: 0.972229	valid_1's auc: 0.903045
[3000]	training's auc: 0.974334	valid_1's auc: 0.907135
[3500]	training's auc: 0.976049	valid_1's auc: 0.910195
[4000]	training's auc: 0.977461	valid_1's auc: 0.912251
[4500]	training's auc: 0.978667	valid_1's auc: 0.913984
[5000]	training's auc: 0.979755	valid_1's auc: 0.915294
[5500]	training's auc: 0.980715	valid_1's auc: 0.916253
[6000]	training's auc: 0.981591	valid_1's auc: 0.916971
[6500]	training's auc: 0.982407	valid_1's auc: 0.917528
[7000]	training's auc: 0.983185	valid_1's auc: 0.917897
[7500]	training's auc: 0.983926	valid_1's auc: 0.918245
[8000]	training's auc: 0.984649	valid_1's auc: 0.918555
[8500]	training's auc: 0.98

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_labels(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Fold 3 - N 1
Training until validation scores don't improve for 3000 rounds.
[500]	training's auc: 0.951346	valid_1's auc: 0.869163
[1000]	training's auc: 0.959925	valid_1's auc: 0.885842
[1500]	training's auc: 0.965257	valid_1's auc: 0.896398
[2000]	training's auc: 0.969061	valid_1's auc: 0.903793
[2500]	training's auc: 0.97184	valid_1's auc: 0.909148
[3000]	training's auc: 0.973948	valid_1's auc: 0.913023
[3500]	training's auc: 0.975703	valid_1's auc: 0.915683
[4000]	training's auc: 0.977097	valid_1's auc: 0.917551
[4500]	training's auc: 0.978337	valid_1's auc: 0.919027
[5000]	training's auc: 0.97941	valid_1's auc: 0.92005
[5500]	training's auc: 0.980382	valid_1's auc: 0.920852
[6000]	training's auc: 0.981292	valid_1's auc: 0.921453
[6500]	training's auc: 0.982136	valid_1's auc: 0.921978
[7000]	training's auc: 0.982932	valid_1's auc: 0.922312
[7500]	training's auc: 0.983685	valid_1's auc: 0.92258
[8000]	training's auc: 0.984435	valid_1's auc: 0.922723
[8500]	training's auc: 0.985148

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_labels(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Fold 4 - N 1
Training until validation scores don't improve for 3000 rounds.
[500]	training's auc: 0.950892	valid_1's auc: 0.870614
[1000]	training's auc: 0.95987	valid_1's auc: 0.88558
[1500]	training's auc: 0.965291	valid_1's auc: 0.895816
[2000]	training's auc: 0.969077	valid_1's auc: 0.903557
[2500]	training's auc: 0.971791	valid_1's auc: 0.908804
[3000]	training's auc: 0.974001	valid_1's auc: 0.912658
[3500]	training's auc: 0.975703	valid_1's auc: 0.915312
[4000]	training's auc: 0.977102	valid_1's auc: 0.917238
[4500]	training's auc: 0.978329	valid_1's auc: 0.918893
[5000]	training's auc: 0.979394	valid_1's auc: 0.920081
[5500]	training's auc: 0.980371	valid_1's auc: 0.920944
[6000]	training's auc: 0.98127	valid_1's auc: 0.921635
[6500]	training's auc: 0.982111	valid_1's auc: 0.922202
[7000]	training's auc: 0.982909	valid_1's auc: 0.922636
[7500]	training's auc: 0.983676	valid_1's auc: 0.922962
[8000]	training's auc: 0.984423	valid_1's auc: 0.923207
[8500]	training's auc: 0.98514

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_labels(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Fold 5 - N 1
Training until validation scores don't improve for 3000 rounds.
[500]	training's auc: 0.951059	valid_1's auc: 0.867644
[1000]	training's auc: 0.960012	valid_1's auc: 0.884072
[1500]	training's auc: 0.965355	valid_1's auc: 0.89448
[2000]	training's auc: 0.969098	valid_1's auc: 0.902019
[2500]	training's auc: 0.971822	valid_1's auc: 0.907624
[3000]	training's auc: 0.974009	valid_1's auc: 0.911432
[3500]	training's auc: 0.975694	valid_1's auc: 0.914227
[4000]	training's auc: 0.977098	valid_1's auc: 0.916299
[4500]	training's auc: 0.978295	valid_1's auc: 0.917868
[5000]	training's auc: 0.979361	valid_1's auc: 0.919198
[5500]	training's auc: 0.980319	valid_1's auc: 0.920249
[6000]	training's auc: 0.981194	valid_1's auc: 0.921031
[6500]	training's auc: 0.982037	valid_1's auc: 0.921608
[7000]	training's auc: 0.98283	valid_1's auc: 0.922083
[7500]	training's auc: 0.983585	valid_1's auc: 0.922417
[8000]	training's auc: 0.984324	valid_1's auc: 0.922702
[8500]	training's auc: 0.9850

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_labels(key, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


### Submission DataFrame

In [None]:
# Average the per-fold test predictions into the final `target` score and
# store the full table (ID, fold columns, mean) as CSV.
# The previous version ended the first statement with a line-continuation
# backslash, which glued it to the to_csv() call and made the cell a
# SyntaxError; it also assigned into a slice of test_df.
fold_columns = [col for col in predictions.columns if col not in ['ID_code', 'target']]
predictions = predictions.assign(
    target=np.mean(predictions[fold_columns].values, axis=1))
predictions.to_csv('predictions.csv', index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
# Build the two-column submission (ID_code, target) and write it to disk.
# `target` is attached as a Series, i.e. aligned on the row index.
final_df = (
    pd.DataFrame({'ID_code': test_df['ID_code'].values})
    .assign(target=predictions['target'])
)
final_df.to_csv("another_lgb_submission.csv", index=False)