In [6]:
import numpy as np
import pandas as pd
import os
from lightgbm.sklearn import LGBMClassifier
from sklearn.preprocessing import LabelEncoder


def column_check(data1, data2):
    return set(data1.columns) == set(data1.columns).intersection(data2.columns)


def merge_duplicate_group_cols(data):
    if 'num_group1_x' in data.columns:
        data.loc[:, 'num_group1_x'] = data.loc[:, 'num_group1_x'].fillna(data.loc[:, 'num_group1_y'])
        data.drop('num_group1_y', axis=1, inplace=True)
        data.rename(columns={'num_group1_x': 'num_group1'}, inplace=True)

    if 'num_group2_x' in data.columns:
        data.loc[:, 'num_group2_x'] = data.loc[:, 'num_group2_x'].fillna(data.loc[:, 'num_group2_y'])
        data.drop('num_group2_y', axis=1, inplace=True)
        data.rename(columns={'num_group2_x': 'num_group2'}, inplace=True)
    return data


def concat_and_merge(base_data, path_list, base_path, rows, cols_to_merge=None):
    data = pd.DataFrame()
    for file in path_list:
        if cols_to_merge:
            # I specify whether read all columns or a given list of columns.
            data = pd.concat([data, pd.read_csv(base_path + file, usecols=['case_id'] + cols_to_merge, nrows=rows,
                                                low_memory=False)], axis=0)
        else:
            data = pd.concat([data, pd.read_csv(base_path + file, nrows=rows, low_memory=False)], axis=0)

    data.drop_duplicates(subset=['case_id'], keep='first', inplace=True)
    base_data = base_data.merge(data, on='case_id', how='left')

    return merge_duplicate_group_cols(base_data)


def get_file_names(file_names, keyword):
    return [x for x in file_names if keyword in x]


nrows = None
files_path = 'csv_files/train/'

files = os.listdir(files_path)

base_file = ['train_base.csv']
applprev_files = get_file_names(files, 'applprev_1')
credit_a1_files = get_file_names(files, 'credit_bureau_a_1')
credit_a2_files = get_file_names(files, 'credit_bureau_a_2')
credit_b_files = get_file_names(files, 'credit_bureau_b')
static0_files = get_file_names(files, 'static_0')
rest_of_files = set(files) - set(applprev_files + credit_a1_files + credit_a2_files + credit_b_files +
                                 static0_files + base_file)

appl_features = ['creationdate_885D']
credit_a1_features = ['numberofoverdueinstlmax_1039L', 'financialinstitution_591M', 'dpdmaxdateyear_596T',
                      'dateofcredstart_739D', 'dateofcredend_289D', 'lastupdate_1112D']
static0_features = ['price_1097A', 'isbidproduct_1095L', 'numinstlswithdpd10_728L', 'lastapprdate_640D',
                    'lastactivateddate_801D', 'mobilephncnt_593L', 'pmtnum_254L']

rest_of_features = ['incometype_1044T', 'birth_259D', 'dateofbirth_337D', 'registaddr_zipcode_184M', 'sex_738L',
                    'contaddr_zipcode_807M']

base = pd.read_csv(files_path + base_file[0], nrows=nrows)
base = concat_and_merge(base, applprev_files, files_path, nrows, appl_features)
base = concat_and_merge(base, credit_a1_files, files_path, nrows, credit_a1_features)
base = concat_and_merge(base, static0_files, files_path, nrows, static0_features)

for file in rest_of_files:
    data = pd.read_csv(files_path + file, nrows=nrows, low_memory=False)
    data.drop_duplicates(subset=['case_id'], keep='first', inplace=True)
    base = base.merge(data, on='case_id', how='left')
    base = merge_duplicate_group_cols(base)

del data

all_features = ['case_id', 'target', 'num_group1', 'date_decision'] + appl_features+credit_a1_features+static0_features+rest_of_features

base = base.drop(columns=base.columns.difference(all_features))

columns_to_fit = base.columns.drop(['case_id', 'target', 'num_group1', 'date_decision']).tolist()

object_cols = base[columns_to_fit].select_dtypes(include=['object']).columns
object_cols = object_cols.tolist()

encoders = dict()
for col in object_cols:
    le = LabelEncoder()
    base[col] = le.fit_transform(base[col])
    encoders[col] = le

lgb = LGBMClassifier(n_estimators=5000, scale_pos_weight=0.033)
lgb.fit(base[columns_to_fit], base['target'], categorical_feature=object_cols)

del base

[LightGBM] [Info] Number of positive: 47994, number of negative: 1478665
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.104057 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23489
[LightGBM] [Info] Number of data points in the train set: 1526659, number of used features: 20
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.031437 -> initscore=-3.427819
[LightGBM] [Info] Start training from score -3.427819


In [7]:
#==============================================
# test
#==============================================

nrows = None
files_path = 'csv_files/test/'

files = os.listdir(files_path)

base_file = ['test_base.csv']
applprev_files = get_file_names(files, 'applprev_1')
credit_a1_files = get_file_names(files, 'credit_bureau_a_1')
credit_a2_files = get_file_names(files, 'credit_bureau_a_2')
credit_b_files = get_file_names(files, 'credit_bureau_b')
static0_files = get_file_names(files, 'static_0')
rest_of_files = set(files) - set(applprev_files + credit_a1_files + credit_a2_files + credit_b_files +
                                 static0_files + base_file)

base = pd.read_csv(files_path + base_file[0], nrows=nrows)
base = concat_and_merge(base, applprev_files, files_path, nrows, appl_features)
base = concat_and_merge(base, credit_a1_files, files_path, nrows, credit_a1_features)
base = concat_and_merge(base, static0_files, files_path, nrows, static0_features)

for file in rest_of_files:
    data = pd.read_csv(files_path + file, nrows=nrows, low_memory=False)
    data.drop_duplicates(subset=['case_id'], keep='first', inplace=True)
    base = base.merge(data, on='case_id', how='left')
    base = merge_duplicate_group_cols(base)

del data

base = base.drop(columns=base.columns.difference(all_features))

for col in object_cols:
    base[col] = base[col].map(lambda s: 'unknown' if s not in encoders[col].classes_ else s)
    encoders[col].classes_ = np.append(encoders[col].classes_, 'unknown')  # Add 'unknown' to classes
    base[col] = encoders[col].transform(base[col])

submission = base[['case_id']]
submission.loc[:, 'score'] = lgb.predict_proba(base[columns_to_fit])[:, 1]
submission = submission.set_index('case_id')
submission.to_csv('./submission.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission.loc[:, 'score'] = lgb.predict_proba(base[columns_to_fit])[:, 1]
