__This file contains code that generates new train and test datasets, either by removing leaked rows from the train set or by adding leaked rows from the test set to the train set. It also contains code that combines predictions from machine learning models with leak-based predictions that are filtered in different ways. The different sections are therefore not related to each other.__

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error

### 1. Combine leak and prediction

In [2]:
# Load the test set and the leak file that goes with it
test = pd.read_csv('./test.csv')
test_leak = pd.read_csv('./leak_prediction_using_all_leaks.csv')

In [None]:
# Load the predictions produced by the machine learning models
test_prediction = pd.read_csv(
    './xgb+lgb.csv',
)

In [None]:
# Combine leak and prediction: wherever a leaked target is available it
# replaces the model prediction; every other row keeps the model's value.
# A copy is taken because plain assignment would only alias test_prediction,
# and the in-place overwrite below would then silently modify it as well.
# NOTE(review): the masks are aligned on the row index, so this assumes
# test_prediction and test_leak describe the same rows in the same order - confirm.
result = test_prediction.copy()
has_leak = test_leak['compiled_leak'].notna()
result.loc[has_leak, 'target'] = test_leak.loc[has_leak, 'compiled_leak']
result.to_csv('prediction_and_leak.csv', index=False)

### 2. Remove leaked rows from train set

Leaked rows may actually come from the same customer or from just a small number of customers. I tried to build models again on a new train set after the leaked rows were removed.

In [None]:
# Load the train set and the leak file that goes with it
train = pd.read_csv('./train.csv')
train_leak = pd.read_csv('./leak/train_leak.csv')
# NOTE(review): this preview is not the last expression of the cell, so with
# default notebook settings its result is discarded rather than displayed.
train_leak.head(10)

In [None]:
# Share of leaked train rows whose leaked value equals the true target.
# Only rows that actually have a leaked value are compared, and the ratio is
# computed with a vectorised mean instead of Python's builtin sum() over the
# Series. With no leaked rows at all the result is NaN rather than a division
# by zero.
leaked = train_leak['compiled_leak'].notna()
print('Accuracy of predictions by leak:', end=' ')
print((train_leak.loc[leaked, 'compiled_leak'] == train.loc[leaked, 'target']).mean())

In [None]:
# Keep only the train rows for which no leaked target exists
train = train[~train_leak['compiled_leak'].notna()]

In [10]:
# Write the current train frame to disk without the row index
train.to_csv(path_or_buf='leak_removed_from_train.csv', index=False)

### 3. Add leaked rows from test set to train set

For leaked rows in the test set, the target values obtained from the leak rules should generally be right. Therefore, I can try to move these rows into the train set.

In [None]:
# Attach the leak-derived target to every test row (NaN where no leak exists).
# NOTE(review): this adds the 'target' column to `test` in place.
test['target'] = test_leak['compiled_leak']

# Rows without a leaked target stay in the test set. The 'target' column is
# kept in test_new and is NaN for every remaining row.
test_new = test[test['target'].isna()]

# Rows with a leaked target are appended to the train set, and the combined
# frame gets a fresh 0..n-1 row index.
item_leaked = test[~test['target'].isna()]
train_new = pd.concat([train, item_leaked], ignore_index=True)

print('Shape of the new train set: {}*{}'.format(*train_new.shape))
print('Shape of the new test set: {}*{}'.format(*test_new.shape))

In [None]:
# Write both rebuilt data sets to disk, leaving out the row index
test_new.to_csv('test_new.csv', index=False)
train_new.to_csv('train_new.csv', index=False)

In [53]:
# Rebuild compiled_leak for the test set from scratch, this time without a
# best-lag cutoff: each row takes the value of the lowest-numbered
# leaked_target_<lag> column that is not NaN.
max_lag = 38
test_leak['compiled_leak'] = np.nan

for lag in range(max_lag):
    # fillna only touches rows that are still NaN, so earlier lags win
    test_leak['compiled_leak'] = test_leak['compiled_leak'].fillna(
        test_leak['leaked_target_' + str(lag)]
    )

### 4. Combine leak and prediction: Machine learning is used to predict reliability of leaked rows

In [None]:
# Combine leak and prediction when whether a leak is correct is predicted by
# some machine learning algorithm.
# useful_test_leak.csv contains the reliability of each leaked row, one value
# per leaked row in row order: 0 stands for not reliable, otherwise 1.
useful_test_leak = pd.read_csv('./useful_test_leak.csv', names=['correct_or_wrong'])
test_prediction = pd.read_csv('./xgb+lgb.csv')

# All leaked rows are marked by 1 first (rows without a leak get 0)
test_prediction['change_needed'] = test_leak['compiled_leak'].notna().astype(int)

# Leaked rows that are predicted to be unreliable are changed to 0.
# The k-th leaked row receives the k-th reliability flag; this is done with a
# single positional assignment instead of a per-row iterrows() loop.
leaked_rows = test_prediction['change_needed'] == 1
count = int(leaked_rows.sum())
if len(useful_test_leak) < count:
    raise ValueError(
        'useful_test_leak.csv has {} rows but there are {} leaked rows'.format(
            len(useful_test_leak), count
        )
    )
test_prediction.loc[leaked_rows, 'change_needed'] = (
    useful_test_leak['correct_or_wrong'].to_numpy()[:count]
)

print('Number of leaked rows: {}'.format(count))
reliable = test_prediction['change_needed'] == 1
print('Number of reliable leaked rows: {}'.format(reliable.sum()))

# Only leaks judged reliable replace the model prediction
test_prediction.loc[reliable, 'target'] = test_leak.loc[reliable, 'compiled_leak']
test_prediction[['ID','target']].to_csv('only_use_reliable_leaks.csv', index=False)
