In [131]:
# Analysis stack, plotting defaults and notebook-wide settings.
import numpy as np
import pandas as pd
import seaborn as sns
# Set the seaborn theme once, up front, so it applies to plots drawn afterwards.
sns.set(context="notebook", palette="Spectral", style = 'darkgrid' ,font_scale = 1.5, color_codes=True)
import warnings
# NOTE(review): this silences *every* warning for the rest of the session,
# including numpy/pandas numerical warnings (e.g. divide-by-zero) -- confirm
# that hiding those is intended.
warnings.filterwarnings('ignore')
import os
import matplotlib.pyplot as plt
from scipy import stats
# Upper bound on the number of rows read from each input CSV; it is passed as
# `nrows` to every pd.read_csv call below.
n_rows_to_load = 100000

In [132]:
# Directory that holds the input CSVs. The default is the location used when
# the notebook was written; set the BU425_DATA_DIR environment variable to run
# the notebook on another machine without editing any cell.
DATA_DIR = os.environ.get(
    "BU425_DATA_DIR",
    "/mnt/c/Users/Anton/Desktop/BU425/Project/bu425-large-data-cluster-force/data",
)


def _load_csv(filename, index_col):
    """Read at most `n_rows_to_load` rows of DATA_DIR/<filename>, indexed by `index_col`."""
    return pd.read_csv(os.path.join(DATA_DIR, filename), index_col=index_col, nrows=n_rows_to_load)


hist_transactions = _load_csv("historical_transactions.csv", index_col='card_id')
merchants = _load_csv("merchants.csv", index_col='merchant_id')
train_df = _load_csv("train.csv", index_col='card_id')

# Outlier cut-off: a value counts as an outlier when its z-score lies outside
# (-3, 3), i.e. more than 3 standard deviations from the mean. For normally
# distributed data roughly 99.7% of values fall inside that band.
z_threshold = 3
def z_score(val, mean, std):
    """Return the z-score of `val`: its distance from `mean` in units of `std`.

    Parameters
    ----------
    val : scalar
        The value to standardise.
    mean, std : scalar
        Mean and standard deviation of the reference sample.

    Returns
    -------
    float
        ``(val - mean) / std``.
        ``inf`` when `val` is positive infinity.
        ``nan`` when `std` is zero: with no spread the z-score is undefined.
    """
    # Positive infinity short-circuits so it is always reported as +inf,
    # whatever `mean` is.
    if val == float('inf'): return float('inf')
    # Zero spread: return NaN instead of raising ZeroDivisionError (plain
    # floats) or yielding +/-inf (numpy scalars). NaN never compares greater or
    # less than a threshold, so such values are not flagged as outliers.
    if std == 0: return float('nan')
    return (val - mean)/std

In [133]:
# Creating z-score columns (<name>_z) for the historical-transactions columns
# that are screened for outliers: month_lag, purchase_amount, installments.
#
# Whole-column arithmetic is used instead of a row-by-row
# DataFrame.apply(axis=1): it computes the same (x - mean) / std for every
# row, but in a single vectorised pass per column.
for col in ['month_lag', 'purchase_amount', 'installments']:
    mean = hist_transactions[col].mean()
    std = hist_transactions[col].std()  # pandas default: sample std (ddof=1), NaN skipped
    hist_transactions[col + '_z'] = (hist_transactions[col] - mean) / std

In [134]:
# Finding outliers: rows whose z-score lies outside [-z_threshold, z_threshold]
# in at least one of the screened columns.
# The cut-off is `z_threshold`; the name `threshold` used here previously is
# never defined, so this cell raised NameError on a fresh kernel.
transaction_z_columns = ['month_lag_z', 'purchase_amount_z', 'installments_z']

# |z| > t is the same test as (z > t) | (z < -t). NaN z-scores compare False
# and are therefore not flagged.
is_transaction_outlier = (hist_transactions[transaction_z_columns].abs() > z_threshold).any(axis=1)

# One output row per flagged input row, so a card_id can appear more than once.
transactions_outlier_IDs = hist_transactions.loc[is_transaction_outlier].reset_index()[['card_id']]

In [135]:
# Creating the z-score column for the training target.
# Vectorised column arithmetic gives the same values as a row-by-row apply.
mean = train_df.target.mean()
std = train_df.target.std()  # pandas default: sample std (ddof=1), NaN skipped
train_df['target_z'] = (train_df.target - mean) / std

# Finding outliers: card_ids whose target z-score lies outside
# [-z_threshold, z_threshold]. The cut-off is `z_threshold`; the name
# `threshold` used here previously is never defined (NameError on a fresh kernel).
is_target_outlier = train_df['target_z'].abs() > z_threshold
training_target_outlier_IDs = train_df.loc[is_target_outlier].reset_index()[['card_id']]

# outliers = detect_outlier(train_df['target'])
# print(len(outliers), outliers[0:8])
#train_df.loc[(train_df['target_z'] > threshold) | (train_df['target_z'] < threshold*-1)].groupby('target').count()
# train_df.groupby('target').count().sort_index(ascending=True)

In [136]:
# Creating z-score columns (<name>_z) for the numeric merchants columns.
columns = ['numerical_1', 'numerical_2', 'avg_sales_lag3', 'avg_purchases_lag3', 'active_months_lag3', 'avg_sales_lag6', 'avg_purchases_lag6', 'active_months_lag6', 'avg_sales_lag12', 'avg_purchases_lag12', 'active_months_lag12']
for col in columns:
    # masked_invalid hides NaN and +/-inf entries so they do not distort the
    # column statistics.
    valid_values = np.ma.masked_invalid(merchants[col])
    mean = valid_values.mean()
    # NOTE: numpy's std is the population std (ddof=0), unlike the pandas
    # .std() (ddof=1) used for the other tables; kept as-is so the resulting
    # z-scores do not change.
    std = valid_values.std()
    # Whole-column arithmetic instead of a per-row apply: same (x - mean) / std
    # per element, with +/-inf inputs staying +/-inf and NaN staying NaN.
    merchants[col+'_z'] = (merchants[col] - mean) / std

#     For testing
#     outliers = detect_outlier(merchants['card_id', col])
#     print(col)
#     print(len(outliers), outliers[0:8])
#     print()

In [138]:
# Saving outliers: merchants whose z-score lies outside
# [-z_threshold, z_threshold] in at least one of the screened columns.
# The cut-off is `z_threshold`; the name `threshold` used here previously is
# never defined, so this cell raised NameError on a fresh kernel.
merchant_z_columns = [
    'numerical_1_z', 'numerical_2_z',
    'avg_sales_lag3_z', 'avg_purchases_lag3_z', 'active_months_lag3_z',
    'avg_sales_lag6_z', 'avg_purchases_lag6_z', 'active_months_lag6_z',
    'avg_sales_lag12_z', 'avg_purchases_lag12_z', 'active_months_lag12_z',
]

# |z| > t is the same test as (z > t) | (z < -t). NaN z-scores compare False
# and are not flagged; +/-inf z-scores are.
is_merchant_outlier = (merchants[merchant_z_columns].abs() > z_threshold).any(axis=1)
merchants_outlier_IDs = merchants.loc[is_merchant_outlier].reset_index()[['merchant_id']]

In [141]:
# Persist each table of outlier IDs as a CSV in the current working directory.
# to_csv is left at its defaults, so the positional index is written as the
# first (unnamed) column of every file.
for csv_name, outlier_ids in (
    ('transactions_outlier_IDs.csv', transactions_outlier_IDs),
    ('training_target_outlier_IDs.csv', training_target_outlier_IDs),
    ('merchants_outlier_IDs.csv', merchants_outlier_IDs),
):
    outlier_ids.to_csv(csv_name)