In [1]:
import logging
import multiprocessing as mp
from collections import Counter

import cv2
# from databases import pull_pending, submit_result
# from datasets import load
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import RandomOverSampler as ROS
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import AllKNN as AKNN
from imblearn.under_sampling import ClusterCentroids as CC
from imblearn.under_sampling import CondensedNearestNeighbour as CNN
from imblearn.under_sampling import EditedNearestNeighbours as ENN
from imblearn.under_sampling import InstanceHardnessThreshold as IHT
from imblearn.under_sampling import RepeatedEditedNearestNeighbours as RENN
from imblearn.under_sampling import NearMiss as NM
from imblearn.under_sampling import NeighbourhoodCleaningRule as NCL
from imblearn.under_sampling import OneSidedSelection as OSS
from imblearn.under_sampling import RandomUnderSampler as RUS
from imblearn.under_sampling import TomekLinks as TL
# from rbo import RBO
# from rbu import RBU
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB as NB
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.svm import SVC as SVM
from sklearn.tree import DecisionTreeClassifier as CART


In [4]:
from pandas import read_csv
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [5]:
import imblearn.metrics
import sklearn.metrics

from collections import Counter


def metric_decorator(metric_function):
    """Wrap a minority-class metric so the positive label is inferred when omitted.

    Args:
        metric_function: callable taking ``(ground_truth, predictions,
            minority_class)`` and returning the metric value.

    Returns:
        A callable ``(ground_truth, predictions, minority_class=None)``. When
        ``minority_class`` is None, the least frequent label in
        ``ground_truth`` (the last entry of ``Counter.most_common()``) is used.
        The wrapper keeps the wrapped function's name and docstring.
    """
    from functools import wraps

    # Without wraps every decorated metric would report itself as
    # "metric_wrapper", which makes logs and result tables ambiguous.
    @wraps(metric_function)
    def metric_wrapper(ground_truth, predictions, minority_class=None):
        if minority_class is None:
            # most_common() is sorted by descending count, so the last entry
            # is the rarest label.
            minority_class = Counter(ground_truth).most_common()[-1][0]

        return metric_function(ground_truth, predictions, minority_class)

    return metric_wrapper


@metric_decorator
def precision(ground_truth, predictions, minority_class=None):
    """Precision score with ``minority_class`` as the positive label.

    The ``metric_decorator`` wrapper fills in ``minority_class`` with the
    rarest label of ``ground_truth`` when the caller leaves it as None.
    """
    return sklearn.metrics.precision_score(
        y_true=ground_truth,
        y_pred=predictions,
        pos_label=minority_class,
    )


@metric_decorator
def recall(ground_truth, predictions, minority_class=None):
    """Recall score with ``minority_class`` as the positive label.

    The ``metric_decorator`` wrapper fills in ``minority_class`` with the
    rarest label of ``ground_truth`` when the caller leaves it as None.
    """
    return sklearn.metrics.recall_score(
        y_true=ground_truth,
        y_pred=predictions,
        pos_label=minority_class,
    )


@metric_decorator
def f_measure(ground_truth, predictions, minority_class=None):
    """F1 score with ``minority_class`` as the positive label.

    The ``metric_decorator`` wrapper fills in ``minority_class`` with the
    rarest label of ``ground_truth`` when the caller leaves it as None.
    """
    return sklearn.metrics.f1_score(
        y_true=ground_truth,
        y_pred=predictions,
        pos_label=minority_class,
    )


def g_mean(ground_truth, predictions):
    """Geometric mean score from ``imblearn.metrics``, using its default settings."""
    score = imblearn.metrics.geometric_mean_score(y_true=ground_truth, y_pred=predictions)
    return score


def auc(ground_truth, predictions):
    """ROC AUC from scikit-learn; ``predictions`` is passed as the score argument."""
    score = sklearn.metrics.roc_auc_score(y_true=ground_truth, y_score=predictions)
    return score

In [48]:
import logging
import numpy as np
from itertools import product
from sklearn.model_selection import StratifiedKFold


class ResamplingCV:
    """Pick a resampler's parameters by cross-validation, then resample the data.

    Every combination of the keyword-argument value lists is evaluated with
    ``n`` repetitions of a shuffled 2-fold stratified split: the training fold
    is resampled, ``classifier`` is fitted on it, and the mean of ``metrics``
    on the untouched test fold is the fold score. The combination with the
    highest mean score is used to resample the full data set.

    Args:
        algorithm: resampler class (not an instance); it is instantiated as
            ``algorithm(**parameters)`` and must provide ``fit_resample``.
        classifier: a single estimator with ``fit`` and ``predict``. It is
            refitted in place for every fold.
        metrics: callables ``metric(y_true, y_pred)``; their mean is the score.
        n: number of repetitions of the 2-fold split.
        **kwargs: parameter grid, one list of candidate values per parameter.

    Note:
        The folds are shuffled without a fixed ``random_state``, so the
        selected parameters can differ between runs.
    """

    def __init__(self, algorithm, classifier, metrics=(f_measure, g_mean, auc), n=3, **kwargs):
        self.algorithm = algorithm
        self.classifier = classifier
        self.metrics = metrics
        self.n = n
        self.kwargs = kwargs

    @staticmethod
    def _take(data, indices):
        """Select rows by position from a pandas object or an indexable array."""
        if hasattr(data, 'iloc'):
            return data.iloc[indices]

        return data[indices]

    def _score_parameters(self, X, y, parameters):
        """Return the mean CV score of one parameter set, or -inf if resampling fails."""
        scores = []

        for _ in range(self.n):
            skf = StratifiedKFold(n_splits=2, shuffle=True)

            for train_idx, test_idx in skf.split(X, y):
                try:
                    X_train, y_train = self.algorithm(**parameters).fit_resample(
                        self._take(X, train_idx), self._take(y, train_idx)
                    )
                except ValueError as e:
                    logging.warning('ValueError "%s" occurred during CV resampling with %s. '
                                    'Setting parameter score to -inf.', e, self.algorithm.__name__)

                    # One failure already forces the mean to -inf, so the
                    # remaining folds and repetitions are skipped.
                    return -np.inf

                if len(np.unique(y_train)) < 2:
                    logging.warning('One of the classes was eliminated during CV resampling with %s. '
                                    'Setting parameter score to -inf.', self.algorithm.__name__)

                    return -np.inf

                # Labels are flattened so that a one-column DataFrame behaves
                # like a 1-D vector for the classifier and for the metrics
                # (iterating a DataFrame yields column names, not labels).
                classifier = self.classifier.fit(X_train, np.ravel(y_train))
                predictions = classifier.predict(self._take(X, test_idx))
                y_test = np.ravel(self._take(y, test_idx))

                scores.append(np.mean([metric(y_test, predictions) for metric in self.metrics]))

        return np.mean(scores) if scores else -np.inf

    def fit_sample(self, X, y):
        """Resample ``(X, y)`` with the best-scoring parameter combination.

        Args:
            X: feature matrix (pandas DataFrame or positionally indexable array).
            y: class labels aligned with ``X`` by position.

        Returns:
            Whatever ``algorithm(**best_parameters).fit_resample(X, y)`` returns.

        Raises:
            ValueError: if a parameter was given an empty list of candidates.
            TypeError: if a search is required but ``classifier`` has no
                ``fit``/``predict``.
        """
        parameter_combinations = [
            dict(zip(self.kwargs, values)) for values in product(*self.kwargs.values())
        ]

        if not parameter_combinations:
            raise ValueError('Empty parameter grid for %s: every parameter needs at least one value.'
                             % self.algorithm.__name__)

        # Nothing to select: skip cross-validation entirely.
        if len(parameter_combinations) == 1:
            return self.algorithm(**parameter_combinations[0]).fit_resample(X, y)

        if not (hasattr(self.classifier, 'fit') and hasattr(self.classifier, 'predict')):
            raise TypeError('classifier must be a single estimator with fit/predict, got %s.'
                            % type(self.classifier).__name__)

        best_score = -np.inf
        best_parameters = None

        for parameters in parameter_combinations:
            score = self._score_parameters(X, y, parameters)

            if score > best_score:
                best_score = score
                best_parameters = parameters

        if best_parameters is None:
            # Every combination scored -inf (or NaN); fall back to the first
            # one instead of failing on ``**None``.
            logging.warning('No parameter combination could be scored for %s. '
                            'Falling back to the first combination.', self.algorithm.__name__)

            best_parameters = parameter_combinations[0]

        return self.algorithm(**best_parameters).fit_resample(X, y)

    # Alias matching the method name the wrapped samplers are called with.
    fit_resample = fit_sample

In [7]:
from sklearn.preprocessing import LabelEncoder

df = pd.read_csv('bs140513_032310.csv')

# Columns that are replaced by integer codes, in the order they are processed.
categorical_columns = ['customer', 'age', 'zipcodeOri', 'zipMerchant', 'merchant', 'category', 'gender']

# One fitted encoder per column, so each mapping stays available for
# inverse_transform. Each column is passed as a 1-D Series: LabelEncoder works
# on 1-D label vectors, and a one-column frame triggers a conversion warning.
encoders = {}
for column in categorical_columns:
    encoders[column] = LabelEncoder()
    df[column] = encoders[column].fit_transform(df[column])

# Kept for backward compatibility: `le` was previously a single shared encoder
# that ended up fitted on the last processed column.
le = encoders[categorical_columns[-1]]

  return f(*args, **kwargs)


In [8]:
# Split the encoded frame into train and test parts; the fixed random_state
# makes the split repeatable. No test_size is given, so the library default applies.
train,test=train_test_split(df,random_state=42)

In [9]:
# Separate the 'fraud' target column from the remaining feature columns,
# for the training part and the test part alike.
x_train, y_train = train.drop(columns='fraud'), train['fraud']
xtest, ytest = test.drop(columns='fraud'), test['fraud']

In [10]:
# (label, count) pairs of the training target, most frequent label first.
class_counts = Counter(y_train).most_common()
n_majority = class_counts[0][1]
n_minority = class_counts[1][1]

In [11]:
# Count of the second most frequent label in y_train.
n_minority

5450

In [12]:
# Count of the most frequent label in y_train.
n_majority

440532

In [43]:
# Candidate class ratios for the samplers' sampling_strategy grids.
# Each value equals n_minority / n_majority + ratio * (1 - n_minority / n_majority),
# so ratio=1.0 gives 1.0 and smaller ratios stay closer to the current imbalance.
class_gap = n_majority - n_minority
imblearn_ratios = [(class_gap * ratio + n_minority) / n_majority for ratio in (0.5, 0.75, 1.0)]

# Classifiers keyed by short name.
clf = dict(
    NB=NB(),
    KNN=KNN(),
    SVM=SVM(gamma='scale'),
    CART=CART(),
)

In [36]:
# Short names of the under-sampling techniques; they match the import aliases
# of the imblearn.under_sampling classes.
samtec = 'AKNN CC CNN ENN IHT RENN NM NCL OSS RUS TL'.split()

In [28]:
# Estimator used by ResamplingCV to score candidate parameters. ResamplingCV
# takes ONE classifier, not the whole `clf` dict.
# NOTE(review): KNN is an assumption - confirm which classifier should drive
# the parameter selection.
cv_classifier = clf['KNN']


def smote_candidates():
    """Build SMOTE objects covering every (k_neighbors, ratio) pair.

    The combined samplers are handed a ready-made SMOTE object, so the target
    ratio is set on that object rather than on the combiner.
    """
    return [
        SMOTE(k_neighbors=k, sampling_strategy=ratio)
        for k in [1, 3, 5, 7, 9]
        for ratio in imblearn_ratios
    ]


# Resampling strategies keyed by short name. Every entry is a ResamplingCV, so
# each exposes the same fit_sample(X, y) entry point.
algorithms = {
    'AKNN': ResamplingCV(
        AKNN, cv_classifier,
        n_neighbors=[1, 3, 5, 7]
    ),
    # Borderline SMOTE lives in its own class; plain SMOTE does not accept
    # `kind` or `m_neighbors`.
    'Bord': ResamplingCV(
        BorderlineSMOTE, cv_classifier,
        kind=['borderline-1'],
        k_neighbors=[1, 3, 5, 7, 9],
        m_neighbors=[5, 10, 15],
        sampling_strategy=imblearn_ratios
    ),
    'CC': ResamplingCV(
        CC, cv_classifier,
        sampling_strategy=imblearn_ratios
    ),
    'CNN': ResamplingCV(
        CNN, cv_classifier,
        n_neighbors=[1, 3, 5, 7]
    ),
    'ENN': ResamplingCV(
        ENN, cv_classifier,
        n_neighbors=[1, 3, 5, 7]
    ),
    'IHT': ResamplingCV(
        IHT, cv_classifier,
        sampling_strategy=imblearn_ratios,
        cv=[2]
    ),
    'NCL': ResamplingCV(
        NCL, cv_classifier,
        n_neighbors=[1, 3, 5, 7]
    ),
    'NM': ResamplingCV(
        NM, cv_classifier,
        n_neighbors=[1, 3, 5, 7]
    ),
    'OSS': ResamplingCV(
        OSS, cv_classifier,
        n_neighbors=[1, 3, 5, 7]
    ),
    'RENN': ResamplingCV(
        RENN, cv_classifier,
        n_neighbors=[1, 3, 5, 7]
    ),
    'ROS': ResamplingCV(
        ROS, cv_classifier,
        sampling_strategy=imblearn_ratios
    ),
    'RUS': ResamplingCV(
        RUS, cv_classifier,
        sampling_strategy=imblearn_ratios
    ),
    'SMOTE': ResamplingCV(
        SMOTE, cv_classifier,
        k_neighbors=[1, 3, 5, 7, 9],
        sampling_strategy=imblearn_ratios
    ),
    'SMOTE+ENN': ResamplingCV(
        SMOTEENN, cv_classifier,
        smote=smote_candidates()
    ),
    'SMOTE+TL': ResamplingCV(
        SMOTETomek, cv_classifier,
        smote=smote_candidates()
    ),
    # No parameters to tune: with an empty grid ResamplingCV applies TL()
    # directly, while still offering fit_sample like the other entries.
    'TL': ResamplingCV(TL, cv_classifier),
}

In [29]:
# Scaler that maps every feature to the range [-1, 1]; it is fitted later,
# when fit_transform is called on the training features.
from sklearn.preprocessing import MinMaxScaler
scaling = MinMaxScaler(feature_range=(-1,1))

In [53]:
 # Select the resampling strategy to run. Indexing (instead of .get) raises a
 # KeyError right here for an unknown name, rather than yielding None and
 # failing later on `algorithm.fit_sample`.
 algorithm = algorithms['OSS']


In [46]:
# Stand-alone AllKNN under-sampler with default parameters.
aknn=AKNN()

In [54]:
 # Scale the training features, then resample with the selected strategy.
 # fit_transform returns a bare array, so the feature names and the row index
 # are re-attached to keep the frame aligned with y_train.
 x_train_scaled = pd.DataFrame(scaling.fit_transform(x_train), columns=x_train.columns, index=x_train.index)
 X_train, Y_train = algorithm.fit_sample(x_train_scaled, pd.DataFrame(y_train))

In [61]:
xt,yt,xtest,ytest=train_test_split( X_train, Y_train,random_state=42)

In [66]:
xtest['fraud'].value_counts()

0    327495
1      4081
Name: fraud, dtype: int64

In [59]:
# Number of rows in the full data frame.
len(df)

594643

In [44]:
# Show the KNN entry of the classifier dict.
clf['KNN']

KNeighborsClassifier()

Unnamed: 0,step,customer,age,gender,zipcodeOri,merchant,zipMerchant,category,amount
458422,143,134,2,1,0,18,0,12,40.54
102312,37,1251,1,1,0,18,0,12,38.63
253447,84,3973,2,1,0,18,0,12,61.62
585032,177,1503,3,1,0,45,0,3,41.24
136214,48,4074,2,1,0,30,0,12,4.74
...,...,...,...,...,...,...,...,...,...
110268,40,1981,5,1,0,30,0,12,38.12
259178,86,747,2,1,0,30,0,12,38.63
365838,117,104,6,2,0,18,0,12,0.01
131932,47,291,3,2,0,30,0,12,20.37
