In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the transactions table from the working directory; later cells read its
# 'Amount' and 'Class' columns.
data = pd.read_csv('creditcard.csv')

In [3]:
# Fraction of rows labelled Class == 1 (prevalence of the positive class).
float((data['Class'] == 1).sum()) / data.shape[0]

0.001727485630620034

In [4]:
# Standardise 'Amount' (subtract the mean, divide by the standard deviation).
# The statistics come from the whole column, i.e. before any train/test split.
mean, std = data['Amount'].mean(), data['Amount'].std()
data['Amount'] = data['Amount'].sub(mean).div(std)

In [5]:
# 'Class' is the target; every remaining column is used as a feature.
y = data['Class']
X = data.drop(['Class'], axis=1)

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
import random

In [66]:
def analysis(clf, X,  y):
    """Score a fitted classifier on one labelled set.

    Parameters
    ----------
    clf : object
        Fitted estimator exposing ``predict(X)``.
    X : array-like
        Feature rows to predict on.
    y : array-like
        True labels aligned row-for-row with ``X``.

    Returns
    -------
    tuple
        ``(recall, precision)`` -- note the order: recall comes first.
    """
    # Predict once and feed the labels straight to the metrics; the previous
    # version also computed an accuracy value that was never used.
    pred = clf.predict(X)
    recall = recall_score(y, pred)
    precision = precision_score(y, pred)
    return recall, precision

In [49]:
def model_score(clf, X, y, n_splits=5, random_state=None):
    """Print the mean cross-validated recall and precision of ``clf``.

    Parameters
    ----------
    clf : estimator
        Unfitted scikit-learn style classifier; it is handed to
        ``cross_val_score`` once per metric.
    X, y : array-like
        Features and labels.
    n_splits : int, default 5
        Number of stratified folds.
    random_state : int or None, default None
        Seed for the fold shuffling. ``None`` keeps the historical behaviour
        of drawing a fresh seed in [1, 100] on every call, so results then
        differ between runs; pass an int to make the scores reproducible.

    Returns
    -------
    None
        The two means are printed, not returned.
    """
    if random_state is None:
        random_state = random.randint(1, 100)
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    my_scorer1 = make_scorer(recall_score)
    my_scorer2 = make_scorer(precision_score)

    # The splitter is seeded with an int, so both calls score the same folds.
    scores1 = cross_val_score(clf, X, y, scoring=my_scorer1, cv=skf)
    scores2 = cross_val_score(clf, X, y, scoring=my_scorer2, cv=skf)

    # print(...) with one argument behaves the same on Python 2 and Python 3.
    print("recall {}".format(scores1.mean()))
    print("precision {}".format(scores2.mean()))

In [37]:
# Baseline: L2-regularised logistic regression with no class re-weighting.
# The seed is drawn at random, so the printed scores vary from run to run.
clf = LogisticRegression(penalty='l2', intercept_scaling=1., random_state=random.randint(1, 100))
print("default")
model_score(clf, X, y)

default
recall 0.654524840239
precision 0.734290959718


In [39]:
# class_weight='balanced' is, in essence, still a form of over-sampling.
# The seed is drawn at random, so the printed scores vary from run to run.
clf = LogisticRegression(penalty='l2', class_weight='balanced',intercept_scaling=1., random_state=random.randint(1, 100))
print("balanced")
model_score(clf, X, y)

balanced
recall 0.904452690167
precision 0.0659131368346


In [61]:
# Experiment: keep the plain logistic regression but move the decision
# threshold. A row is flagged positive when the probability in column 0 of
# predict_proba falls below `scaling`, the share of label-0 rows in y.
# (Assumes labels are 0/1 so that column 0 corresponds to class 0.)
clf = LogisticRegression(penalty='l2', intercept_scaling=1., random_state=random.randint(1, 100))
skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=random.randint(1, 100))
recall_scores = []
precision_scores = []
scaling = float(y.shape[0] - y.sum()) / y.shape[0]
for train_index, test_index in skf.split(X, y):
    # split() yields positional row numbers, so select with .iloc; .loc would
    # look them up as index labels and is only right for a default RangeIndex.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    clf.fit(X_train, y_train)
    predict_proba = clf.predict_proba(X_test)
    predict = (predict_proba[:, 0] < scaling).astype(int)
    recall_scores.append(recall_score(y_test, predict))
    precision_scores.append(precision_score(y_test, predict))
# print(...) with one argument behaves the same on Python 2 and Python 3.
print("threshold {}".format(scaling))
print("recall {}".format(np.array(recall_scores).mean()))
print("precision {}".format(np.array(precision_scores).mean()))

threshold 0.998272514369
recall 0.855691056911
precision 0.00845923096711


In [None]:
# Under-sampling, for comparison: https://www.kaggle.com/joparga3/d/dalpozz/creditcardfraud/in-depth-skewed-data-classif-93-recall-acc-now
# recall 0.9115646258503401
# precision 0.011938702779757662

In [18]:
def SMOTE_over_sampling(X, y, random_state=None):
    """Over-sample ``(X, y)`` with SMOTE from imbalanced-learn.

    Parameters
    ----------
    X, y : array-like
        Features and labels to resample.
    random_state : int or None, default None
        Seed forwarded to ``SMOTE``. ``None`` leaves the sampler unseeded,
        as before.

    Returns
    -------
    tuple
        ``(X_resampled, y_resampled)`` as returned by the sampler.
    """
    # Imported inside the function, so imbalanced-learn is only required
    # when this helper is actually called.
    from imblearn.over_sampling import SMOTE

    # No `kind` argument: regular SMOTE is the default in releases that still
    # accept `kind`, and later releases reject the argument altogether.
    sm = SMOTE(random_state=random_state)

    # `fit_sample` was renamed to `fit_resample`; use whichever this release has.
    resample = getattr(sm, 'fit_resample', None)
    if resample is None:
        resample = sm.fit_sample
    X_resampled, y_resampled = resample(X, y)
    return X_resampled, y_resampled

In [69]:
def train(X, y):
    """Evaluate logistic regression trained on SMOTE-resampled folds.

    Runs a 2-fold stratified split. In each fold only the training part is
    over-sampled with ``SMOTE_over_sampling``; the model is then scored on
    the untouched test part with ``analysis``. The mean recall and precision
    over the folds are printed.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows (selected positionally with ``.iloc``).
    y : pandas.Series
        Labels aligned with ``X``.

    Returns
    -------
    LogisticRegression
        The classifier object, as left by the ``fit`` call of the last fold.

    Notes
    -----
    Both seeds are drawn with ``random.randint``, so scores vary between runs.
    """
    print("over-smalping: SMOTE")
    clf = LogisticRegression(penalty='l2', intercept_scaling=1., random_state=random.randint(1, 100))
    skf = StratifiedKFold(n_splits=2, shuffle=True, random_state=random.randint(1, 100))
    recall_scores = []
    precision_scores = []
    for train_index, test_index in skf.split(X, y):
        # split() yields positional row numbers, so select with .iloc; .loc
        # would treat them as index labels and break on a non-default index.
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        X_resampled, y_resampled = SMOTE_over_sampling(X_train, y_train)
        clf.fit(X_resampled, y_resampled)
        # Local names chosen so they do not shadow the imported recall_score.
        fold_recall, fold_precision = analysis(clf, X_test, y_test)
        recall_scores.append(fold_recall)
        precision_scores.append(fold_precision)
    # print(...) with one argument behaves the same on Python 2 and Python 3.
    print("recall {}".format(np.array(recall_scores).mean()))
    print("precision {}".format(np.array(precision_scores).mean()))
    return clf
# Run the SMOTE experiment on the full X / y; this rebinds the notebook-level
# `clf` to the classifier object returned by train().
clf = train(X, y)

over-smalping: SMOTE
recall 0.865853658537
precision 0.111328295814
