In [2]:
import pandas as pd

# Data reference: https://www.kaggle.com/dalpozz/test-different-sampling-techniques/data
# Read the CSV file from the working directory into a DataFrame
data = pd.read_csv(filepath_or_buffer='creditcard.csv')

# Print the loaded frame so its columns and first values can be inspected
print(data)

            Time        V1        V2  ...       V28  Amount  Class
0            0.0 -1.359807 -0.072781  ... -0.021053  149.62    0.0
1            0.0  1.191857  0.266151  ...  0.014724    2.69    0.0
2            1.0 -1.358354 -1.340163  ... -0.059752  378.66    0.0
3            1.0 -0.966272 -0.185226  ...  0.061458  123.50    0.0
4            2.0 -1.158233  0.877737  ...  0.215153   69.99    0.0
5            2.0 -0.425966  0.960523  ...  0.081080    3.67    0.0
6            4.0  1.229658  0.141004  ...  0.005168    4.99    0.0
7            7.0 -0.644269  1.417964  ... -1.085339   40.80    0.0
8            7.0 -0.894286  0.286157  ...  0.142404   93.20    0.0
9            9.0 -0.338262  1.119593  ...  0.083076    3.68    0.0
10          10.0  1.449044 -1.176339  ...  0.016253    7.80    0.0
11          10.0  0.384978  0.616109  ... -0.054337    9.99    0.0
12          10.0  1.249999 -1.221637  ...  0.042422  121.50    0.0
13          11.0  1.069374  0.287722  ...  0.021293   27.50   

In [0]:
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

In [0]:
# The value the model has to predict lives in the 'Class' column
target = 'Class'

# Model inputs: the 'Amount' column followed by the 28 columns 'V1' through 'V28'
features = ['Amount']
features.extend('V' + str(index) for index in range(1, 29))

# Split the frame into X (feature columns only) and y (target column only)
X = data[features]
y = data[target]

In [0]:
def normalize(X):
  """
    Standardize every column of a DataFrame to zero mean and unit standard deviation.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric columns to rescale. The frame passed in is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as X, in which each column
        has had its mean subtracted and has been divided by its standard
        deviation (pandas default, i.e. the sample standard deviation).
        Columns whose standard deviation is exactly zero come back as all
        zeros instead of NaN.
  """
  column_means = X.mean()
  # A constant column has std == 0; dividing by 1 instead keeps it at 0
  # rather than producing 0/0 = NaN, which a downstream model cannot handle.
  column_stds = X.std().replace(0, 1)

  # Build a new frame instead of assigning column by column, so the caller's
  # data (possibly a slice of a larger frame) is never modified.
  return (X - column_means) / column_stds

In [0]:

model = LogisticRegression()

# Define the splitter for splitting the data in a train set and a test set:
# a single stratified 50/50 split with a fixed seed so the result is repeatable
splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=0)

# Loop through the splits (only one)
for train_indices, test_indices in splitter.split(X, y):
    # Select the train and test data
    X_train, y_train = X.iloc[train_indices], y.iloc[train_indices]
    X_test, y_test = X.iloc[test_indices], y.iloc[test_indices]

    # Standardize both splits with the mean/std of the TRAINING rows only.
    # The model is fitted on features scaled by these statistics, so the test
    # rows must go through the exact same transformation; scaling the test
    # split with its own statistics would put it on a different scale.
    train_mean = X_train.mean()
    # A zero std (constant column) is replaced by 1 to avoid dividing by zero
    train_std = X_train.std().replace(0, 1)
    X_train = (X_train - train_mean) / train_std
    X_test = (X_test - train_mean) / train_std

    # Fit and predict!
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # And finally: show the results
    print(classification_report(y_test, y_pred))