In [3]:
import json
import pickle

import numpy as np
import pandas as pd

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

In [4]:
# Location of the AER credit-card CSV, held in one named constant so it is easy to repoint.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this file exists.
DATA_PATH = "/home/onyeogulu/www/node/mlbookcamp-code/AER_credit_card_data.csv"
df = pd.read_csv(DATA_PATH)

In [5]:
df.head()

Unnamed: 0,card,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
0,yes,0,37.66667,4.52,0.03327,124.9833,yes,no,3,54,1,12
1,yes,0,33.25,2.42,0.005217,9.854167,no,no,3,34,1,13
2,yes,0,33.66667,4.5,0.004156,15.0,yes,no,4,58,1,5
3,yes,0,30.5,2.54,0.065214,137.8692,no,no,0,25,1,7
4,yes,0,32.16667,9.7867,0.067051,546.5033,yes,no,2,64,1,5


In [6]:
df.card.value_counts()

yes    1023
no      296
Name: card, dtype: int64

In [7]:
# Encode the target column: 'yes' -> 1, anything else -> 0.
# Guarded on dtype so re-running the cell is a no-op; once the column is numeric,
# comparing it to 'yes' again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(df['card']):
    df['card'] = (df['card'] == 'yes').astype(int)

In [8]:
len(df)

1319

In [9]:
df.isnull().sum()

card           0
reports        0
age            0
income         0
share          0
expenditure    0
owner          0
selfemp        0
dependents     0
months         0
majorcards     0
active         0
dtype: int64

In [10]:
df.dtypes

card             int64
reports          int64
age            float64
income         float64
share          float64
expenditure    float64
owner           object
selfemp         object
dependents       int64
months           int64
majorcards       int64
active           int64
dtype: object

In [11]:
# Column groupings by kind: string-valued columns vs. numeric columns.
# NOTE(review): `reports` appears in df.dtypes but in neither list — confirm that is intentional.
categorical = ['owner', 'selfemp']
numerical = ['dependents', 'months', 'majorcards', 'active', 'age', 'income', 'share', 'expenditure']

In [12]:
df[categorical].nunique()

owner      2
selfemp    2
dtype: int64

In [13]:
df[numerical].mean()

dependents       0.993935
months          55.267627
majorcards       0.817286
active           6.996967
age             33.213103
income           3.365376
share            0.068732
expenditure    185.057071
dtype: float64

In [14]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable.
df_train, df_test = train_test_split(df, test_size=0.2, random_state=1)

In [15]:
# Sanity-check the split sizes: training rows first, then test rows.
print(len(df_train))
print(len(df_test))

1055
264


In [16]:
# Pull the labels out as NumPy arrays.
# Must run before `card` is removed from df_train / df_test; afterwards the attribute no longer exists.
y_train = df_train.card.values
y_test = df_test.card.values

In [17]:
# Remove the target from the feature frames so it is not fed to the model as a feature.
# drop(..., errors='ignore') keeps the cell re-runnable: a second run is a no-op,
# whereas `del df['card']` raises KeyError once the column is gone.
df_train = df_train.drop(columns=['card'], errors='ignore')
df_test = df_test.drop(columns=['card'], errors='ignore')

In [43]:
def train(df, y, C=1.0):
    """Fit a DictVectorizer and a logistic-regression classifier on ``df``.

    Args:
        df: pandas DataFrame of features; every row is turned into a
            ``{column: value}`` dict before vectorization.
        y: Target labels, one per row of ``df``.
        C: Value forwarded to ``LogisticRegression(C=...)``. Defaults to 1.0.

    Returns:
        Tuple ``(dv, model)``: the fitted ``DictVectorizer`` and the fitted
        ``LogisticRegression``.
    """
    # 'records' is the supported name for the row-wise orientation; the old
    # 'rows' alias is deprecated and rejected by pandas >= 2.0.
    records = df.to_dict(orient='records')

    dv = DictVectorizer(sparse=False)
    dv.fit(records)

    X = dv.transform(records)

    model = LogisticRegression(solver='liblinear', C=C)
    model.fit(X, y)

    return dv, model


def predict(df, dv, model):
    """Return the probability of the positive class for every row of ``df``.

    Args:
        df: pandas DataFrame of features; every row is turned into a
            ``{column: value}`` dict before vectorization.
        dv: Fitted vectorizer exposing ``transform(list_of_dicts)``.
        model: Fitted classifier exposing ``predict_proba(X)``.

    Returns:
        Column 1 of ``model.predict_proba(X)``, one value per input row.
    """
    # 'records' is the supported name for the row-wise orientation; the old
    # 'rows' alias is deprecated and rejected by pandas >= 2.0.
    records = df.to_dict(orient='records')

    X = dv.transform(records)

    return model.predict_proba(X)[:, 1]

In [44]:
# Fit on the training split, score the held-out split, and report ROC AUC to three decimals.
dv, model = train(df_train, y_train, C=1.0)
y_pred = predict(df_test, dv, model)

auc = roc_auc_score(y_test, y_pred)
print('auc = %.3f' % auc)

auc = 0.996


  cat = df.to_dict(orient='rows')
  cat = df.to_dict(orient='rows')


In [46]:
# Turn probabilities into a hard decision at a 0.5 cut-off, then compute the
# fraction of test rows where the decision equals the label (accuracy).
card = y_pred >= 0.5
(card == y_test).mean()

0.9886363636363636

In [48]:

# Persist the fitted vectorizer and model together as a single pickled (dv, model) tuple.
with open('card-model.bin', 'wb') as f_out:
    pickle.dump((dv, model), f_out)

In [23]:
# NOTE(review): this repeats the earlier `predict` definition — presumably so the
# load-and-score cells can run without the training cells; confirm and keep the two in sync.
def predict(df, dv, model):
    """Return the probability of the positive class for every row of ``df``.

    Args:
        df: pandas DataFrame of features; every row is turned into a
            ``{column: value}`` dict before vectorization.
        dv: Fitted vectorizer exposing ``transform(list_of_dicts)``.
        model: Fitted classifier exposing ``predict_proba(X)``.

    Returns:
        Column 1 of ``model.predict_proba(X)``, one value per input row.
    """
    # 'records' is the supported name for the row-wise orientation; the old
    # 'rows' alias is deprecated and rejected by pandas >= 2.0.
    records = df.to_dict(orient='records')

    X = dv.transform(records)

    return model.predict_proba(X)[:, 1]


# Read the pickled (dv, model) tuple back from 'card-model.bin'.
# NOTE(review): pickle.load can execute arbitrary code — only load files you produced yourself.
with open('card-model.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [26]:
# Re-score the test split with the reloaded dv/model and recompute accuracy at the 0.5 cut-off.
prediction = predict(df_test, dv, model)

card = prediction >= 0.5
(card == y_test).mean()

  cat = df.to_dict(orient='rows')


0.9886363636363636

# Testing the Web Service


In [18]:
df_test.head()

Unnamed: 0,reports,age,income,share,expenditure,owner,selfemp,dependents,months,majorcards,active
288,0,22.33333,1.8,0.000667,0.0,no,no,0,3,0,6
190,0,24.16667,2.8896,0.080178,193.0675,no,no,0,12,0,0
852,0,24.5,2.304,0.374743,719.5059,no,no,0,76,1,2
596,0,40.41667,3.115,0.076759,199.0042,yes,no,1,60,1,11
186,0,30.25,2.55,0.035322,74.72667,yes,no,1,11,1,0


In [34]:
# Serialize the first test row to a JSON string.
# NOTE(review): this rebinds `df` — from here on it is a str, not the DataFrame loaded earlier.
df = df_test.iloc[0, :].to_json()

In [48]:
# Parse the JSON string back into a plain Python object (`df` is rebound once more).
# Requires `json` to be imported in the notebook's import cell.
df = json.loads(df)

{'card': False, 'card_probability': 0.18948898745040224}