In [265]:
import pandas as pd
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection, preprocessing
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

In [266]:
df = pd.read_csv("data/Credit.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance
0,1,14.891,3606,283,2,34,11,Male,No,Yes,Caucasian,333
1,2,106.025,6645,483,3,82,15,Female,Yes,Yes,Asian,903
2,3,104.593,7075,514,4,71,11,Male,No,No,Asian,580
3,4,148.924,9504,681,3,36,11,Female,No,No,Asian,964
4,5,55.882,4897,357,2,68,16,Male,No,Yes,Caucasian,331


In [267]:
df.drop('Unnamed: 0', axis=1, inplace=True)
df.head()

Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance
0,14.891,3606,283,2,34,11,Male,No,Yes,Caucasian,333
1,106.025,6645,483,3,82,15,Female,Yes,Yes,Asian,903
2,104.593,7075,514,4,71,11,Male,No,No,Asian,580
3,148.924,9504,681,3,36,11,Female,No,No,Asian,964
4,55.882,4897,357,2,68,16,Male,No,Yes,Caucasian,331


In [268]:
df.dtypes

Income       float64
Limit          int64
Rating         int64
Cards          int64
Age            int64
Education      int64
Gender        object
Student       object
Married       object
Ethnicity     object
Balance        int64
dtype: object

In [269]:
df = df.astype({ 'Gender': 'category', 'Student': 'category', 'Married': 'category', 'Ethnicity': 'category' })
df.dtypes

Income        float64
Limit           int64
Rating          int64
Cards           int64
Age             int64
Education       int64
Gender       category
Student      category
Married      category
Ethnicity    category
Balance         int64
dtype: object

---

**Logistic regression**

In [270]:
# (hint: create new indicator variable)
df['GreatIncome'] = df.apply(lambda row: 1 if row['Income'] > 50 else 0, axis=1)

In [271]:
def get_model(df_, formula, model = LogisticRegression(), prnt = True):
    y, x = dmatrices(formula, data=df_, return_type='dataframe')
    Y = y.values.ravel()
    # X = x.values
    X = preprocessing.StandardScaler().fit(x.values).transform(x.values)

    # Cross-validation code
    seed = 1
    scoring = 'accuracy'
    kfold = model_selection.KFold(n_splits=10, shuffle=True, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)

    if prnt:    
        msg = "mean: %f       std: %f" % (cv_results.mean(), cv_results.std())
        print(msg)

    return cv_results.mean(), cv_results.std()

In [272]:
formula_a = "GreatIncome ~ Limit + Rating + Cards + Age + Education + Gender + Student + Married + Ethnicity + Balance"
formula_b = "Cards ~ Income + Limit + Rating + Age + Education + Gender + Student + Married + Ethnicity + Balance"
formula_a2 = "GreatIncome ~ Limit + Student + Balance"

In [273]:
get_model(df, formula_a)

mean: 0.942500       std: 0.047500


(0.9425000000000001, 0.0475)

In [274]:
get_model(df, formula_b)

mean: 0.225000       std: 0.061237


(0.225, 0.06123724356957945)

---
KNN

In [275]:
mean_a = 0
mean_b = 0
std_a = -1
std_b = -1
i_a = 0
i_b = 0

for i in range(1, 30):
    model_ = KNeighborsClassifier(n_neighbors=i)
    _mean_a, _std_a = get_model(df, formula_a2, model_, False)
    _mean_b, _std_b = get_model(df, formula_b, model_, False)
    if std_a < 0 or std_a > _std_a:
        mean_a = _mean_a
        std_a = _std_a
        i_a = i
    if std_b < 0 or std_b > _std_b:
        mean_b = _mean_b
        std_b = _std_b
        i_b = i

print(i_a, mean_a, std_a)            
print(i_b, mean_b, std_b)            

5 0.9324999999999999 0.02249999999999997
9 0.25000000000000006 0.04031128874149276


---
DT

In [276]:
mean_a = 0
mean_b = 0
std_a = -1
std_b = -1
i_a = 0
i_b = 0

for i in range(1, 12):
    model_ = DecisionTreeClassifier(max_depth=i)
    _mean_a, _std_a = get_model(df, formula_a, model_, False)
    _mean_b, _std_b = get_model(df, formula_b, model_, False)
    if std_a < 0 or std_a > _std_a:
        mean_a = _mean_a
        std_a = _std_a
        i_a = i
    if std_b < 0 or std_b > _std_b:
        mean_b = _mean_b
        std_b = _std_b
        i_b = i

print(i_a, mean_a, std_a)            
print(i_b, mean_b, std_b) 

11 0.9 0.022360679774997894
7 0.265 0.04499999999999999
