In [1]:
import pandas as pd

In [35]:
data = pd.read_csv('datasets/german_credit_data_dataset.csv')
data.head()

Unnamed: 0,checking_account_status,duration,credit_history,purpose,credit_amount,savings,present_employment,installment_rate,personal,other_debtors,...,property,age,other_installment_plans,housing,existing_credits,job,dependents,telephone,foreign_worker,customer_type
0,A11,6,A34,A43,1169.0,A65,A75,4.0,A93,A101,...,A121,67.0,A143,A152,2.0,A173,1,A192,A201,1
1,A12,48,A32,A43,5951.0,A61,A73,2.0,A92,A101,...,A121,22.0,A143,A152,1.0,A173,1,A191,A201,2
2,A14,12,A34,A46,2096.0,A61,A74,2.0,A93,A101,...,A121,49.0,A143,A152,1.0,A172,2,A191,A201,1
3,A11,42,A32,A42,7882.0,A61,A74,2.0,A93,A103,...,A122,45.0,A143,A153,1.0,A173,2,A191,A201,1
4,A11,24,A33,A40,4870.0,A61,A73,3.0,A93,A101,...,A124,53.0,A143,A153,2.0,A173,2,A191,A201,2


##### Customer type is our target column : 
##### 1 : Creditworthy , 2 : Not Creditworthy

In [27]:
data.shape

(1000, 21)

In [28]:
data.columns

Index(['checking_account_status', 'duration', 'credit_history', 'purpose',
       'credit_amount', 'savings', 'present_employment', 'installment_rate',
       'personal', 'other_debtors', 'present_residence', 'property', 'age',
       'other_installment_plans', 'housing', 'existing_credits', 'job',
       'dependents', 'telephone', 'foreign_worker', 'customer_type'],
      dtype='object')

In [36]:
data = data.drop(['telephone', 'personal', 'present_residence', 'other_installment_plans'], axis=1)

In [30]:
data.shape

(1000, 17)

In [31]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
checking_account_status    1000 non-null object
duration                   1000 non-null int64
credit_history             1000 non-null object
purpose                    1000 non-null object
credit_amount              1000 non-null float64
savings                    1000 non-null object
present_employment         1000 non-null object
installment_rate           1000 non-null float64
other_debtors              1000 non-null object
property                   1000 non-null object
age                        1000 non-null float64
housing                    1000 non-null object
existing_credits           1000 non-null float64
job                        1000 non-null object
dependents                 1000 non-null int64
foreign_worker             1000 non-null object
customer_type              1000 non-null int64
dtypes: float64(4), int64(3), object(10)
memory usage: 132.9+ KB


In [37]:
data['savings'].unique()

array(['A65', 'A61', 'A63', 'A64', 'A62'], dtype=object)

In [38]:
from sklearn.preprocessing import LabelEncoder

savings_dict = {"A65" : 0, "A61" : 1, "A62" : 2, "A63" : 3, "A64" : 4}
data['savings'].replace(savings_dict, inplace=True)
data.head()

Unnamed: 0,checking_account_status,duration,credit_history,purpose,credit_amount,savings,present_employment,installment_rate,other_debtors,property,age,housing,existing_credits,job,dependents,foreign_worker,customer_type
0,A11,6,A34,A43,1169.0,0,A75,4.0,A101,A121,67.0,A152,2.0,A173,1,A201,1
1,A12,48,A32,A43,5951.0,1,A73,2.0,A101,A121,22.0,A152,1.0,A173,1,A201,2
2,A14,12,A34,A46,2096.0,1,A74,2.0,A101,A121,49.0,A152,1.0,A172,2,A201,1
3,A11,42,A32,A42,7882.0,1,A74,2.0,A103,A122,45.0,A153,1.0,A173,2,A201,1
4,A11,24,A33,A40,4870.0,1,A73,3.0,A101,A124,53.0,A153,2.0,A173,2,A201,2


In [39]:
data = pd.get_dummies(data, columns=['checking_account_status',
                                     'credit_history',
                                     'purpose',
                                     'present_employment',
                                     'property',
                                     'housing',
                                     'other_debtors',
                                     'job',
                                     'foreign_worker'])
data.shape

(1000, 48)

In [40]:
data.sample(2)

Unnamed: 0,duration,credit_amount,savings,installment_rate,age,existing_credits,dependents,customer_type,checking_account_status_A11,checking_account_status_A12,...,housing_A153,other_debtors_A101,other_debtors_A102,other_debtors_A103,job_A171,job_A172,job_A173,job_A174,foreign_worker_A201,foreign_worker_A202
715,30,7596.0,0,1.0,63.0,2.0,1,1,0,0,...,0,1,0,0,0,0,1,0,1,0
161,18,1055.0,1,4.0,30.0,2.0,1,1,0,0,...,0,1,0,0,0,0,1,0,1,0


In [41]:
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [51]:
def naive_bayes(x_train, y_train):
    print("Gaussian NB")
    clf = GaussianNB()
    clf.fit(x_train, y_train)
    
    return clf

In [57]:
def k_nearest_neighbors(x_train, y_train):
    print("K nearest neighbors")
    clf = KNeighborsClassifier(n_neighbors=9)
    clf.fit(x_train, y_train)
    
    return clf

In [61]:
def svc(x_train, y_train):
    print("SVC")
    clf = SVC(kernel='rbf', gamma='scale')
    clf.fit(x_train, y_train)
    
    return clf

In [52]:
def decision_tree(x_train, y_train):
    print("Decision Tree")
    clf = DecisionTreeClassifier(max_depth=6)
    clf.fit(x_train, y_train)
    
    return clf

In [48]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [50]:
X = data.drop('customer_type', axis=1)
Y = data['customer_type']

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [53]:
def build_and_train_model(x_train, y_train, classification_fnc):
    
    model = classification_fnc(x_train, y_train)
    y_pred = model.predict(x_test)
    
    train_score = model.score(x_train, y_train)
    test_score = accuracy_score(y_test, y_pred)
    
    print("Training Score : ", train_score)
    print("Testing Score : ", test_score)    

In [55]:
build_and_train_model(x_train, y_train, naive_bayes)

Gaussian NB
Training Score :  0.76
Testing Score :  0.74


In [58]:
build_and_train_model(x_train, y_train, k_nearest_neighbors)

K nearest neighbors
Training Score :  0.7275
Testing Score :  0.685


In [62]:
build_and_train_model(x_train, y_train, svc)

SVC
Training Score :  0.71125
Testing Score :  0.7


In [63]:
build_and_train_model(x_train, y_train, decision_tree)

Decision Tree
Training Score :  0.8225
Testing Score :  0.7


In [64]:
x_train_1, x_train_2, y_train_1, y_train_2 = train_test_split(X, Y, test_size=0.5)

In [65]:
rfc = RandomForestClassifier(max_depth=4, n_estimators=2, warm_start=True)

In [66]:
rfc.fit(x_train_1, y_train_1)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=4, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=2,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=True)

In [73]:
y_pred = rfc.predict(x_test)

In [74]:
test_score = accurcy = accuracy_score(y_test, y_pred)
print("Testing Score : ", test_score)

Testing Score :  0.745


In [75]:
rfc.n_estimators += 2
rfc.fit(x_train_2, y_train_2)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=4, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=6,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=True)

In [77]:
test_score = accuracy = accuracy_score(y_test, y_pred)
print("Testing score is : ", test_score)

Testing score is :  0.745
