In [166]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import pickle

In [76]:
df=pd.read_csv('german_credit_data.csv')

In [77]:
df.head()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,0,67,male,2,own,,little,1169,6,radio/TV,good
1,1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,2,49,male,1,own,little,,2096,12,education,good
3,3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,4,53,male,2,free,little,little,4870,24,car,bad


In [78]:
df.tail()

Unnamed: 0.1,Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
995,995,31,female,1,own,little,,1736,12,furniture/equipment,good
996,996,40,male,3,own,little,little,3857,30,car,good
997,997,38,male,2,own,little,,804,12,radio/TV,good
998,998,23,male,2,free,little,little,1845,45,radio/TV,bad
999,999,27,male,2,own,moderate,moderate,4576,45,car,good


In [79]:
df['Risk'].value_counts()

Risk
good    700
bad     300
Name: count, dtype: int64

In [80]:
df.isna().sum()

Unnamed: 0            0
Age                   0
Sex                   0
Job                   0
Housing               0
Saving accounts     183
Checking account    394
Credit amount         0
Duration              0
Purpose               0
Risk                  0
dtype: int64

In [81]:
df.fillna('Unknown',inplace=True)

In [82]:
df.isna().sum()

Unnamed: 0          0
Age                 0
Sex                 0
Job                 0
Housing             0
Saving accounts     0
Checking account    0
Credit amount       0
Duration            0
Purpose             0
Risk                0
dtype: int64

In [83]:
df.shape

(1000, 11)

In [84]:
for i in df.columns:
    print(f'No of unique value in {i}: {df[i].nunique()}')

No of unique value in Unnamed: 0: 1000
No of unique value in Age: 53
No of unique value in Sex: 2
No of unique value in Job: 4
No of unique value in Housing: 3
No of unique value in Saving accounts: 5
No of unique value in Checking account: 4
No of unique value in Credit amount: 921
No of unique value in Duration: 33
No of unique value in Purpose: 8
No of unique value in Risk: 2


In [85]:
df.drop(columns=['Unnamed: 0'],inplace=True)

In [86]:
df.columns

Index(['Age', 'Sex', 'Job', 'Housing', 'Saving accounts', 'Checking account',
       'Credit amount', 'Duration', 'Purpose', 'Risk'],
      dtype='object')

In [87]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Age               1000 non-null   int64 
 1   Sex               1000 non-null   object
 2   Job               1000 non-null   int64 
 3   Housing           1000 non-null   object
 4   Saving accounts   1000 non-null   object
 5   Checking account  1000 non-null   object
 6   Credit amount     1000 non-null   int64 
 7   Duration          1000 non-null   int64 
 8   Purpose           1000 non-null   object
 9   Risk              1000 non-null   object
dtypes: int64(4), object(6)
memory usage: 78.3+ KB


In [142]:
df.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,Unknown,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,Unknown,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad


In [143]:
x=df.drop(columns=['Risk'])
y=df['Risk']

In [144]:
le_y=LabelEncoder()
y_encoded=le_y.fit_transform(y)

In [145]:
x_trian,x_test,y_train,y_test=train_test_split(x,y_encoded ,test_size=0.3,random_state=100)

In [146]:
numeric_col=df.select_dtypes(include=['number']).drop(columns=['Job']).columns
ohe_col=['Purpose']
Ord_col=df.select_dtypes(include=['category','object']).drop(columns=['Risk']).columns
print(numeric_col,ohe_col,Ord_col)

Index(['Age', 'Credit amount', 'Duration'], dtype='object') ['Purpose'] Index(['Sex', 'Housing', 'Saving accounts', 'Checking account', 'Purpose'], dtype='object')


In [147]:
preprocessor=ColumnTransformer([
    ('num', StandardScaler(), numeric_col),
    ('Onehot', OneHotEncoder(handle_unknown='ignore',sparse_output=False), ohe_col),
    ('Label', OrdinalEncoder(handle_unknown='use_encoded_value',unknown_value=-1),Ord_col)
])

In [160]:
log_reg_pipeline=Pipeline([
    ('Preprocessor', preprocessor),
    ('pca', PCA(n_components=5)),
    ('classifier', LogisticRegression())
])

In [161]:
log_reg_pipeline.fit(x_trian,y_train)

In [162]:
log_reg_pipeline.score(x_trian,y_train)

0.7128571428571429

In [163]:
y_pred=log_reg_pipeline.predict(x_test)

In [164]:
metrics.accuracy_score(y_test,y_pred)

0.6833333333333333

In [165]:
pipeline_dict={
    'Pipeline': log_reg_pipeline,
    'Label_Encoder_y':le_y
}

In [167]:
with open('logreg_pipeline.pkl','wb') as f:
    pickle.dump(pipeline_dict,f)
    