## Fitting a Logistic Regression Model on Credit Card Data

## Loading, Exploring, and Cleaning the Data

In [1]:
import pandas as pd

In [2]:
# Read the raw CSV: the first line is data rather than a header, and '?' is
# treated as a missing value.
credData = pd.read_csv(
    'crx-data.csv',
    header=None,
    na_values='?',
    sep=',',
)
credData.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [3]:
# Recode the class label held in column 15: '+' becomes 1 and '-' becomes 0.
for symbol, code in (('+', 1), ('-', 0)):
    credData.loc[credData[15] == symbol, 15] = code
credData.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [4]:
# Count the missing values in each column of the raw data.
credData.isna().sum()

0     12
1     12
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13    13
14     0
15     0
dtype: int64

In [5]:
# Report the dimensions of the raw frame and the dtype pandas inferred per column.
shape_msg = 'Shape of raw data set: {}'
dtype_msg = 'Data types of data set: {}'
print(shape_msg.format(credData.shape))
print(dtype_msg.format(credData.dtypes))

Shape of raw data set: (690, 16)
Data types of data set: 0      object
1     float64
2     float64
3      object
4      object
5      object
6      object
7     float64
8      object
9      object
10      int64
11     object
12     object
13    float64
14      int64
15     object
dtype: object


In [6]:
# Keep only the rows that have a value in every column; credData itself is
# left untouched because dropna returns a new frame.
newcred = credData.dropna(axis='index', how='any')
newcred.shape

(653, 16)

In [7]:
# Sanity check: every per-column missing-value count should now be zero.
newcred.isna().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
dtype: int64

In [8]:
# One-hot encode the categorical columns listed below.
# NOTE(review): column 8 is also object-typed (see the dtypes printed earlier)
# but is listed neither here nor among the numeric columns, so it never reaches
# the model -- confirm that this omission is intentional.
categorical_cols = [0, 3, 4, 5, 6, 9, 11, 12]
credCat = pd.get_dummies(newcred[categorical_cols])

In [9]:
# Pull out the numeric (float64 / int64) columns unchanged.
credNum = newcred.loc[:, [1, 2, 7, 10, 13, 14]]

In [10]:
# making X variable: the dummy-encoded categoricals side by side with the
# numeric columns
X = pd.concat([credCat, credNum], axis=1)
# get_dummies names its columns with strings (e.g. '0_a') while the numeric
# columns keep their original integer labels. scikit-learn 1.2+ raises a
# TypeError when a DataFrame mixes string and non-string column names, so make
# every label a string before X is handed to any estimator or transformer.
X.columns = X.columns.astype(str)
print(X.shape)

# making y variable: column 15 holds the 1/0 class labels in an object-typed
# column, so cast it to an integer Series
y = pd.Series(newcred[15], dtype='int')
print(y.shape)

(653, 44)
(653,)


In [11]:
# normalizing the data sets
from sklearn.preprocessing import MinMaxScaler

In [12]:
# Rescale every feature with MinMaxScaler (default output range is [0, 1]).
# fit_transform returns a bare NumPy array, so X's row index and column labels
# are passed back in; otherwise X_tran would get a fresh 0..n-1 index that no
# longer matches y, whose index still carries the row labels left after dropna.
# Column labels are cast to str so downstream estimators see uniform names.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so the test rows influence the min/max used for scaling -- consider fitting
# on the training split only and applying transform() to the test split.
minmaxScaler = MinMaxScaler()
X_tran = pd.DataFrame(
    minmaxScaler.fit_transform(X),
    index=X.index,
    columns=X.columns.astype(str),
)

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_tran,
    y,
    test_size=0.3,
    random_state=123,
)

In [15]:
from sklearn.linear_model import LogisticRegression

In [16]:
# Benchmark model: logistic regression with library-default settings, trained
# on the training split (fit returns the fitted estimator itself).
benchModel = LogisticRegression().fit(X_train, y_train)

# Predicted class labels for the held-out test rows.
bench_pred = benchModel.predict(X_test)

In [17]:
from sklearn.metrics import classification_report, confusion_matrix

In [18]:
# Evaluate the benchmark predictions against the true test labels.
# Confusion matrix: rows are the true classes, columns the predicted classes.
bench_conf_matrix = confusion_matrix(y_test, bench_pred)
# Per-class precision / recall / F1 plus overall accuracy, as formatted text.
bench_class_report = classification_report(y_test, bench_pred)

# Print the matrix first and the report second.
for bench_summary in (bench_conf_matrix, bench_class_report):
    print(bench_summary)

[[89 18]
 [28 61]]
              precision    recall  f1-score   support

           0       0.76      0.83      0.79       107
           1       0.77      0.69      0.73        89

    accuracy                           0.77       196
   macro avg       0.77      0.76      0.76       196
weighted avg       0.77      0.77      0.76       196

