# Contexto

Trabajas en la división de tarjetas de crédito de su banco. El jefe de operaciones de la empresa ha solicitado su ayuda para determinar si un cliente es solvente o no. Se le han proporcionado datos de las operaciones con tarjeta de crédito.

Este conjunto de datos contiene solicitudes de tarjetas de crédito con alrededor de 15 variables. Las variables son una combinación de datos continuos y categóricos relacionados con dichas solicitudes. La etiqueta para el conjunto de datos es una bandera, que indica si la solicitud ha sido aprobado o no.

Desea ajustar algunos modelos de referencia y probar algunos métodos de aprendizaje en ensamble en el conjunto de datos para abordar el problema y crear una herramienta para predecir si un cliente determinado debe o no ser aprobado para su solicitud de crédito.

## Loading, Exploring, and Cleaning the Data

In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [6]:
#Loading data from the Github repository to colab notebook
filename = 'Datos/crx.data'

In [7]:
# Loading the data using pandas
credData = pd.read_csv(filename,sep=",",header = None,na_values = "?")
credData.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [9]:
credData.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,f,g,260.0,0,-
686,a,22.67,0.75,u,g,c,v,2.0,f,t,2,t,g,200.0,394,-
687,a,25.25,13.5,y,p,ff,ff,2.0,f,t,1,t,g,200.0,1,-
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,f,g,280.0,750,-
689,b,35.0,3.375,u,g,c,h,8.29,f,f,0,t,g,0.0,0,-


In [10]:
# Changing the Classes to 1 & 0
credData.loc[credData[15] == '+' , 15] = 1
credData.loc[credData[15] == '-' , 15] = 0
credData.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [11]:
# Finding number of null values in the data set
credData.isnull().sum()

0     12
1     12
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13    13
14     0
15     0
dtype: int64

In [12]:
# Printing Shape and data types
print('Shape of raw data set',credData.shape)
print('Data types of data set',credData.dtypes)

Shape of raw data set (690, 16)
Data types of data set 0      object
1     float64
2     float64
3      object
4      object
5      object
6      object
7     float64
8      object
9      object
10      int64
11     object
12     object
13    float64
14      int64
15     object
dtype: object


In [13]:
# Dropping all the rows with na values
newcred = credData.dropna(axis = 0)
newcred.shape

(653, 16)

In [14]:
# Verifying no null values exist
newcred.isnull().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
dtype: int64

In [15]:
# Seperating the categorical variables to make dummy variables
credCat = pd.get_dummies(newcred[[0,3,4,5,6,8,9,11,12]])
credCat.head()

Unnamed: 0,0_a,0_b,3_l,3_u,3_y,4_g,4_gg,4_p,5_aa,5_c,...,6_z,8_f,8_t,9_f,9_t,11_f,11_t,12_g,12_p,12_s
0,0,1,0,1,0,1,0,0,0,0,...,0,0,1,0,1,1,0,1,0,0
1,1,0,0,1,0,1,0,0,0,0,...,0,0,1,0,1,1,0,1,0,0
2,1,0,0,1,0,1,0,0,0,0,...,0,0,1,1,0,1,0,1,0,0
3,0,1,0,1,0,1,0,0,0,0,...,0,0,1,0,1,0,1,1,0,0
4,0,1,0,1,0,1,0,0,0,0,...,0,0,1,1,0,1,0,0,0,1


In [18]:
# Seperating the numerical variables
credNum = newcred[[1,2,7,10,13,14]]
credNum.head()

Unnamed: 0,1,2,7,10,13,14
0,30.83,0.0,1.25,1,202.0,0
1,58.67,4.46,3.04,6,43.0,560
2,24.5,0.5,1.5,0,280.0,824
3,27.83,1.54,3.75,5,100.0,3
4,20.17,5.625,1.71,0,120.0,0


In [20]:
# Making the X variable which is a concatenation of categorical and numerical data
X = pd.concat([credCat,credNum],axis = 1)
print(X.shape)

# Seperating the label as y variable
y = pd.Series(newcred[15], dtype="int")
print(y.shape)

(653, 46)
(653,)


In [21]:
# Normalising the data sets
# Import library function
from sklearn import preprocessing
# Creating the scaling function
minmaxScaler = preprocessing.MinMaxScaler()
# Transforming with the scaler function
X_tran = pd.DataFrame(minmaxScaler.fit_transform(X))

In [22]:
from sklearn.model_selection import train_test_split

# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_tran, y, test_size=0.3, random_state=123)

## Regresión logistica

In [23]:
from sklearn.linear_model import LogisticRegression
logisticModel = LogisticRegression()
logisticModel.fit(X_train, y_train)

In [24]:
pred = logisticModel.predict(X_test)
print('Accuaracy of Logistic regression model prediction on test set: {:.2f}'.format(logisticModel.score(X_test, y_test)))

Accuaracy of Logistic regression model prediction on test set: 0.89


In [25]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

print(confusion_matrix(y_test, pred))

print(classification_report(y_test, pred))

[[93 14]
 [ 8 81]]
              precision    recall  f1-score   support

           0       0.92      0.87      0.89       107
           1       0.85      0.91      0.88        89

    accuracy                           0.89       196
   macro avg       0.89      0.89      0.89       196
weighted avg       0.89      0.89      0.89       196



## Modelo de Ensamble usando la técnica de promedio

In [39]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

model1 = LogisticRegression(random_state=123)
model2 = KNeighborsClassifier(n_neighbors=5)
model3 = RandomForestClassifier(n_estimators=500)

In [40]:
model1.fit(X_train,y_train)
model2.fit(X_train,y_train)
model3.fit(X_train,y_train)

In [41]:
pred1 = model1.predict(X_test)
pred1.shape

(196,)

In [42]:
pred1

array([0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1])

In [43]:
pred1 = model1.predict_proba(X_test)
pred1[0:4,:]

array([[0.9495075 , 0.0504925 ],
       [0.92788434, 0.07211566],
       [0.0827242 , 0.9172758 ],
       [0.14450464, 0.85549536]])

In [44]:
pred2 = model2.predict_proba(X_test)
pred3 = model3.predict_proba(X_test)

In [45]:
ensembledpred = (pred1 + pred2 + pred3)/3

In [36]:
ensembledpred[0:4,:]

array([[0.8965025 , 0.1034975 ],
       [0.96129478, 0.03870522],
       [0.17890807, 0.82109193],
       [0.05683488, 0.94316512]])

In [37]:
import numpy as np
pred = np.argmax(ensembledpred, axis=1)
pred

array([0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1])

In [38]:

print(confusion_matrix(y_test, pred))

print(classification_report(y_test, pred))

[[96 11]
 [ 8 81]]
              precision    recall  f1-score   support

           0       0.92      0.90      0.91       107
           1       0.88      0.91      0.90        89

    accuracy                           0.90       196
   macro avg       0.90      0.90      0.90       196
weighted avg       0.90      0.90      0.90       196



del 0.89, ahora tenemos 0.91 usando el promedio.

**Agregamos pesos**

In [53]:
ensembledpred = (pred1 * 0.6 + pred2 * 0.20  + pred3 * 0.20)

In [47]:
ensembledpred[0:4,:]

array([[0.9217045 , 0.0782955 ],
       [0.9475306 , 0.0524694 ],
       [0.14283452, 0.85716548],
       [0.09030278, 0.90969722]])

In [54]:
pred = np.argmax(ensembledpred, axis=1)
pred

array([0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1])

In [55]:
print(confusion_matrix(y_test, pred))

print(classification_report(y_test, pred))

[[94 13]
 [ 8 81]]
              precision    recall  f1-score   support

           0       0.92      0.88      0.90       107
           1       0.86      0.91      0.89        89

    accuracy                           0.89       196
   macro avg       0.89      0.89      0.89       196
weighted avg       0.89      0.89      0.89       196



In [56]:
# Defining the Voting classifier and three individual learners
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Defining the models
model1 = LogisticRegression(random_state=123)
model2 = KNeighborsClassifier(n_neighbors=5)
model3 = RandomForestClassifier(n_estimators=500)

In [57]:
# Defining the ensemble model using VotingClassifier
model = VotingClassifier(estimators=[('lr', model1),('knn', model2),('rf',model3)], voting='hard')

In [58]:
model.fit(X_train,y_train)

In [59]:
model.score(X_test,y_test)

0.9030612244897959

In [60]:
preds = model.predict(X_test)

In [61]:
print(confusion_matrix(y_test, pred))

print(classification_report(y_test, pred))

[[94 13]
 [ 8 81]]
              precision    recall  f1-score   support

           0       0.92      0.88      0.90       107
           1       0.86      0.91      0.89        89

    accuracy                           0.89       196
   macro avg       0.89      0.89      0.89       196
weighted avg       0.89      0.89      0.89       196



## Ensamble con Boosting

In [62]:
from sklearn.ensemble import AdaBoostClassifier

bmodel=RandomForestClassifier(random_state=123)

bosting = AdaBoostClassifier(base_estimator=bmodel, n_estimators=300)

In [70]:
model = bosting.fit(X_train, y_train)

In [71]:
pred = bosting.predict(X_test)

In [72]:
print(confusion_matrix(y_test, pred))

print(classification_report(y_test, pred))

[[95 12]
 [ 7 82]]
              precision    recall  f1-score   support

           0       0.93      0.89      0.91       107
           1       0.87      0.92      0.90        89

    accuracy                           0.90       196
   macro avg       0.90      0.90      0.90       196
weighted avg       0.90      0.90      0.90       196



## Stacking

In [73]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

bl1 = KNeighborsClassifier(n_neighbors=5)
bl2 = LogisticRegression(random_state=123) 
ml = RandomForestClassifier(random_state=123)

In [74]:
from mlxtend.classifier import StackingClassifier

model_stack = StackingClassifier(classifiers=[bl1, bl2], meta_classifier=ml)

In [75]:
model = model_stack.fit(X_train, y_train)

In [77]:
pred = model.predict(X_test)

In [78]:
print(confusion_matrix(y_test, pred))

print(classification_report(y_test, pred))

[[99  8]
 [18 71]]
              precision    recall  f1-score   support

           0       0.85      0.93      0.88       107
           1       0.90      0.80      0.85        89

    accuracy                           0.87       196
   macro avg       0.87      0.86      0.86       196
weighted avg       0.87      0.87      0.87       196

