In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection, utils
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Clasificador de regresión logística

## Predicción de *creditability*

### Análisis del dataset

In [None]:
# Load the German credit dataset; the first CSV row holds the column names.
data_df = pd.read_csv(
        "./german_credit.csv", header=0, sep=',')

# Relabel the target codes (0 -> "No", 1 -> "Yes") so the chart shows
# readable categories instead of raw integers.
display_data = data_df[["Creditability"]].replace(
    {0: "No", 1: "Yes"})

# Bar chart of how many rows fall in each class. The top-level
# `pd.value_counts` helper is deprecated in recent pandas releases, so the
# equivalent Series method is applied to the column instead.
display_data.apply(pd.Series.value_counts).T.plot(kind='bar', rot=0)


### Resultados del clasificador

In [None]:

# Target is 'Creditability'; every remaining column is used as a feature.
y = data_df['Creditability']
X = data_df.drop(columns='Creditability')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

# fit() returns the estimator itself, so construction and training are chained.
classifier = LogisticRegression(max_iter=1000, random_state=0).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# One horizontal bar per input column showing its fitted coefficient.
# NOTE(review): the features are not rescaled before fitting, so coefficient
# magnitudes may not be directly comparable across columns.
coefs_by_col = pd.DataFrame({'Coef': classifier.coef_[0], 'Attribute': X.columns})
coefs_by_col.plot(kind='barh', x='Attribute', y='Coef', figsize=(20, 8))
plt.show()

# Confusion matrix of the held-out labels against the predictions, drawn as
# an annotated heatmap with integer cell counts.
matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')

## Predicción de género (mujer: 0, hombre: 1)

### Análisis del dataset

In [None]:
# Reload the CSV so this section starts from the file's original contents.
data_df = pd.read_csv(
        "./german_credit.csv", header=0, sep=',')

# Collapse "Sex & Marital Status" into a gender label (female: 0, male: 1):
# codes 2 and 3 become 1, code 4 becomes 0, and any other code is left as is.
# NOTE(review): presumably code 1 is already a male category in this CSV —
# confirm against the dataset's data dictionary.
data_df["Sex & Marital Status"] = data_df["Sex & Marital Status"].replace(
    {2: 1, 3: 1, 4: 0})

# Human-readable labels for the chart only; data_df keeps the numeric codes.
display_data = data_df[["Sex & Marital Status"]].replace(
    {0: "Mujer", 1: "Hombre"})

# Bar chart of how many rows fall in each class. The top-level
# `pd.value_counts` helper is deprecated in recent pandas releases, so the
# equivalent Series method is applied to the column instead.
display_data.apply(pd.Series.value_counts).T.plot(kind='bar', rot=0)

### Resultados del clasificador

In [None]:
# Target is the recoded 'Sex & Marital Status' column; every other column
# (including 'Creditability') is used as a feature.
y = data_df['Sex & Marital Status']
X = data_df.drop(columns='Sex & Marital Status')

# Hold out 40% of the rows for evaluation; the fixed seed keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=1,
)

# fit() returns the estimator itself, so construction and training are chained.
classifier = LogisticRegression(max_iter=1000, random_state=0).fit(X_train, y_train)
y_pred = classifier.predict(X_test)

# One horizontal bar per input column showing its fitted coefficient.
# NOTE(review): the features are not rescaled before fitting, so coefficient
# magnitudes may not be directly comparable across columns.
coefs_by_col = pd.DataFrame({'Coef': classifier.coef_[0], 'Attribute': X.columns})
coefs_by_col.plot(kind='barh', x='Attribute', y='Coef', figsize=(20, 8))
plt.show()

# Confusion matrix of the held-out labels against the predictions, drawn as
# an annotated heatmap with integer cell counts.
matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')

## *Creditability* solo teniendo en cuenta la columna *Sex & Marital Status*

In [None]:
# Reload the CSV so this section starts from the file's original contents;
# 'Sex & Marital Status' therefore keeps the codes stored in the file.
data_df = pd.read_csv(
        "./german_credit.csv", header=0, sep=',')

# Keep only the target and the single predictor. The axis is given through the
# `columns=` keyword because pandas 2.0 no longer accepts it positionally
# (`drop(labels, 1)` raises a TypeError there).
data_df = data_df.drop(
    columns=data_df.columns.difference(['Creditability', 'Sex & Marital Status']))

# Target is 'Creditability'; the only remaining feature is 'Sex & Marital Status'.
X = data_df.drop('Creditability', axis=1)
y = data_df['Creditability']

# Hold out 40% of the rows for evaluation; the fixed seed keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.4, random_state=1)

classifier = LogisticRegression(random_state = 0, max_iter = 1000)
classifier.fit(X_train, y_train)

y_pred = classifier.predict(X_test)

# Horizontal bar chart of the fitted coefficient(s), one bar per input column.
coefs_by_col = pd.DataFrame({'Coef': classifier.coef_[0], 'Attribute': X.columns})
coefs_by_col.plot.barh(x='Attribute', y='Coef', figsize=(20, 8))
plt.show()

# Confusion matrix of the held-out labels against the predictions, drawn as
# an annotated heatmap with integer cell counts.
matrix = confusion_matrix(y_test, y_pred)
sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')