In [None]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, roc_curve
import requests
from io import StringIO

# URL of the raw data file
url_data = 'https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data'

# Column names, in file order (the file is read with header=None, so these are
# assigned positionally); 'class' is the target variable
columns = ["Status_of_existing_checking_account", "Duration_in_month", "Credit_history", "Purpose", "Credit_amount",
           "Savings_account/bonds", "Present_employment_since", "Installment_rate_in_percentage_of_disposable_income",
           "Personal_status_and_sex", "Other_debtors/guarantors", "Present_residence_since", "Property",
           "Age_in_years", "Other_installment_plans", "Housing", "Number_of_existing_credits_at_this_bank",
           "Job", "Number_of_people_being_liable_to_provide_maintenance_for", "Telephone", "foreign_worker", "class"]

# Download the data.
# - timeout: without it a stalled connection blocks the cell indefinitely.
# - raise_for_status(): without it an HTTP error page (404/500) would be handed
#   to read_csv and parsed as if it were data.
response = requests.get(url_data, timeout=30)
response.raise_for_status()
data = StringIO(response.text)

# Load the space-separated records into a DataFrame
df = pd.read_csv(data, sep=' ', header=None, names=columns)

# Map the target variable from {1, 2} to {0, 1}
df['class'] = df['class'].map({1: 0, 2: 1})

# Series.map() turns every value missing from the mapping into NaN; fail loudly
# here instead of letting NaN targets reach the model.
if df['class'].isna().any():
    raise ValueError("Unexpected label in 'class' column: expected only 1 or 2")


In [None]:

# Inspect the first rows of the DataFrame
print(df.head())

# Statistical summary
print(df.describe())

# Count missing values per column
print(df.isnull().sum())

# Distribution of the target variable
plt.figure(figsize=(6,4))
sns.countplot(x='class', data=df)
plt.title('Distribuição da Variável Alvo')
plt.show()

# Histograms of the numeric variables
df.hist(bins=20, figsize=(14,10), layout=(5,5))
plt.show()

# Correlation matrix.
# Restrict to numeric columns explicitly: the frame still contains string-coded
# columns at this point, and since pandas 2.0 DataFrame.corr() no longer drops
# them silently (numeric_only defaults to False) but raises ValueError.
numeric_corr = df.select_dtypes(include=[np.number]).corr()
plt.figure(figsize=(12,8))
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm')
plt.title('Matriz de Correlação')
plt.show()


In [None]:

# Split the feature columns into numeric and categorical.
# - The target 'class' is numeric after the 0/1 mapping, so it is removed
#   explicitly to keep it out of the feature list.
# - Categorical columns are taken as "everything that is not numeric" rather than
#   "object dtype", so the selection does not depend on which string dtype pandas
#   inferred when reading the file.
numerical_features = [
    col for col in df.select_dtypes(include=[np.number]).columns if col != 'class'
]
categorical_features = [
    col for col in df.select_dtypes(exclude=[np.number]).columns if col != 'class'
]

# One-hot encode the categorical variables; drop_first removes one level per
# variable so the dummy columns are not perfectly collinear
df_encoded = pd.get_dummies(df, columns=categorical_features, drop_first=True)

# Separate features (X) and target (y)
X_encoded = df_encoded.drop(columns='class')
y_encoded = df_encoded['class']

# Split into train and test sets.
# stratify keeps the class proportions the same in both splits, so the
# evaluation set is not skewed by an unlucky draw.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, y_encoded, test_size=0.3, random_state=42, stratify=y_encoded
)

# Standardize the features: the scaler is fitted on the training set only and
# then applied to the test set, so no test-set statistics leak into training
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:

# Fit a logistic regression classifier on the standardized training features
model = LogisticRegression(random_state=42)
model.fit(X_train_scaled, y_train)

# Test-set predictions: hard labels, plus the probability of class 1
# (second column of predict_proba)
y_pred = model.predict(X_test_scaled)
y_prob = model.predict_proba(X_test_scaled)[:, 1]

# Model evaluation: per-class report followed by the confusion matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# ROC curve, with the AUC shown in the legend
fpr, tpr, thresholds = roc_curve(y_test, y_prob)
auc_value = roc_auc_score(y_test, y_prob)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % auc_value)
ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC)')
ax.legend(loc="lower right")
plt.show()
