# Titanic con Naïve Bayes Gaussiano (comparación con regresión logística)

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB  # Para Naive Bayes Gaussiano
from sklearn.preprocessing import StandardScaler

In [None]:
# Location of the Titanic CSV; change this to a local path if you keep your own copy.
TITANIC_CSV_URL = 'https://raw.githubusercontent.com/datasciencedojo/datasets/master/titanic.csv'

# Read the CSV into the working DataFrame used by every later cell.
df = pd.read_csv(TITANIC_CSV_URL)

In [None]:
# Preview the first five rows of the freshly loaded data.
df.head(n=5)

In [None]:
# Summary statistics of the DataFrame, shown as the cell output.
df.describe()

In [None]:
# Per-column count of missing entries: flag each cell as missing or not,
# then sum the flags column by column.
missing_values = df.isna().sum()

# Print the resulting column -> count listing.
print(missing_values)

In [None]:
# Correlation matrix of the numeric columns only.
# At this point `df` still holds text columns, and on pandas >= 2.0
# `DataFrame.corr()` raises on non-numeric data instead of dropping it, so the
# frame is restricted to numeric dtypes first.
correlation_matrix = df.select_dtypes(include='number').corr()

# Draw the matrix as an annotated heatmap on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title("Matriz de Correlación del Titanic")
plt.show()

In [None]:
# Number of rows for each value of the 'Survived' target.
survived_counts = df['Survived'].value_counts()

print(survived_counts)

# Bar chart of the class counts, drawn on an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=survived_counts.index, y=survived_counts.values, ax=ax)
ax.set_title('Conteo de Sobrevivientes (1) y No Sobrevivientes (0)')
ax.set_xlabel('Estado de Supervivencia')
ax.set_ylabel('Cantidad')
plt.show()

In [None]:
# Remove the columns that are not used as model features.
df = df.drop(columns=['Cabin', 'Ticket', 'Name', 'PassengerId'])

In [None]:
# Check the remaining columns after the drop.
df.head(n=5)

In [None]:
# Impute missing passenger ages with the mean of the 'Age' column.
mean_age = df['Age'].mean()

# Assign the filled Series back to the column. Calling
# `df['Age'].fillna(..., inplace=True)` is chained assignment with an inplace
# method: pandas 2.x warns about it and, under Copy-on-Write, it only updates a
# temporary object and leaves `df` untouched.
# NOTE(review): the mean is computed over every row before cross-validation, so
# held-out folds influence the imputed value; consider imputing inside a Pipeline.
df['Age'] = df['Age'].fillna(mean_age)

In [None]:
# One-hot encode the categorical columns; drop_first=True omits the first level
# of each encoded column so the indicator columns are not redundant.
# (presumably 'Sex' and 'Embarked' are the columns affected — verify with df.dtypes)
df = pd.get_dummies(data=df, drop_first=True)

In [None]:
# Inspect the encoded feature table.
df.head(n=5)

In [None]:
# Target vector: the 'Survived' column.
y = df['Survived']

# Feature matrix: every remaining column.
X = df.drop(columns='Survived')

En el caso de regresión logística obtuvimos:

Matriz de Confusión:
 [[465  84]
 [104 238]]
Accuracy: 0.7890011223344556
Recall: 0.695906432748538
Precision: 0.7391304347826086
F1 Score: 0.716867469879518
Informe de Clasificación:
               precision    recall  f1-score   support

           0       0.82      0.85      0.83       549
           1       0.74      0.70      0.72       342

    accuracy                           0.79       891
   macro avg       0.78      0.77      0.77       891
weighted avg       0.79      0.79      0.79       891

In [None]:
# Evaluate a Gaussian Naive Bayes classifier with cross-validation.
# All sklearn names used here are imported in the notebook's import cell.

# Number of cross-validation folds.
CV_FOLDS = 5

# Build the Naive Bayes model.
model = GaussianNB()

# Out-of-fold predictions: every sample is predicted by a model fitted on the
# folds that do not contain it, so the metrics below cover the whole dataset.
y_pred = cross_val_predict(model, X, y, cv=CV_FOLDS)

# Evaluation metrics computed on the pooled out-of-fold predictions.
confusion = confusion_matrix(y, y_pred)
accuracy = accuracy_score(y, y_pred)
recall = recall_score(y, y_pred)
precision = precision_score(y, y_pred)
f1 = f1_score(y, y_pred)
classification_rep = classification_report(y, y_pred)

# Show the confusion matrix.
print("Matriz de Confusión:\n", confusion)

# Show the scalar metrics.
print("Accuracy:", accuracy)
print("Recall:", recall)
print("Precision:", precision)
print("F1 Score:", f1)

# Show the per-class classification report.
print("Informe de Clasificación:\n", classification_rep)
