In [None]:
import pandas as pd
import numpy as np
import matplotlib.pylab as plt

In [None]:
# Load the data file into a pandas DataFrame.
df = pd.read_csv(filepath_or_buffer='./titanic.csv')

In [None]:
# Show the DataFrame as the cell's output.
df

In [None]:
# Print the DataFrame's plain-text representation.
print(df)

In [None]:
# Get the size of the DataFrame.
df.shape

In [None]:
# Get the columns of the DataFrame.
df.columns

In [None]:
# Get the data types of the DataFrame's columns.
df.dtypes

In [None]:
# Get basic summary information about the data.
df.describe()

In [None]:
# Get descriptions that also cover the non-numeric data.
df.describe(include='all')

In [None]:
# Remove the identifier-like columns from the DataFrame and show the result.
df = df.drop(labels=['PassengerId', 'Name', 'Ticket', 'Cabin'], axis='columns')
df

In [None]:
# Drop every row that has at least one missing value, then show the new size.
df = df.dropna(axis='index', how='any')
df.shape

In [None]:
# Convert categorical values to numeric category codes.
for column in ('Sex', 'Embarked'):
    df[column] = df[column].astype('category').cat.codes
df

In [None]:
# Save the DataFrame to a file, leaving out the index.
df.to_csv(path_or_buf='./titanic_ml.csv', index=False)

In [None]:
# Helper that takes a DataFrame, a test proportion and the name of the class
# column, and produces the four data sets used to train and test a learning
# algorithm.
from sklearn.model_selection import train_test_split

def split_label(df, test_size, label, random_state=None):
    """Split ``df`` into train/test feature frames and label series.

    Parameters
    ----------
    df : pandas.DataFrame
        Data set holding the feature columns and the ``label`` column.
    test_size : float or int
        Forwarded unchanged to ``train_test_split`` as ``test_size``.
    label : str
        Name of the column that holds the class to predict.
    random_state : int, RandomState instance or None, default None
        Forwarded to ``train_test_split``. Pass an int to get the same
        split on every run; the default ``None`` keeps the previous,
        unseeded behaviour.

    Returns
    -------
    tuple
        ``(train_X, train_Y, test_X, test_Y)``: every column except
        ``label`` for the X frames, and only the ``label`` column for the
        Y series.

    Raises
    ------
    KeyError
        If ``label`` is not one of ``df``'s columns.
    """
    train, test = train_test_split(
        df, test_size=test_size, random_state=random_state
    )
    features = df.columns.drop(label)
    # Return independent copies: callers add columns to these frames, and
    # doing that on a slice of `train`/`test` raises SettingWithCopyWarning.
    train_X = train[features].copy()
    train_Y = train[label].copy()
    test_X = test[features].copy()
    test_Y = test[label].copy()
    return train_X, train_Y, test_X, test_Y

In [None]:
# Split the Titanic passenger data set: 0.2 test proportion, 'Survived' as the class.
train_X, train_Y, test_X, test_Y = split_label(df, test_size=0.2, label='Survived')

In [None]:
# Apply one-hot encoding to the "Embarked" column
from sklearn.preprocessing import OneHotEncoder

one = OneHotEncoder(handle_unknown='ignore')

# Fit the encoder on the training split and get a dense indicator matrix.
result = one.fit_transform(train_X['Embarked'].values.reshape(-1, 1)).toarray()
# Work on a copy: a plain assignment would only alias train_X, so adding the
# columns below would modify train_X itself and raise SettingWithCopyWarning.
train_X_1 = train_X.copy()
# NOTE(review): the three column names assume that all three "Embarked"
# categories occur in the training split -- confirm for small splits.
train_X_1[['C', 'Q', 'S']] = pd.DataFrame(result, index=train_X_1.index)
train_X_1

In [None]:
# Rescale every value into the [0, 1] range
from sklearn.preprocessing import MinMaxScaler
min_max = MinMaxScaler()
# Fit the scaler on the training features and rebuild a labelled DataFrame.
train_X_2 = pd.DataFrame(
    min_max.fit_transform(train_X_1.values),
    columns=train_X_1.columns,
)
train_X_2

In [None]:
# Train a support vector classifier on the scaled training data.
from sklearn.svm import SVC
clf = SVC()
clf.fit(X=train_X_2, y=train_Y)

In [None]:
# Prepare the test split with the transformations learned on the training
# split. The encoder and the scaler are only applied here (`transform`), not
# re-fitted: re-fitting on the test data would scale it differently from the
# data the classifier was trained on, and would break the column assignment
# below whenever a category is missing from the test split.
result = one.transform(test_X['Embarked'].values.reshape(-1, 1)).toarray()
# Copy so that adding columns does not modify test_X itself.
test_X_1 = test_X.copy()
test_X_1[['C', 'Q', 'S']] = pd.DataFrame(result, index=test_X_1.index)
# Scaled test values may fall slightly outside [0, 1]; that is expected when
# the training-split minimum/maximum are reused.
test_X_2 = min_max.transform(test_X_1.values)
test_X_2 = pd.DataFrame(test_X_2, columns=test_X_1.columns)
clf.predict(test_X_2)

In [None]:
# Score the fitted classifier on the test split.
clf.score(X=test_X_2, y=test_Y)

In [None]:
# Train a k-nearest-neighbours classifier on the same data and score it on
# the test split. Note that this rebinds `clf`.
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier().fit(train_X_2, train_Y)
clf.score(test_X_2, test_Y)

In [None]:
# Transform the original DataFrame (all rows) for clustering.
# Dedicated encoder/scaler instances are used so the `one` and `min_max`
# objects fitted on the training split are not overwritten by this cell.
df_one = OneHotEncoder(handle_unknown='ignore')
df_min_max = MinMaxScaler()
result = df_one.fit_transform(df['Embarked'].values.reshape(-1, 1)).toarray()
# Copy so that the extra columns are not added to `df` itself.
df_1 = df.copy()
df_1[['C', 'Q', 'S']] = pd.DataFrame(result, index=df_1.index)
df_2 = df_min_max.fit_transform(df_1.values)
df_2 = pd.DataFrame(df_2, columns=df_1.columns)

In [None]:
# Clustering with KMeans
from sklearn.cluster import KMeans
# Centroid initialisation is random; a fixed seed keeps the centres, the
# labels and the scores computed from them identical across runs.
clu = KMeans(n_clusters=3, random_state=42)
clu.fit(df_2)
clu.cluster_centers_

In [None]:
# Show the labels stored on the KMeans model fitted on df_2.
clu.labels_

In [None]:
# Evaluate the clustering with the silhouette score.
from sklearn.metrics import silhouette_score
silhouette_score(X=df_2, labels=clu.labels_)

In [None]:
# Evaluate the clustering with the Calinski-Harabasz score.
from sklearn.metrics import calinski_harabasz_score
calinski_harabasz_score(X=df_2, labels=clu.labels_)