In [97]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [98]:
# Read both raw splits first; the *_original frames are kept as untouched
# reference copies of what is on disk.
df_train_original = pd.read_csv('../data/drugs_train.csv')
df_test_original = pd.read_csv('../data/drugs_test.csv')

# The preprocessing cells below modify these working copies, not the originals.
df_train = df_train_original.copy(deep=True)
df_test = df_test_original.copy(deep=True)

# Summary of every training column (numeric and categorical alike).
df_train.describe(include='all')

Unnamed: 0,Age,Sex,BP,Cholesterol,Na,K,Drug
count,160.0,160,160,160,160.0,160.0,160
unique,,2,3,2,,,5
top,,M,HIGH,HIGH,,,drugY
freq,,81,60,82,,,73
mean,44.73125,,,,0.700157,0.050815,
std,16.975647,,,,0.116932,0.017298,
min,15.0,,,,0.500169,0.020042,
25%,31.0,,,,0.59257,0.035392,
50%,45.0,,,,0.724923,0.050363,
75%,59.25,,,,0.794657,0.066203,


In [99]:


# 1. ENCODING SEX: F→0, M→1
print("=== ENCODING SEX ===")

# The labels are read from the untouched *_original frames so this cell can be
# re-run safely: mapping the already-encoded 0/1 column a second time would
# silently turn every value into NaN.
sex_codes = {"F": 0, "M": 1}
for split_name, frame, raw_frame in (
    ("train", df_train, df_train_original),
    ("test", df_test, df_test_original),
):
    # Normalise the labels like the other categorical columns (trim
    # whitespace, upper-case) so variants such as " m" are not lost.
    sex_labels = raw_frame["Sex"].str.strip().str.upper()
    sex_encoded = sex_labels.map(sex_codes)

    # Fail fast on anything that did not map (including missing values):
    # value_counts() below drops NaN by default and would hide the problem.
    unmapped = sex_encoded.isna()
    if unmapped.any():
        raise ValueError(
            f"Unexpected Sex values in {split_name}: "
            f"{raw_frame.loc[unmapped, 'Sex'].unique().tolist()}"
        )
    frame["Sex"] = sex_encoded

print(f"Train Sex distribution: {df_train['Sex'].value_counts().to_dict()}")
print(f"Test Sex distribution: {df_test['Sex'].value_counts().to_dict()}")

# 2. ENCODING BP: one-hot encoding that tolerates unseen categories
print("\n=== ENCODING BP CON ONEHOTENCODER ===")

# Clean and standardise the labels (trim whitespace, upper-case).
# They are read from the untouched *_original frames because the BP column is
# removed from the working frames at the end of this cell; reading it from
# df_train/df_test would raise a KeyError on a second run.
bp_train_clean = df_train_original["BP"].str.strip().str.upper()
bp_test_clean = df_test_original["BP"].str.strip().str.upper()

print(f"Train BP unique values: {bp_train_clean.unique()}")
print(f"Test BP unique values: {bp_test_clean.unique()}")

# drop='first' removes the first learned category, which becomes the implicit
# baseline. handle_unknown='ignore' lets transform() accept categories not seen
# during fit; per the scikit-learn documentation such rows are encoded as all
# zeros, i.e. they are indistinguishable from the dropped baseline category.
bp_encoder = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)

# Fit on the training split only, then reuse the fitted encoder for test.
bp_encoded_train = bp_encoder.fit_transform(bp_train_clean.to_frame())
bp_encoded_test = bp_encoder.transform(bp_test_clean.to_frame())

# Ask the encoder for its output column names instead of slicing
# categories_ by hand, so the names stay correct if the drop policy changes.
bp_columns = bp_encoder.get_feature_names_out(["BP"]).tolist()

# Wrap the encoded arrays in DataFrames aligned with the working frames.
bp_train_df = pd.DataFrame(bp_encoded_train, columns=bp_columns, index=df_train.index)
bp_test_df = pd.DataFrame(bp_encoded_test, columns=bp_columns, index=df_test.index)

# Replace the raw BP column with the encoded ones. errors="ignore" also clears
# BP_* columns left over from a previous run of this cell.
stale_bp_columns = ["BP", *bp_columns]
df_train = pd.concat(
    [df_train.drop(columns=stale_bp_columns, errors="ignore"), bp_train_df], axis=1
)
df_test = pd.concat(
    [df_test.drop(columns=stale_bp_columns, errors="ignore"), bp_test_df], axis=1
)

print(f"Categorías aprendidas por el encoder: {bp_encoder.categories_[0]}")
print(f"Columnas BP creadas: {bp_columns}")
print(f"Train shape after BP encoding: {df_train.shape}")
print(f"Test shape after BP encoding: {df_test.shape}")

# Confirm the encoded columns contain no NaN values.
print(f"Valores NaN en train BP columns: {bp_train_df.isnull().sum().sum()}")
print(f"Valores NaN en test BP columns: {bp_test_df.isnull().sum().sum()}")

df_train.head()

=== ENCODING SEX ===
Train Sex distribution: {1: 81, 0: 79}
Test Sex distribution: {1: 23, 0: 17}

=== ENCODING BP CON ONEHOTENCODER ===
Train BP unique values: ['LOW' 'HIGH' 'NORMAL']
Test BP unique values: ['LOW' 'HIGH' 'NORMAL']
Categorías aprendidas por el encoder: ['HIGH' 'LOW' 'NORMAL']
Columnas BP creadas: ['BP_LOW', 'BP_NORMAL']
Train shape after BP encoding: (160, 8)
Test shape after BP encoding: (40, 8)
Valores NaN en train BP columns: 0
Valores NaN en test BP columns: 0


Unnamed: 0,Age,Sex,Cholesterol,Na,K,Drug,BP_LOW,BP_NORMAL
0,16,1,HIGH,0.743021,0.061886,drugC,1.0,0.0
1,42,0,HIGH,0.533228,0.025348,drugY,0.0,0.0
2,33,0,HIGH,0.858387,0.025634,drugY,1.0,0.0
3,47,1,HIGH,0.697269,0.068944,drugC,1.0,0.0
4,56,0,HIGH,0.750962,0.029571,drugY,0.0,0.0


In [100]:
# 3. ENCODING CHOLESTEROL: NORMAL→0, HIGH→1
print("=== ENCODING CHOLESTEROL ===")

# Normalise the labels (trim whitespace, upper-case) in both splits first.
for frame in (df_train, df_test):
    frame["Cholesterol"] = frame["Cholesterol"].str.strip().str.upper()

print(f"Train Cholesterol unique values: {df_train['Cholesterol'].unique()}")
print(f"Test Cholesterol unique values: {df_test['Cholesterol'].unique()}")

# Binary encoding stored as int8. A label missing from the mapping becomes NaN,
# which makes the integer cast raise instead of passing through silently.
cholesterol_codes = {"NORMAL": 0, "HIGH": 1}
for frame in (df_train, df_test):
    frame["Cholesterol"] = frame["Cholesterol"].map(cholesterol_codes).astype("int8")

print(f"Train Cholesterol distribution: {df_train['Cholesterol'].value_counts().to_dict()}")
print(f"Test Cholesterol distribution: {df_test['Cholesterol'].value_counts().to_dict()}")

df_train.head()

=== ENCODING CHOLESTEROL ===
Train Cholesterol unique values: ['HIGH' 'NORMAL']
Test Cholesterol unique values: ['HIGH' 'NORMAL']
Train Cholesterol distribution: {1: 82, 0: 78}
Test Cholesterol distribution: {1: 21, 0: 19}


Unnamed: 0,Age,Sex,Cholesterol,Na,K,Drug,BP_LOW,BP_NORMAL
0,16,1,1,0.743021,0.061886,drugC,1.0,0.0
1,42,0,1,0.533228,0.025348,drugY,0.0,0.0
2,33,0,1,0.858387,0.025634,drugY,1.0,0.0
3,47,1,1,0.697269,0.068944,drugC,1.0,0.0
4,56,0,1,0.750962,0.029571,drugY,0.0,0.0


In [101]:
# 4. STANDARDISATION OF THE NUMERIC FEATURES
print("=== STANDARDIZACIÓN ===")

# Numeric columns to be rescaled.
cols_to_standardize = ["Age", "Na", "K"]
print(f"Variables a estandarizar: {cols_to_standardize}")

# The scaler is fitted on the training split only; the fitted scaler is then
# applied unchanged to the test split.
scaler = StandardScaler()
df_train[cols_to_standardize] = scaler.fit_transform(df_train[cols_to_standardize])
df_test[cols_to_standardize] = scaler.transform(df_test[cols_to_standardize])

print(f"Train shape final: {df_train.shape}")
print(f"Test shape final: {df_test.shape}")

# Both splits should expose the same columns in the same order.
train_columns = list(df_train.columns)
test_columns = list(df_test.columns)
print(f"Train columns: {train_columns}")
print(f"Test columns: {test_columns}")
print(f"Columns match: {train_columns == test_columns}")

df_train.head()

=== STANDARDIZACIÓN ===
Variables a estandarizar: ['Age', 'Na', 'K']
Train shape final: (160, 8)
Test shape final: (40, 8)
Train columns: ['Age', 'Sex', 'Cholesterol', 'Na', 'K', 'Drug', 'BP_LOW', 'BP_NORMAL']
Test columns: ['Age', 'Sex', 'Cholesterol', 'Na', 'K', 'Drug', 'BP_LOW', 'BP_NORMAL']
Columns match: True


Unnamed: 0,Age,Sex,Cholesterol,Na,K,Drug,BP_LOW,BP_NORMAL
0,-1.697812,1,1,0.367718,0.642032,drugC,1.0,0.0
1,-0.161397,0,1,-1.432057,-1.476815,drugY,0.0,0.0
2,-0.693233,0,1,1.357422,-1.46023,drugY,1.0,0.0
3,0.134067,1,1,-0.02478,1.051328,drugC,1.0,0.0
4,0.665903,0,1,0.435842,-1.231922,drugY,0.0,0.0


In [102]:
# 5. FINAL CHECKS AND DATA OVERVIEW
print("=== VERIFICACIÓN FINAL ===")

# Column dtypes of each split.
for heading, frame in (
    ("Tipos de datos en train:", df_train),
    ("\nTipos de datos en test:", df_test),
):
    print(heading)
    print(frame.dtypes)

# Total number of missing values per split.
train_nulls = df_train.isnull().sum().sum()
test_nulls = df_test.isnull().sum().sum()
print(f"\nValores nulos en train: {train_nulls}")
print(f"Valores nulos en test: {test_nulls}")

# The same pair of (heading, frame) is reused for the two overviews below.
dataset_views = (
    ("Train dataset:", df_train),
    ("\nTest dataset:", df_test),
)

# Descriptive statistics.
print("\n=== ESTADÍSTICAS DESCRIPTIVAS ===")
for heading, frame in dataset_views:
    print(heading)
    print(frame.describe())

# Target distribution, shown only when the column is present.
if 'Drug' in df_train.columns:
    drug_counts = df_train['Drug'].value_counts().to_dict()
    print(f"\nDistribución de Drug en train: {drug_counts}")

# First rows of both splits.
print("\n=== PRIMERAS FILAS ===")
for heading, frame in dataset_views:
    print(heading)
    print(frame.head())


=== VERIFICACIÓN FINAL ===
Tipos de datos en train:
Age            float64
Sex              int64
Cholesterol       int8
Na             float64
K              float64
Drug            object
BP_LOW         float64
BP_NORMAL      float64
dtype: object

Tipos de datos en test:
Age            float64
Sex              int64
Cholesterol       int8
Na             float64
K              float64
Drug            object
BP_LOW         float64
BP_NORMAL      float64
dtype: object

Valores nulos en train: 0
Valores nulos en test: 0

=== ESTADÍSTICAS DESCRIPTIVAS ===
Train dataset:
                Age         Sex  Cholesterol            Na             K  \
count  1.600000e+02  160.000000   160.000000  1.600000e+02  1.600000e+02   
mean  -1.720846e-16    0.506250     0.512500 -1.181000e-15 -8.881784e-17   
std    1.003140e+00    0.501531     0.501413  1.003140e+00  1.003140e+00   
min   -1.756905e+00    0.000000     0.000000 -1.715664e+00 -1.784511e+00   
25%   -8.114190e-01    0.000000     0.000000 

In [103]:
# Binarise the target: 1 for 'drugY', 0 for every other drug.
# The labels are read from the untouched *_original frames so this cell can be
# re-run safely; comparing the already-binarised integer column with 'drugY'
# would silently set every target to 0.
df_train['Drug'] = (df_train_original['Drug'] == 'drugY').astype(int)
df_test['Drug'] = (df_test_original['Drug'] == 'drugY').astype(int)


df_train

Unnamed: 0,Age,Sex,Cholesterol,Na,K,Drug,BP_LOW,BP_NORMAL
0,-1.697812,1,1,0.367718,0.642032,0,1.0,0.0
1,-0.161397,0,1,-1.432057,-1.476815,1,0.0,0.0
2,-0.693233,0,1,1.357422,-1.460230,1,1.0,0.0
3,0.134067,1,1,-0.024780,1.051328,0,1.0,0.0
4,0.665903,0,1,0.435842,-1.231922,1,0.0,0.0
...,...,...,...,...,...,...,...,...
155,1.611389,1,1,-1.310933,-1.000599,1,1.0,0.0
156,-1.520533,0,0,0.359748,-1.289623,1,0.0,0.0
157,1.138646,0,0,-1.252297,-1.698280,1,1.0,0.0
158,1.138646,1,0,0.341064,-0.896855,1,0.0,0.0


In [104]:
from perceptron import Perceptron

# Split each frame into its feature matrix (every column except the target)
# and the binarised 'Drug' target.
X_train, y_train = df_train.drop(columns=['Drug']), df_train['Drug']
X_test, y_test = df_test.drop(columns=['Drug']), df_test['Drug']

# Hyper-parameters passed to the project-local Perceptron.
# NOTE(review): their exact meaning is defined in perceptron.py, which is not
# visible from this notebook — confirm there.
perceptron_params = dict(lr=0.1, max_iter=1000, random_state=42, verbose=False)
perc = Perceptron(**perceptron_params)
perc.fit(X_train, y_train)

# score() is reported as accuracy here; presumably it is the fraction of
# correct test predictions — verify against the Perceptron implementation.
print("Accuracy: ", perc.score(X_test, y_test))
print("w:", perc.w_, "b:", perc.b_)

Accuracy:  0.975
w: [ 0.07669166  0.08960016  0.00750451  0.50940388 -1.09309399 -0.1130218
 -0.0987216 ] b: -0.20316242592343584
