In [14]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
sys.path.append('..')
from util.rn_multiclass import MulticlassRN
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer


In [15]:
df_original = pd.read_csv('../data/drugs_train.csv')
df = df_original.copy()

df.head()

Unnamed: 0,Age,Sex,BP,Cholesterol,Na,K,Drug
0,16,M,LOW,HIGH,0.743021,0.061886,drugC
1,42,F,HIGH,HIGH,0.533228,0.025348,drugY
2,33,F,LOW,HIGH,0.858387,0.025634,drugY
3,47,M,LOW,HIGH,0.697269,0.068944,drugC
4,56,F,HIGH,HIGH,0.750962,0.029571,drugY


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 160 entries, 0 to 159
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Age          160 non-null    int64  
 1   Sex          160 non-null    object 
 2   BP           160 non-null    object 
 3   Cholesterol  160 non-null    object 
 4   Na           160 non-null    float64
 5   K            160 non-null    float64
 6   Drug         160 non-null    object 
dtypes: float64(2), int64(1), object(4)
memory usage: 8.9+ KB


In [17]:
df.describe(include='all')

Unnamed: 0,Age,Sex,BP,Cholesterol,Na,K,Drug
count,160.0,160,160,160,160.0,160.0,160
unique,,2,3,2,,,5
top,,M,HIGH,HIGH,,,drugY
freq,,81,60,82,,,73
mean,44.73125,,,,0.700157,0.050815,
std,16.975647,,,,0.116932,0.017298,
min,15.0,,,,0.500169,0.020042,
25%,31.0,,,,0.59257,0.035392,
50%,45.0,,,,0.724923,0.050363,
75%,59.25,,,,0.794657,0.066203,


In [18]:
print(df['Cholesterol'].value_counts())
print((df['Sex']).value_counts())
print((df['BP']).value_counts())

Cholesterol
HIGH      82
NORMAL    78
Name: count, dtype: int64
Sex
M    81
F    79
Name: count, dtype: int64
BP
HIGH      60
LOW       51
NORMAL    49
Name: count, dtype: int64


A cholesterol y sex la vamos a mapear a 0 y 1

In [19]:
from sklearn.preprocessing import OrdinalEncoder


X_train = df.drop(columns=['Drug'])
y_test = df['Drug']

num_features = ['Age', 'Na', 'K']
cat_features = ['Sex', 'Cholesterol']
ordinal_features = ['BP']

chol_order = [['LOW', 'NORMAL', 'HIGH']]

num_pipe = Pipeline([
    ('scaler', StandardScaler())
])

ohe_pipe = Pipeline([
    ('ohe', OneHotEncoder(drop='if_binary', handle_unknown='ignore'))
])
ordinal_pipe = Pipeline([
    ('ord',OrdinalEncoder(categories=chol_order))
])

preprocessor = ColumnTransformer([
    ('ord', ordinal_pipe, ordinal_features),
    ('cat', ohe_pipe, cat_features),
    ('num', num_pipe, num_features)
])

clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('model', MulticlassRN(learning_rate=0.01, max_iter=1000, activation='softmax'))
    ]
)




In [20]:
drug_encoder = OneHotEncoder(sparse_output=False)
y_train_onehot = drug_encoder.fit_transform(y_test.values.reshape(-1,1))

clf.fit(X_train, y_train_onehot)
probabilities = clf.predict_proba(X_train)
drug_names = drug_encoder.categories_[0]
for i in range(len(X_train)):
    print(f"Sample {i}:")
    for drug, prob in zip(drug_names, probabilities[i]):
        print(f"  {drug}: {prob:.3f}")

Sample 0:
  drugA: 0.000
  drugB: 0.000
  drugC: 0.957
  drugX: 0.043
  drugY: 0.000
Sample 1:
  drugA: 0.000
  drugB: 0.000
  drugC: 0.000
  drugX: 0.000
  drugY: 1.000
Sample 2:
  drugA: 0.000
  drugB: 0.000
  drugC: 0.000
  drugX: 0.000
  drugY: 1.000
Sample 3:
  drugA: 0.000
  drugB: 0.000
  drugC: 0.965
  drugX: 0.035
  drugY: 0.000
Sample 4:
  drugA: 0.000
  drugB: 0.000
  drugC: 0.000
  drugX: 0.000
  drugY: 1.000
Sample 5:
  drugA: 0.921
  drugB: 0.006
  drugC: 0.000
  drugX: 0.073
  drugY: 0.000
Sample 6:
  drugA: 0.000
  drugB: 0.005
  drugC: 0.000
  drugX: 0.006
  drugY: 0.990
Sample 7:
  drugA: 0.000
  drugB: 0.000
  drugC: 0.000
  drugX: 0.000
  drugY: 1.000
Sample 8:
  drugA: 0.000
  drugB: 0.000
  drugC: 0.000
  drugX: 0.000
  drugY: 1.000
Sample 9:
  drugA: 0.059
  drugB: 0.914
  drugC: 0.000
  drugX: 0.027
  drugY: 0.000
Sample 10:
  drugA: 0.000
  drugB: 0.000
  drugC: 0.000
  drugX: 0.000
  drugY: 1.000
Sample 11:
  drugA: 0.989
  drugB: 0.002
  drugC: 0.000
  drugX:

