In [212]:
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler

# The parent directory must be on sys.path before the local `util` package is imported.
sys.path.append('..')
from util.rn_multiclass import MulticlassRN


In [213]:

# Read the raw CSV once and keep it untouched in `df_original`;
# the rest of the notebook works on the copy `df`.
df_original = pd.read_csv('../data/automobile-simple.csv')
df = df_original.copy()

# Preview of the first rows.
df.head()

Unnamed: 0,make,fuel-type,num-of-doors,body-style,curb-weight,engine-size,horsepower,city-mpg,highway-mpg,price,volume,eco-rating
0,alfa-romero,gas,two,convertible,2548,130,111.0,21,27,13495.0,528019.904,33.297462
1,alfa-romero,gas,two,convertible,2548,130,111.0,21,27,16500.0,528019.904,33.297462
2,alfa-romero,gas,two,hatchback,2823,152,154.0,19,26,16500.0,587592.64,30.898272
3,audi,gas,four,sedan,2337,109,102.0,24,30,13950.0,634816.956,42.697819
4,audi,gas,four,sedan,2824,136,115.0,18,22,17450.0,636734.832,27.997459


In [214]:
# Column dtypes and non-null counts, to spot the columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   make          205 non-null    object 
 1   fuel-type     205 non-null    object 
 2   num-of-doors  203 non-null    object 
 3   body-style    205 non-null    object 
 4   curb-weight   205 non-null    int64  
 5   engine-size   205 non-null    int64  
 6   horsepower    203 non-null    float64
 7   city-mpg      205 non-null    int64  
 8   highway-mpg   205 non-null    int64  
 9   price         201 non-null    float64
 10  volume        205 non-null    float64
 11  eco-rating    205 non-null    float64
dtypes: float64(4), int64(4), object(4)
memory usage: 19.3+ KB


In [215]:
# Summary statistics; include='all' also reports the non-numeric columns.
df.describe(include='all')

Unnamed: 0,make,fuel-type,num-of-doors,body-style,curb-weight,engine-size,horsepower,city-mpg,highway-mpg,price,volume,eco-rating
count,205,205,203,205,205.0,205.0,203.0,205.0,205.0,201.0,205.0,205.0
unique,22,2,2,5,,,,,,,,
top,toyota,gas,four,sedan,,,,,,,,
freq,32,185,114,96,,,,,,,,
mean,,,,,2555.565854,126.907317,104.256158,25.219512,30.75122,13207.129353,618719.288873,42.235315
std,,,,,520.680204,41.642693,39.714369,6.542142,6.886443,7947.066342,79463.195262,12.299628
min,,,,,1488.0,61.0,48.0,13.0,16.0,5118.0,452643.156,15.501957
25%,,,,,2145.0,97.0,70.0,19.0,25.0,7775.0,566490.6,31.972844
50%,,,,,2414.0,120.0,95.0,24.0,30.0,10295.0,601385.7,40.619311
75%,,,,,2935.0,141.0,116.0,30.0,34.0,16500.0,666250.2,50.77166


In [216]:
# Number of missing values per column.
df.isnull().sum()

make            0
fuel-type       0
num-of-doors    2
body-style      0
curb-weight     0
engine-size     0
horsepower      2
city-mpg        0
highway-mpg     0
price           4
volume          0
eco-rating      0
dtype: int64

### Tratamiento de nulos. Como son features, la primera idea era rellenarlos en vez de eliminarlos (con la moda en el caso de `num-of-doors` y con la mediana en las columnas numéricas); sin embargo, como el enunciado pide eliminarlos, finalmente se descartan las filas con nulos.

In [217]:
# Drop (rather than impute) every row with a missing value in any of these columns.
# The alternative that was considered, filling 'num-of-doors' with its mode and
# 'horsepower' / 'price' with their median, is intentionally not applied.
df = df.dropna(subset=['num-of-doors', 'horsepower', 'price'])

In [218]:
# Re-count missing values per column after dropping the incomplete rows.
df.isnull().sum()

make            0
fuel-type       0
num-of-doors    0
body-style      0
curb-weight     0
engine-size     0
horsepower      0
city-mpg        0
highway-mpg     0
price           0
volume          0
eco-rating      0
dtype: int64

In [219]:
# Columns routed to each branch of the ColumnTransformer below.
num_features = ['curb-weight', 'engine-size', 'horsepower', 'city-mpg', 'highway-mpg', 'price', 'volume']
cat_features = ['make', 'fuel-type', 'body-style']

# 'num-of-doors' stores the door count as a word; this table translates it to a number.
doors_map = {'two': 2, 'four': 4}


def _doors_to_number(labels):
    """Translate door-count words into numbers using ``doors_map``.

    Parameters
    ----------
    labels : array-like
        Door-count labels; every value must be a key of ``doors_map``.

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``labels``.

    Raises
    ------
    ValueError
        If any label is not a key of ``doors_map``.
    """
    labels = np.asarray(labels, dtype=object)
    unknown = sorted({str(value) for value in labels.ravel() if value not in doors_map})
    if unknown:
        raise ValueError(f"Unexpected 'num-of-doors' values: {unknown}")
    # otypes is given explicitly so np.vectorize also accepts an empty input.
    return np.vectorize(doors_map.get, otypes=[float])(labels)


# A named function is used instead of a lambda: scikit-learn documents that a
# FunctionTransformer built from a lambda cannot be pickled.
map_doors = FunctionTransformer(
    _doors_to_number,
    feature_names_out='one-to-one'
)

# Dense one-hot output: with a sparse encoder, ColumnTransformer returns a scipy
# sparse matrix whenever the overall density falls below its sparse_threshold,
# which would silently change the input type handed to the model.
# NOTE(review): MulticlassRN presumably expects a dense ndarray; confirm.
categorical_preprocess = Pipeline(steps=[
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])
numeric_preprocess = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# One-hot encode the categorical columns, standardize the numeric ones, and turn
# 'num-of-doors' into a number (imputing the most frequent label first).
preprocess = ColumnTransformer(transformers=[
    ('cat', categorical_preprocess, cat_features),
    ('num', numeric_preprocess, num_features),
    ('doors', Pipeline([('imputer', SimpleImputer(strategy='most_frequent')),
                        ('map', map_doors)]), ['num-of-doors'])
])

In [220]:
# Binary target: 1 when a car's eco-rating is above the mean eco-rating of `df`, else 0.
X = df.drop('eco-rating', axis=1)
y = df['eco-rating'].gt(df['eco-rating'].mean()).astype(int)

# Stratified 80/20 split with a fixed seed so the split itself is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

activation = 'sigmoid'

# Preprocessing and the model are chained in one pipeline, so the transformers
# are fitted on the training split only.
clf = Pipeline(steps=[
    ("prep", preprocess),
    ('model', MulticlassRN(
        learning_rate=0.1,
        max_iter=400,
        activation=activation,
        cost='bce',
        verbose=True,
    )),
])
clf.fit(X_train, y_train)
print("Accuracy", clf.score(X_test, y_test))


=== STARTING TRAINING ===
Activation function: sigmoid
Cost function: bce
Learning rate: 0.1
Max iterations: 400
Tolerance: 1e-06
X shape: (157, 35)
y shape: (157,)
--------------------------------------------------
Initial weights: [ 8.94774792e-03 -2.92925763e-03 -8.69747648e-03  2.52403379e-02
  1.72651438e-03 -8.27535901e-03 -1.09979840e-02 -4.10838173e-03
 -9.71516305e-03 -8.17904597e-03  1.34444831e-02 -1.25380808e-02
 -1.23683286e-03  4.92087474e-03 -1.05902716e-05  2.33682460e-03
 -3.20171195e-03 -4.25265132e-03  3.09979504e-03  1.01716729e-02
  5.13349775e-03 -1.73888987e-02  4.20306829e-03  1.54774961e-02
  1.56833643e-02  3.18907577e-04 -9.33956875e-03  7.93787821e-04
 -1.19979987e-02  9.10668899e-03  1.25071527e-02 -2.90721503e-03
  8.50654202e-03  8.96470529e-03  1.14440728e-02]
Initial bias: 0

Iter   Total Loss     Avg Loss     Bias      
------------------------------------------------------------
0      37.751179      0.240453     -0.1692   
1      20.359752      0.129

40     4.464680       0.028437     -1.4722   
50     3.962069       0.025236     -1.5612   
60     3.578660       0.022794     -1.6300   
70     3.269622       0.020826     -1.6847   
80     3.012057       0.019185     -1.7292   
90     2.792584       0.017787     -1.7662   
100    2.602586       0.016577     -1.7977   
110    2.436132       0.015517     -1.8248   
120    2.288935       0.014579     -1.8485   
130    2.157778       0.013744     -1.8695   
140    2.040172       0.012995     -1.8884   
150    1.934148       0.012319     -1.9054   
160    1.838113       0.011708     -1.9210   
170    1.750760       0.011151     -1.9353   
180    1.671002       0.010643     -1.9486   
190    1.597926       0.010178     -1.9609   
200    1.530757       0.009750     -1.9725   
210    1.468833       0.009356     -1.9833   
220    1.411585       0.008991     -1.9936   
230    1.358523       0.008653     -2.0033   
240    1.309218       0.008339     -2.0125   
250    1.263298       0.008046    



In [221]:
# Names of the columns produced by the fitted preprocessing step.
prep = clf.named_steps['prep']
feature_names = prep.get_feature_names_out(input_features=X_train.columns)

# Learned parameters of the fitted model.
model = clf.named_steps['model']          # MulticlassRN
w = model.weights                         # shape: (n_features,)
b = model.bias                            # scalar

# Every weight must pair with exactly one transformed feature.
assert len(w) == len(feature_names), "Dimensiones no coinciden con las features transformadas"

# Rank the features by the magnitude of their weight.
# Note: only the 'num' branch is standardized by the preprocessing step, so
# magnitudes are not strictly comparable across the different branches.
weights_df = pd.DataFrame({
    'feature': feature_names,
    'weight': w,
    'abs_weight': np.abs(w)
}).sort_values('abs_weight', ascending=False).reset_index(drop=True)

bias = pd.Series({'bias': b})

print("Bias:")
display(bias)
print("Pesos por feature:")
display(weights_df.head(30))

Bias:


bias   -2.119524
dtype: float64

Pesos por feature:


Unnamed: 0,feature,weight,abs_weight
0,num__highway-mpg,11.738054,11.738054
1,num__city-mpg,8.32629,8.32629
2,cat__make_audi,4.962165,4.962165
3,num__curb-weight,-4.708738,4.708738
4,num__horsepower,-4.274041,4.274041
5,cat__make_peugot,-3.96224,3.96224
6,cat__fuel-type_diesel,-3.770658,3.770658
7,cat__make_isuzu,-2.857442,2.857442
8,num__volume,2.822753,2.822753
9,cat__make_honda,2.547107,2.547107


### 