In [1275]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder

from perceptron import Perceptron

In [1276]:
# Read the automobile dataset from disk. `df_original` holds the frame exactly
# as loaded, while `df` is an independent working copy for the steps below.
df_original = pd.read_csv('../data/automobile-simple.csv')
df = df_original.copy(deep=True)

# Preview the first five rows of the working copy.
df.head(5)



Unnamed: 0,make,fuel-type,num-of-doors,body-style,curb-weight,engine-size,horsepower,city-mpg,highway-mpg,price,volume,eco-rating
0,alfa-romero,gas,two,convertible,2548,130,111.0,21,27,13495.0,528019.904,33.297462
1,alfa-romero,gas,two,convertible,2548,130,111.0,21,27,16500.0,528019.904,33.297462
2,alfa-romero,gas,two,hatchback,2823,152,154.0,19,26,16500.0,587592.64,30.898272
3,audi,gas,four,sedan,2337,109,102.0,24,30,13950.0,634816.956,42.697819
4,audi,gas,four,sedan,2824,136,115.0,18,22,17450.0,636734.832,27.997459


In [1277]:
# Summary statistics for every column, numeric and categorical alike
# (include='all'), computed on the data as loaded.
df.describe(include='all')

Unnamed: 0,make,fuel-type,num-of-doors,body-style,curb-weight,engine-size,horsepower,city-mpg,highway-mpg,price,volume,eco-rating
count,205,205,203,205,205.0,205.0,203.0,205.0,205.0,201.0,205.0,205.0
unique,22,2,2,5,,,,,,,,
top,toyota,gas,four,sedan,,,,,,,,
freq,32,185,114,96,,,,,,,,
mean,,,,,2555.565854,126.907317,104.256158,25.219512,30.75122,13207.129353,618719.288873,42.235315
std,,,,,520.680204,41.642693,39.714369,6.542142,6.886443,7947.066342,79463.195262,12.299628
min,,,,,1488.0,61.0,48.0,13.0,16.0,5118.0,452643.156,15.501957
25%,,,,,2145.0,97.0,70.0,19.0,25.0,7775.0,566490.6,31.972844
50%,,,,,2414.0,120.0,95.0,24.0,30.0,10295.0,601385.7,40.619311
75%,,,,,2935.0,141.0,116.0,30.0,34.0,16500.0,666250.2,50.77166


In [1278]:
# Count the missing values in each column.
df.isna().sum()

make            0
fuel-type       0
num-of-doors    2
body-style      0
curb-weight     0
engine-size     0
horsepower      2
city-mpg        0
highway-mpg     0
price           4
volume          0
eco-rating      0
dtype: int64

### Eliminamos los registros que presentan valores nulos, ya que son muy pocos (8/205)

In [1279]:
# Drop every row that has at least one missing value, then verify that no
# column has any nulls left.
df = df.dropna(axis=0, how='any')
df.isna().sum()




make            0
fuel-type       0
num-of-doors    0
body-style      0
curb-weight     0
engine-size     0
horsepower      0
city-mpg        0
highway-mpg     0
price           0
volume          0
eco-rating      0
dtype: int64

In [1280]:
# Recompute the summary statistics for all columns now that rows with
# missing values have been removed.
df.describe(include='all')


Unnamed: 0,make,fuel-type,num-of-doors,body-style,curb-weight,engine-size,horsepower,city-mpg,highway-mpg,price,volume,eco-rating
count,197,197,197,197,197.0,197.0,197.0,197.0,197.0,197.0,197.0,197.0
unique,21,2,2,5,,,,,,,,
top,toyota,gas,four,sedan,,,,,,,,
freq,32,178,112,92,,,,,,,,
mean,,,,,2558.456853,126.994924,103.604061,25.152284,30.629442,13279.64467,619851.107665,42.094943
std,,,,,521.782047,41.913114,37.639205,6.437863,6.836259,8010.334218,79841.659971,12.183681
min,,,,,1488.0,61.0,48.0,13.0,16.0,5118.0,452643.156,15.501957
25%,,,,,2145.0,97.0,70.0,19.0,25.0,7775.0,566490.6,31.972844
50%,,,,,2414.0,119.0,95.0,24.0,30.0,10345.0,601385.7,40.531575
75%,,,,,2935.0,145.0,116.0,30.0,34.0,16503.0,674493.768,50.362813


In [1281]:
# Collect the names of all numeric-dtype columns; this list is reused for
# the correlation matrix and for standardization.
numeric_columns = list(df.select_dtypes(include='number').columns)
print("Columnas numéricas:", numeric_columns)



Columnas numéricas: ['curb-weight', 'engine-size', 'horsepower', 'city-mpg', 'highway-mpg', 'price', 'volume', 'eco-rating']


In [1282]:
# Pairwise Pearson correlation between the numeric columns.
correlation_matrix = df.loc[:, numeric_columns].corr(method='pearson')
print("\nMatriz de correlación:")
correlation_matrix


Matriz de correlación:


Unnamed: 0,curb-weight,engine-size,horsepower,city-mpg,highway-mpg,price,volume,eco-rating
curb-weight,1.0,0.848932,0.759925,-0.755559,-0.800113,0.834732,0.822823,-0.842957
engine-size,0.848932,1.0,0.825286,-0.655737,-0.684662,0.873708,0.592571,-0.723965
horsepower,0.759925,0.825286,1.0,-0.82196,-0.803658,0.811953,0.449278,-0.839619
city-mpg,-0.755559,-0.655737,-0.82196,1.0,0.972407,-0.692948,-0.554978,0.97665
highway-mpg,-0.800113,-0.684662,-0.803658,0.972407,1.0,-0.708659,-0.608072,0.984258
price,0.834732,0.873708,0.811953,-0.692948,-0.708659,1.0,0.631578,-0.747982
volume,0.822823,0.592571,0.449278,-0.554978,-0.608072,0.631578,1.0,-0.59852
eco-rating,-0.842957,-0.723965,-0.839619,0.97665,0.984258,-0.747982,-0.59852,1.0


### Estandarizamos las columnas numéricas

In [1283]:
# Standardize every numeric column to zero mean and unit variance.
# NOTE(review): the scaler is fitted on the whole dataset, before the
# train/test split performed further down, so test rows influence the scaling
# statistics. `numeric_columns` also includes 'eco-rating', the column the
# target is later derived from.
scaler = StandardScaler()
scaler.fit(df[numeric_columns])
df[numeric_columns] = scaler.transform(df[numeric_columns])
df.head()

Unnamed: 0,make,fuel-type,num-of-doors,body-style,curb-weight,engine-size,horsepower,city-mpg,highway-mpg,price,volume,eco-rating
0,alfa-romero,gas,two,convertible,-0.020092,0.07188,0.196996,-0.646622,-0.532263,0.026953,-1.153097,-0.723911
1,alfa-romero,gas,two,convertible,-0.020092,0.07188,0.196996,-0.646622,-0.532263,0.403049,-1.153097,-0.723911
2,alfa-romero,gas,two,hatchback,0.508291,0.598113,1.342333,-0.958076,-0.678915,0.403049,-0.40506,-0.921331
3,audi,gas,four,sedan,-0.425505,-0.430433,-0.042725,-0.179442,-0.092309,0.0839,0.187922,0.049608
4,audi,gas,four,sedan,0.510212,0.215398,0.303539,-1.113802,-1.265521,0.521948,0.212004,-1.160027


### Tratamos las columnas cualitativas. Usamos one-hot encoding para las nominales

In [1284]:

# One-hot encode the nominal categorical columns. With drop='first' the first
# category of each feature is omitted, so a feature with k categories yields
# k-1 indicator columns.
columns_ohe = ['make', 'fuel-type', 'body-style']
ohe = OneHotEncoder(handle_unknown='ignore', drop='first', sparse_output=False)
df_ohe = pd.DataFrame(
    ohe.fit_transform(df[columns_ohe]),
    index=df.index,  # reuse the row labels so the concat below aligns row by row
    columns=ohe.get_feature_names_out(columns_ohe),
)
# Swap the original categorical columns for their indicator columns.
df = pd.concat([df.drop(columns=columns_ohe), df_ohe], axis=1)
df.describe(include='all')




Unnamed: 0,num-of-doors,curb-weight,engine-size,horsepower,city-mpg,highway-mpg,price,volume,eco-rating,make_audi,...,make_saab,make_subaru,make_toyota,make_volkswagen,make_volvo,fuel-type_gas,body-style_hardtop,body-style_hatchback,body-style_sedan,body-style_wagon
count,197,197.0,197.0,197.0,197.0,197.0,197.0,197.0,197.0,197.0,...,197.0,197.0,197.0,197.0,197.0,197.0,197.0,197.0,197.0,197.0
unique,2,,,,,,,,,,...,,,,,,,,,,
top,four,,,,,,,,,,...,,,,,,,,,,
freq,112,,,,,,,,,,...,,,,,,,,,,
mean,,-1.8034080000000002e-17,1.082045e-16,6.988206e-17,-3.38139e-17,-1.983749e-16,-9.918744000000001e-17,-1.983749e-16,-2.119004e-16,0.030457,...,0.030457,0.060914,0.162437,0.060914,0.055838,0.903553,0.040609,0.340102,0.467005,0.121827
std,,1.002548,1.002548,1.002548,1.002548,1.002548,1.002548,1.002548,1.002548,0.172279,...,0.172279,0.239781,0.369791,0.239781,0.230193,0.295955,0.197886,0.47495,0.500181,0.32792
min,,-2.056767,-1.578577,-1.481055,-1.892436,-2.14543,-1.021485,-2.09958,-2.188234,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,,-0.7944126,-0.7174686,-0.8950688,-0.9580755,-0.8255662,-0.6889437,-0.6700319,-0.8329082,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,,-0.2775582,-0.1912359,-0.2291755,-0.1794415,-0.09230857,-0.3672907,-0.2318646,-0.1286435,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,,0.7234869,0.4306754,0.3301749,0.7549192,0.4942975,0.4034248,0.6861315,0.6803309,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0


In [1285]:
# Ordinal encoding for the ordinal categorical column: replace the door-count
# words with their numeric value.
# NOTE(review): Series.map turns values that are not dictionary keys into NaN,
# so running this cell a second time (when the column already holds 2/4)
# would blank the column out.
door_mappings = dict(two=2, four=4)
df['num-of-doors'] = df['num-of-doors'].map(door_mappings)
df.describe(include='all')



Unnamed: 0,num-of-doors,curb-weight,engine-size,horsepower,city-mpg,highway-mpg,price,volume,eco-rating,make_audi,...,make_saab,make_subaru,make_toyota,make_volkswagen,make_volvo,fuel-type_gas,body-style_hardtop,body-style_hatchback,body-style_sedan,body-style_wagon
count,197.0,197.0,197.0,197.0,197.0,197.0,197.0,197.0,197.0,197.0,...,197.0,197.0,197.0,197.0,197.0,197.0,197.0,197.0,197.0,197.0
mean,3.137056,-1.8034080000000002e-17,1.082045e-16,6.988206e-17,-3.38139e-17,-1.983749e-16,-9.918744000000001e-17,-1.983749e-16,-2.119004e-16,0.030457,...,0.030457,0.060914,0.162437,0.060914,0.055838,0.903553,0.040609,0.340102,0.467005,0.121827
std,0.993087,1.002548,1.002548,1.002548,1.002548,1.002548,1.002548,1.002548,1.002548,0.172279,...,0.172279,0.239781,0.369791,0.239781,0.230193,0.295955,0.197886,0.47495,0.500181,0.32792
min,2.0,-2.056767,-1.578577,-1.481055,-1.892436,-2.14543,-1.021485,-2.09958,-2.188234,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,-0.7944126,-0.7174686,-0.8950688,-0.9580755,-0.8255662,-0.6889437,-0.6700319,-0.8329082,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,4.0,-0.2775582,-0.1912359,-0.2291755,-0.1794415,-0.09230857,-0.3672907,-0.2318646,-0.1286435,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,4.0,0.7234869,0.4306754,0.3301749,0.7549192,0.4942975,0.4034248,0.6861315,0.6803309,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
max,4.0,2.896581,4.760135,4.218992,3.713728,3.427328,4.020081,2.83978,3.373223,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [1286]:
# Feature matrix: every column except 'eco-rating', which is only used to
# build the target.
x = df.drop(columns=['eco-rating'])

# Binary target: 1 when a car's eco-rating is above the dataset mean, else 0.
# (The earlier `y = df['eco-rating']` assignment was dead code, immediately
# overwritten by this line, and has been removed.)
y = (df['eco-rating'] > df['eco-rating'].mean()).astype(int)

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)






In [1287]:
# Train the project's Perceptron on the full feature set and print its score
# on the held-out test split.
# `Perceptron` is imported in the notebook's top import cell together with the
# other dependencies, instead of mid-notebook.
ml = Perceptron(lr=0.01, max_iter=10000)
ml.fit(x_train, y_train)
print(ml.score(x_test, y_test))


0.975


In [1288]:
# Pair every training feature with its learned weight and rank the features
# by weight magnitude, largest first.
weights_df = pd.DataFrame({
    'feature': x_train.columns,
    'weight': ml.w_,
    'abs_weight': np.abs(ml.w_),
}).sort_values(by='abs_weight', ascending=False)
weights_df




Unnamed: 0,feature,weight,abs_weight
5,highway-mpg,0.142023,0.142023
1,curb-weight,-0.106471,0.106471
4,city-mpg,0.084471,0.084471
3,horsepower,-0.072679,0.072679
8,make_audi,0.063187,0.063187
20,make_peugot,-0.057972,0.057972
30,body-style_hatchback,-0.044518,0.044518
32,body-style_wagon,0.039836,0.039836
6,price,-0.035845,0.035845
13,make_isuzu,-0.035009,0.035009


### Probamos borrar las cinco columnas numéricas más fuertemente correlacionadas con eco-rating

In [1289]:
# Remove the five numeric features most strongly correlated with eco-rating
# (see the correlation matrix above) to see how the model fares without them.
# errors='ignore' keeps this cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
correlated_columns = ['highway-mpg', 'curb-weight', 'horsepower', 'city-mpg', 'price']
x = x.drop(columns=correlated_columns, errors='ignore')

# Same split parameters as before, so the same rows end up in train and test.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# NOTE(review): lr and max_iter differ from the first experiment
# (lr=0.01, max_iter=10000), so the score change cannot be attributed to the
# removed columns alone.
ml = Perceptron(lr=0.1, max_iter=1000)
ml.fit(x_train, y_train)
print(ml.score(x_test, y_test))

# Rank the remaining features by the magnitude of their learned weight.
weights_df = pd.DataFrame({
    'feature': x_train.columns,
    'weight': ml.w_,
    'abs_weight': np.abs(ml.w_),
}).sort_values(by='abs_weight', ascending=False)
weights_df


0.775


Unnamed: 0,feature,weight,abs_weight
16,make_plymouth,3.303885,3.303885
15,make_peugot,-2.90636,2.90636
22,make_volvo,-2.900735,2.900735
8,make_isuzu,-2.413927,2.413927
4,make_bmw,-2.27673,2.27673
23,fuel-type_gas,-2.19617,2.19617
11,make_mercedes-benz,-1.975408,1.975408
18,make_saab,-1.804273,1.804273
14,make_nissan,1.699666,1.699666
7,make_honda,1.693233,1.693233
