In [247]:
from pandas import DataFrame, read_csv, to_numeric
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix

In [196]:
data = read_csv('data/cars.csv')

In [197]:
print(data.isna().sum())

mpg            0
cylinders      0
cubicinches    0
hp             0
weightlbs      0
time-to-60     0
year           0
brand          0
dtype: int64


In [198]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 261 entries, 0 to 260
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   mpg          261 non-null    float64
 1   cylinders    261 non-null    int64  
 2   cubicinches  261 non-null    object 
 3   hp           261 non-null    int64  
 4   weightlbs    261 non-null    object 
 5   time-to-60   261 non-null    int64  
 6   year         261 non-null    int64  
 7   brand        261 non-null    object 
dtypes: float64(1), int64(4), object(3)
memory usage: 16.4+ KB
None


In [199]:
data['cubicinches'] = to_numeric(data['cubicinches'],errors='coerce')
data['weightlbs'] = to_numeric(data['weightlbs'],errors='coerce')

In [200]:
data[data['cubicinches'].isna()]

Unnamed: 0,mpg,cylinders,cubicinches,hp,weightlbs,time-to-60,year,brand
40,16.0,6,,105,3897.0,19,1976,US.
180,19.8,6,,85,2990.0,18,1980,US.


In [201]:
data['cubicinches'].fillna(data['cubicinches'].mean(), inplace=True)
data['weightlbs'].fillna(data['weightlbs'].mean(), inplace=True)

In [202]:
data.describe()

Unnamed: 0,mpg,cylinders,cubicinches,hp,weightlbs,time-to-60,year
count,261.0,261.0,261.0,261.0,261.0,261.0,261.0
mean,23.144828,5.590038,200.918919,106.360153,3009.833333,15.547893,1976.819923
std,7.82357,1.73331,108.837269,40.499959,849.097575,2.910625,3.637696
min,10.0,3.0,68.0,46.0,1613.0,8.0,1971.0
25%,16.9,4.0,101.0,75.0,2254.0,14.0,1974.0
50%,22.0,6.0,156.0,95.0,2904.0,16.0,1977.0
75%,28.8,8.0,302.0,138.0,3664.0,17.0,1980.0
max,46.6,8.0,455.0,230.0,4997.0,25.0,1983.0


In [203]:
data.corr()

Unnamed: 0,mpg,cylinders,cubicinches,hp,weightlbs,time-to-60,year
mpg,1.0,-0.77671,-0.802379,-0.774905,-0.8238,0.50907,0.550441
cylinders,-0.77671,1.0,0.951246,0.845155,0.894447,-0.578161,-0.322239
cubicinches,-0.802379,0.951246,1.0,0.906413,0.925288,-0.610679,-0.349743
hp,-0.774905,0.845155,0.906413,1.0,0.857125,-0.744873,-0.383869
weightlbs,-0.8238,0.894447,0.925288,0.857125,1.0,-0.47724,-0.278683
time-to-60,0.50907,-0.578161,-0.610679,-0.744873,-0.47724,1.0,0.312311
year,0.550441,-0.322239,-0.349743,-0.383869,-0.278683,0.312311,1.0


In [204]:
normaliza = StandardScaler()

X_scaled = normaliza.fit_transform(data.drop(columns=['brand']))

In [205]:
pca = PCA(n_components=7)
X_pca = pca.fit(X_scaled)

In [206]:
X_pca.explained_variance_ratio_

array([0.72371349, 0.12675138, 0.09126131, 0.02773591, 0.01766794,
       0.00798425, 0.00488572])

In [207]:
X_pca = pca.transform(X_scaled)

In [208]:
X_pca[:3]

array([[ 3.48521925,  0.5263255 , -0.13973834,  0.25879154,  0.08690179,
        -0.17920558,  0.07907785],
       [-2.17691231, -0.28542399, -1.3046126 , -0.10046548, -0.28969849,
        -0.04114712, -0.01312739],
       [ 2.6849731 ,  1.04694609, -0.73215552,  0.49822711, -0.53516815,
        -0.07574501,  0.15995575]])

In [209]:
kmeans = KMeans(n_clusters=3, random_state=42)
kmeans.fit(X_pca[:3])

KMeans(n_clusters=3, random_state=42)

In [210]:
kmeans.cluster_centers_

array([[ 2.6849731 ,  1.04694609, -0.73215552,  0.49822711, -0.53516815,
        -0.07574501,  0.15995575],
       [-2.17691231, -0.28542399, -1.3046126 , -0.10046548, -0.28969849,
        -0.04114712, -0.01312739],
       [ 3.48521925,  0.5263255 , -0.13973834,  0.25879154,  0.08690179,
        -0.17920558,  0.07907785]])

In [244]:
data['eficiencia'] = ''
data.loc[data['mpg'] > 25, 'eficiencia'] = 1
data.loc[data['mpg'] <= 25, 'eficiencia'] = 0
data['eficiencia'] = data['eficiencia'].astype('int')

In [248]:
X_data = data[['cylinders' ,'cubicinches' ,'hp' ,'weightlbs' ,'time-to-60']]
y_data = data['eficiencia']

x_train, x_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.30, random_state=42)

clf_tree = DecisionTreeClassifier(random_state=42)
clf_tree.fit(x_train, y_train)
predito = clf_tree.predict(x_test)

print(accuracy_score(y_test, predito) * 100)

87.34177215189874


In [249]:
confusion_matrix(y_test, predito)

array([[33,  8],
       [ 2, 36]])

In [250]:
clf_log = LogisticRegression(random_state=42)
clf_log.fit(x_train, y_train)
predito = clf_log.predict(x_test)
print(accuracy_score(y_test, predito) * 100)

86.07594936708861


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [251]:
confusion_matrix(y_test, predito)

array([[36,  5],
       [ 6, 32]])