In [1]:
import pandas as pd

uri = "https://gist.githubusercontent.com/guilhermesilveira/4d1d4a16ccbf6ea4e0a64a38a24ec884/raw/afd05cb0c796d18f3f5a6537053ded308ba94bf7/car-prices.csv"
df = pd.read_csv(uri, index_col=0)
df.head()

Unnamed: 0,mileage_per_year,model_year,price,sold
0,21801,2000,30941.02,yes
1,7843,1998,40557.96,yes
2,7109,2006,89627.5,no
3,26823,2015,95276.14,no
4,7935,2014,117384.68,yes


### sold para 1 e 0

In [2]:
df['sold'].unique()

array(['yes', 'no'], dtype=object)

In [3]:
trocar = {
    'no' : 0,
    'yes' : 1
}
df['sold'] = df['sold'].map(trocar)
df.head()

Unnamed: 0,mileage_per_year,model_year,price,sold
0,21801,2000,30941.02,1
1,7843,1998,40557.96,1
2,7109,2006,89627.5,0
3,26823,2015,95276.14,0
4,7935,2014,117384.68,1


### Ano modelo para idade

In [4]:
from datetime import datetime

ano_atual = datetime.today().year
df['age'] = ano_atual - df.model_year
df.head()

Unnamed: 0,mileage_per_year,model_year,price,sold,age
0,21801,2000,30941.02,1,23
1,7843,1998,40557.96,1,25
2,7109,2006,89627.5,0,17
3,26823,2015,95276.14,0,8
4,7935,2014,117384.68,1,9


### mileage_per_year para km_per_year

In [5]:
df['km_per_year'] = df.mileage_per_year * 1.609344
df.head()

Unnamed: 0,mileage_per_year,model_year,price,sold,age,km_per_year
0,21801,2000,30941.02,1,23,35085.308544
1,7843,1998,40557.96,1,25,12622.084992
2,7109,2006,89627.5,0,17,11440.826496
3,26823,2015,95276.14,0,8,43167.434112
4,7935,2014,117384.68,1,9,12770.14464


In [6]:
df = df.drop(columns=['mileage_per_year', 'model_year'], axis=1)
df.head()

Unnamed: 0,price,sold,age,km_per_year
0,30941.02,1,23,35085.308544
1,40557.96,1,25,12622.084992
2,89627.5,0,17,11440.826496
3,95276.14,0,8,43167.434112
4,117384.68,1,9,12770.14464


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import numpy as np

X = df[['price', 'age', 'km_per_year']]
y = df['sold']

# Escala as características
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)
X_scaled=X

SEED = 5
np.random.seed(SEED)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, stratify=y)
print(f'Treinando com {len(X_train)} amostras e testando com {len(X_test)} amostras')

# Aumenta o número de iterações
model = LinearSVC(dual=True)
model.fit(X_train, y_train)
previsoes = model.predict(X_test)

acuracia = accuracy_score(y_test, previsoes)
print(f'A acurácia foi de {acuracia * 100:.2f}%')

Treinando com 7500 amostras e testando com 2500 amostras
A acurácia foi de 46.88%




### Usando o DummyClassifier

In [8]:
from sklearn.dummy import DummyClassifier

In [9]:
dummy_stratified = DummyClassifier(strategy='stratified')
dummy_stratified.fit(X_train, y_train)
acuracia = dummy_stratified.score(X_test, y_test)

print(f'A acurácia do dummy stratified foi de {acuracia * 100:.2f}%')

A acurácia do dummy stratified foi de 52.44%


In [10]:
dummy_mostfrequent = DummyClassifier(strategy='most_frequent')
dummy_mostfrequent.fit(X_train, y_train)
acuracia = dummy_mostfrequent.score(X_test, y_test)

print(f'A acurácia do dummy mostfrequent foi de {acuracia * 100:.2f}%')

A acurácia do dummy mostfrequent foi de 58.00%


### Usando o SVC

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import numpy as np

X = df[['price', 'age', 'km_per_year']]
y = df['sold']

# Escala as características
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

SEED = 5
np.random.seed(SEED)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, stratify=y)
print(f'Treinando com {len(X_train)} amostras e testando com {len(X_test)} amostras')

# Aumenta o número de iterações
model = SVC()
model.fit(X_train, y_train)
previsoes = model.predict(X_test)

acuracia = accuracy_score(y_test, previsoes)
print(f'A acurácia foi de {acuracia * 100:.2f}%')

Treinando com 7500 amostras e testando com 2500 amostras
A acurácia foi de 77.52%


## Usando o DecisionTreeClassifier

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import numpy as np

X = df[['price', 'age', 'km_per_year']]
y = df['sold']

# Escala as características
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

SEED = 5
np.random.seed(SEED)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, stratify=y)
print(f'Treinando com {len(X_train)} amostras e testando com {len(X_test)} amostras')

# Aumenta o número de iterações
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
previsoes = model.predict(X_test)

acuracia = accuracy_score(y_test, previsoes)
print(f'A acurácia foi de {acuracia * 100:.2f}%')

Treinando com 7500 amostras e testando com 2500 amostras
A acurácia foi de 73.32%
