# Projeto - Vendas  de carro

In [1]:
import pandas as pd

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
import numpy as np

In [3]:
url = 'https://gist.githubusercontent.com/guilhermesilveira/4d1d4a16ccbf6ea4e0a64a38a24ec884/raw/afd05cb0c796d18f3f5a6537053ded308ba94bf7/car-prices.csv'
df = pd.read_csv(url)

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,mileage_per_year,model_year,price,sold
0,0,21801,2000,30941.02,yes
1,1,7843,1998,40557.96,yes
2,2,7109,2006,89627.5,no
3,3,26823,2015,95276.14,no
4,4,7935,2014,117384.68,yes


In [5]:
mapa_colunas = {
    'Unnamed: 0': 'id',
    'mileage_per_year': 'milhas_por_ano',
    'model_year': 'ano_do_modelo',
    'price': 'preco',
    'sold': 'vendido'
}

In [6]:
df.rename(columns=mapa_colunas, inplace=True)

In [7]:
df.head()

Unnamed: 0,id,milhas_por_ano,ano_do_modelo,preco,vendido
0,0,21801,2000,30941.02,yes
1,1,7843,1998,40557.96,yes
2,2,7109,2006,89627.5,no
3,3,26823,2015,95276.14,no
4,4,7935,2014,117384.68,yes


In [8]:
mapa_vendido = {
    'yes': 1,
    'no': 0
}

In [9]:
df['vendido'] = df['vendido'].map(mapa_vendido)

In [10]:
df.head()

Unnamed: 0,id,milhas_por_ano,ano_do_modelo,preco,vendido
0,0,21801,2000,30941.02,1
1,1,7843,1998,40557.96,1
2,2,7109,2006,89627.5,0
3,3,26823,2015,95276.14,0
4,4,7935,2014,117384.68,1


In [11]:
df.describe()

Unnamed: 0,id,milhas_por_ano,ano_do_modelo,preco,vendido
count,10000.0,10000.0,10000.0,10000.0,10000.0
mean,4999.5,14183.3912,2004.1442,64842.373698,0.58
std,2886.89568,5008.571422,4.68758,25250.592393,0.493583
min,0.0,363.0,1998.0,-840.36,0.0
25%,2499.75,10474.75,2000.0,44796.375,0.0
50%,4999.5,13418.5,2003.0,65225.685,1.0
75%,7499.25,17176.75,2007.0,85111.2375,1.0
max,9999.0,39572.0,2017.0,118929.72,1.0


In [12]:
from datetime import datetime 

In [13]:
df['idade_do_modelo'] = (datetime.today().year - df['ano_do_modelo'])

In [14]:
df.head()

Unnamed: 0,id,milhas_por_ano,ano_do_modelo,preco,vendido,idade_do_modelo
0,0,21801,2000,30941.02,1,20
1,1,7843,1998,40557.96,1,22
2,2,7109,2006,89627.5,0,14
3,3,26823,2015,95276.14,0,5
4,4,7935,2014,117384.68,1,6


In [15]:
df['km_por_ano'] = (df['milhas_por_ano']*1.60934)

In [16]:
df.head()

Unnamed: 0,id,milhas_por_ano,ano_do_modelo,preco,vendido,idade_do_modelo,km_por_ano
0,0,21801,2000,30941.02,1,20,35085.22134
1,1,7843,1998,40557.96,1,22,12622.05362
2,2,7109,2006,89627.5,0,14,11440.79806
3,3,26823,2015,95276.14,0,5,43167.32682
4,4,7935,2014,117384.68,1,6,12770.1129


In [17]:
df.drop(columns=['id', 'milhas_por_ano', 'ano_do_modelo'], inplace=True)

In [18]:
df.head()

Unnamed: 0,preco,vendido,idade_do_modelo,km_por_ano
0,30941.02,1,20,35085.22134
1,40557.96,1,22,12622.05362
2,89627.5,0,14,11440.79806
3,95276.14,0,5,43167.32682
4,117384.68,1,6,12770.1129


In [19]:
x = df[['idade_do_modelo', 'preco', 'km_por_ano']]
y = df[['vendido']]


raw_x_train, raw_x_test, y_train, y_test = train_test_split(x, y, random_state=30, test_size=0.2, stratify=y)

scaler = StandardScaler()
scaler.fit(raw_x_train)
x_train = scaler.transform(raw_x_train)
x_test = scaler.transform(raw_x_test)

model = SVC()

model.fit(x_train, y_train)

y_pred = model.predict(x_test)

acc = accuracy_score(y_test, y_pred)
print(f'A Acurácia é de {acc}')

  y = column_or_1d(y, warn=True)


A Acurácia é de 0.771


## Dummy Classifier

In [20]:
from sklearn.dummy import DummyClassifier

In [21]:
dummy = DummyClassifier()

dummy.fit(x_train, y_train)

y_pred = dummy.predict(x_test)

acc = accuracy_score(y_test, y_pred)
print(f'A Acurácia do Dummy é de {acc}')

A Acurácia do Dummy é de 0.521


## Decision Tree Classifier

In [22]:
from sklearn.tree import DecisionTreeClassifier

In [31]:
SEED = 123929
np.random.seed(SEED)
model = DecisionTreeClassifier(max_depth=3)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
acc = accuracy_score(y_test,y_pred)

print(f'A acurácia é {acc}')

A acurácia é 0.795


# Cross-Validation

In [216]:
from sklearn.model_selection import cross_validate, KFold, StratifiedKFold, GroupKFold

In [93]:
def get_accuracy_interval(x: pd.DataFrame, y: pd.DataFrame, max_depth:int=1, k_intervals:int=1, seed: int=None) -> None:
    if seed:
        SEED = 5
        np.random.seed(SEED)
        
    model = DecisionTreeClassifier(max_depth=max_depth)
    cv = KFold(n_splits=k_intervals, shuffle=True)
    results = cross_validate(model, x, y, cv=cv)
    
    result_mean = results['test_score'].mean()
    result_std = results['test_score'].std()
    result_interval = (result_mean - 2*result_std, result_mean + 2*result_std)
    print(f'Accuracy Mean {result_mean}')
    print(f'Accuracy interval {result_interval}')
    
    return None

In [94]:
get_accuracy_interval(x, y, 3, 3)

Accuracy Mean 0.7867995257834122
Accuracy interval (0.7766680003225641, 0.7969310512442602)


## Stratified KFold

In [98]:
def get_accuracy_interval(x: pd.DataFrame, y: pd.DataFrame, max_depth:int=1, k_intervals:int=1, seed: int=None) -> None:
    if seed:
        SEED = 5
        np.random.seed(SEED)
        
    model = DecisionTreeClassifier(max_depth=max_depth)
    cv = StratifiedKFold(n_splits=k_intervals, shuffle=True)
    results = cross_validate(model, x, y, cv=cv)
    
    result_mean = results['test_score'].mean()
    result_std = results['test_score'].std()
    result_interval = (result_mean - 2*result_std, result_mean + 2*result_std)
    print(f'Accuracy Mean {result_mean}')
    print(f'Accuracy interval {result_interval}')
    
    return None

In [100]:
get_accuracy_interval(x, y, 3, 3)

Accuracy Mean 0.7866001657154319
Accuracy interval (0.7836962185820237, 0.7895041128488401)


In [101]:
get_accuracy_interval(x, y, 3, 3)

Accuracy Mean 0.7870997657654187
Accuracy interval (0.7783632768958019, 0.7958362546350356)


In [215]:
df['modelo'] = df['idade_do_modelo'] + np.random.randint(-3, 4, len(df))
df['modelo'] = df['modelo'].apply(lambda x: abs(x) if x < 0 else x)

In [219]:
def get_accuracy_interval_with_groups(x: pd.DataFrame, y: pd.DataFrame, max_depth:int=1, k_intervals:int=1, seed: int=None, groups: pd.DataFrame=None) -> None:
    if seed:
        SEED = 5
        np.random.seed(SEED)
        
    model = DecisionTreeClassifier(max_depth=max_depth)
    cv = GroupKFold(n_splits=k_intervals)
    results = cross_validate(model, x, y, cv=cv, groups=groups)
    
    result_mean = results['test_score'].mean()
    result_std = results['test_score'].std()
    result_interval = (result_mean - 2*result_std, result_mean + 2*result_std)
    print(f'Accuracy Mean {result_mean}')
    print(f'Accuracy interval {result_interval}')
    
    return None

In [222]:
get_accuracy_interval_with_groups(x=x, y=y, max_depth:int=1, k_intervals=3, seed=3, groups=df['modelo'])

Accuracy Mean 0.7577125033497115
Accuracy interval (0.7494458314223474, 0.7659791752770756)


# Cross-Validation with StandardScaler

In [223]:
from sklearn.preprocessing import StandardScaler

In [226]:
scaler = StandardScaler()
scaler.fit(x_train)
scaled_x_train = scaler.transform(x_train)
scaled_x_test = scaler.transform(x_test)

In [228]:
from sklearn.svm import SVC

In [234]:
model = SVC()
model.fit(scaled_x_train, y_train)
y_pred = model.predict(scaled_x_test)

acc = accuracy_score(y_test, y_pred)
print(f'A Acurácia é de {acc} SEM CROSS-VALIDATION')

cv = GroupKFold(n_splits=10)
results = cross_validate(model, x, y, cv=cv, groups=df['modelo'])

result_mean = results['test_score'].mean()
result_std = results['test_score'].std()
result_interval = (result_mean - 2*result_std, result_mean + 2*result_std)
print(f'Accuracy Mean {result_mean} COM CROSS-VALIDATION')
print(f'Accuracy interval {result_interval} COM CROSS-VALIDATION)')

  y = column_or_1d(y, warn=True)


A Acurácia é de 0.771


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy Mean 0.5801744421462609
Accuracy interval (0.5452878831327843, 0.6150610011597375)


# Using Pipeline

In [235]:
from sklearn.pipeline import Pipeline

In [236]:
scaler = StandardScaler()
model = SVC()

In [241]:
def get_accuracy_interval_with_groups_with_pipeline(x: pd.DataFrame, 
                                                    y: pd.DataFrame, 
                                                    max_depth:int=1, 
                                                    k_intervals:int=1, 
                                                    seed: int=None, 
                                                    groups: pd.DataFrame=None) -> None:
    
    
    pipeline = Pipeline([
        ('transformacao', scaler),
        ('estimador', model)
    ])

    cv = GroupKFold(n_splits=5)
    results = cross_validate(pipeline, x, y, cv=cv, groups=groups)

    result_mean = results['test_score'].mean()
    result_std = results['test_score'].std()
    result_interval = (result_mean - 2*result_std, result_mean + 2*result_std)
    print(f'Accuracy Mean {result_mean}')
    print(f'Accuracy interval {result_interval}')

In [243]:
get_accuracy_interval_with_groups_with_pipeline(x=x, y=y, groups=df['modelo'])

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy Mean 0.7651021083461149
Accuracy interval (0.7411945559397486, 0.7890096607524812)
