<a href="https://colab.research.google.com/github/ncorreia47/data_engineering/blob/main/vehicle_manufacturing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [194]:
import kagglehub
import os
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import LabelEncoder
from scipy.stats import randint, uniform

In [195]:
def colors_ansi(color_name):
    """Return the ANSI escape sequence for a named terminal text style.

    Parameters
    ----------
    color_name : str
        One of the keys of the table below (for example ``'red'`` or
        ``'light_cyan'``), or ``'reset'`` to restore the default style.

    Returns
    -------
    str
        The matching escape sequence. Names that are not in the table fall
        back to the ``'light_white'`` sequence instead of raising.
    """
    colors = {
        'black': '\033[30m',
        'red': '\033[31m',
        'green': '\033[32m',
        'yellow': '\033[33m',
        'blue': '\033[34m',
        'magenta': '\033[35m',
        'cyan': '\033[36m',
        'white': '\033[37m',
        'gray': '\033[90m',
        'light_red': '\033[91m',
        'light_green': '\033[92m',
        'light_yellow': '\033[93m',
        'light_blue': '\033[94m',
        'light_magenta': '\033[95m',
        'light_cyan': '\033[96m',
        'light_white': '\033[97m',
        # The notebook ends its messages with colors_ansi("reset"). Without this
        # entry the name was unknown and silently produced light white instead
        # of restoring the terminal's default style.
        'reset': '\033[0m',
    }

    return colors.get(color_name, colors['light_white'])

In [196]:
def raw_data(origin_dataset, file_path='Car Data.csv') -> pd.DataFrame:
    """Download one CSV file of a Kaggle dataset and load it into a DataFrame.

    Parameters
    ----------
    origin_dataset : str
        Dataset handle passed to ``kagglehub.dataset_download``.
    file_path : str, optional
        Name of the file inside the dataset to fetch. Defaults to
        ``'Car Data.csv'``, the file this notebook works with.

    Returns
    -------
    pd.DataFrame
        The CSV contents as parsed by ``pd.read_csv`` with default options.

    Raises
    ------
    Exception
        Any error raised while downloading or parsing is reported on stdout
        and then re-raised unchanged.
    """
    print(f'{colors_ansi("green")}{">" * 5} DOWNLOADING...')

    try:
        # The value returned by dataset_download is handed straight to read_csv.
        downloaded_file = kagglehub.dataset_download(origin_dataset, path=file_path)
        df = pd.read_csv(downloaded_file)
    except Exception:
        # Every failure (network, credentials, parsing, ...) ends up here, not
        # only a missing file; the re-raised exception carries the real cause.
        print(f'{colors_ansi("magenta")}FILE NOT FOUND!{colors_ansi("reset")}')
        # Bare raise keeps the original traceback intact.
        raise

    print(f'{colors_ansi("green")}{">" * 5} FILE DOWNLOADED AND DATAFRAME CREATED!{colors_ansi("reset")}')
    print(f'{colors_ansi("green")}{">" * 5} PROCESS FINISHED!{colors_ansi("reset")}')
    return df

In [197]:
# Kaggle dataset handle passed to raw_data(); the sample-creation cell reuses it.
origin_dataset='arnavsmayan/vehicle-manufacturing-dataset'
# Bare last expression: the returned DataFrame is shown as the cell output and is not stored.
raw_data(origin_dataset)

[32m>>>>> DOWNLOADING...
[32m>>>>> FILE DOWNLOADED AND DATAFRAME CREATED![97m
[32m>>>>> PROCESS FINISHED![97m


Unnamed: 0,Car ID,Brand,Model,Year,Color,Mileage,Price,Location
0,1,Toyota,Camry,2018,White,45000,18000,Los Angeles
1,2,Honda,Civic,2019,Blue,35000,16000,New York
2,3,Ford,Focus,2017,Silver,55000,14000,Chicago
3,4,Chevrolet,Cruze,2016,Red,60000,12000,Miami
4,5,Hyundai,Elantra,2018,Black,40000,15000,San Francisco
...,...,...,...,...,...,...,...,...
1995,1996,Hyundai,Palisade,2019,Silver,65000,22000,San Francisco
1996,1997,Toyota,Sienna,2018,Red,55000,16000,Dallas
1997,1998,Honda,Fit,2018,Gray,50000,14000,Atlanta
1998,1999,Ford,Fusion,2017,White,55000,19000,Phoenix


In [198]:
def create_new_sample(df=pd.DataFrame(), sample_limit=10, random_state=None) -> pd.DataFrame:
    """Build a synthetic dataset by independently resampling column groups of ``df``.

    The distinct ("Car ID", "Brand", "Model") rows and the distinct values of
    "Color", "Location", "Mileage" and "Price" are each drawn with replacement
    ``sample_limit`` times and placed side by side. Each group is drawn
    independently, so the resulting combinations are random rather than the
    ones observed in ``df``.

    Parameters
    ----------
    df : pd.DataFrame
        Source data; must contain the seven columns listed above.
    sample_limit : int, optional
        Number of rows to generate. Defaults to 10.
    random_state : int or None, optional
        Seed forwarded to ``np.random.default_rng``. ``None`` (the default)
        gives a different sample on every call; pass an integer to make the
        sample reproducible.

    Returns
    -------
    pd.DataFrame
        ``sample_limit`` rows with columns Car ID, Brand, Model, Color,
        Location, Mileage, Price and a fresh 0..n-1 index.

    Raises
    ------
    KeyError
        If ``df`` lacks any of the required columns.
    ValueError
        If ``df`` has no rows to draw from while ``sample_limit`` is positive.
    """
    print(f'{colors_ansi("cyan")}{">" * 5} INITIALIZING SAMPLE CREATION...{colors_ansi("reset")}')

    # Column groups that are resampled independently of one another.
    column_groups = [["Car ID", "Brand", "Model"], ["Color"], ["Location"], ["Mileage"], ["Price"]]

    missing = [name for group in column_groups for name in group if name not in df.columns]
    if missing:
        raise KeyError(f'create_new_sample: missing required columns {missing}')

    # A single generator feeds every draw so that one seed reproduces the whole sample.
    rng = np.random.default_rng(random_state)

    def _resample(columns):
        """Draw sample_limit rows, with replacement, from the distinct rows of `columns`."""
        unique_rows = df[columns].drop_duplicates()
        positions = rng.integers(0, len(unique_rows), size=sample_limit)
        return unique_rows.iloc[positions].reset_index(drop=True)

    # Randomize data
    print(f'{colors_ansi("cyan")}{"*" * 10} RANDOMIZING DATA...{colors_ansi("reset")}')
    resampled_parts = [_resample(group) for group in column_groups]
    print(f'{colors_ansi("cyan")}{"*" * 10} RANDOMIZATION COMPLETED!{colors_ansi("reset")}')

    # Create sample dataset
    print(f'{colors_ansi("cyan")}{"*" * 10} CREATING SAMPLE DATASET...{colors_ansi("reset")}')
    # Each part already has a 0..n-1 index, so concat aligns them row by row.
    sample = pd.concat(resampled_parts, axis=1)
    print(f'{colors_ansi("green")}{">" * 5} SAMPLE DATASET CREATED! SAMPLE LENGTH: {len(sample)}{colors_ansi("reset")}')

    return sample

In [199]:
# Download the dataset again and resample it into a 120,000-row synthetic frame.
sample = create_new_sample(df=raw_data(origin_dataset), sample_limit=120000)
# Show the generated frame as the cell output.
sample

[32m>>>>> DOWNLOADING...
[32m>>>>> FILE DOWNLOADED AND DATAFRAME CREATED![97m
[32m>>>>> PROCESS FINISHED![97m
[36m>>>>> INITIALIZING SAMPLE CREATION...[97m
[36m********** RANDOMIZING DATA...[97m
[36m********** RANDOMIZATION COMPLETED![97m
[36m********** CREATING SAMPLE DATASET...[97m
[32m>>>>> SAMPLE DATASET CREATED! SAMPLE LENGTH: 120000[97m


Unnamed: 0,Car ID,Brand,Model,Color,Location,Mileage,Price
0,1687,Ford,Mustang,Yellow,San Francisco,40000,12000
1,1072,Hyundai,Sonata,Gray,San Francisco,70000,24000
2,917,Toyota,Camry,Silver,Dallas,50000,20000
3,321,Chevrolet,Malibu,Blue,New York,35000,26000
4,1614,Ford,Mustang,Yellow,Miami,40000,26000
...,...,...,...,...,...,...,...
119995,1937,Chevrolet,Cruze,Yellow,San Francisco,25000,23000
119996,839,Toyota,Prius,Silver,Miami,50000,12000
119997,379,Chevrolet,Equinox,White,San Francisco,70000,23000
119998,184,Ford,Fiesta,Yellow,Chicago,70000,27000


In [200]:
# Binning (not normalization): label every Price and Mileage value by where it
# falls relative to the column's mean and standard deviation.
mean_price = sample['Price'].mean()
std_price = sample['Price'].std()

mean_mileage = sample['Mileage'].mean()
std_mileage = sample['Mileage'].std()

# Range labels, ordered from the lowest interval to the highest:
# 1 - 'Below Average'  -> value <= mean - std
# 2 - 'Low'            -> mean - std < value <= mean
# 3 - 'Average'        -> mean < value <= mean + std
# 4 - 'High'           -> value > mean + std
labels = ['Below Average', 'Low', 'Average', 'High']

# The outer edges are open-ended. With fixed edges at 0 and the observed maximum,
# a value of exactly 0 fell outside every interval (NaN), and pd.cut raised
# whenever mean - std <= 0 or mean + std >= max because the edges were no longer
# strictly increasing.
bins_price = [-np.inf, mean_price - std_price, mean_price, mean_price + std_price, np.inf]
bins_mileage = [-np.inf, mean_mileage - std_mileage, mean_mileage, mean_mileage + std_mileage, np.inf]

# Attach the ranges to the dataframe
sample['price_range'] = pd.cut(sample['Price'], bins=bins_price, labels=labels)
sample['mileage_range'] = pd.cut(sample['Mileage'], bins=bins_mileage, labels=labels)
sample

Unnamed: 0,Car ID,Brand,Model,Color,Location,Mileage,Price,price_range,mileage_range
0,1687,Ford,Mustang,Yellow,San Francisco,40000,12000,Below Average,Low
1,1072,Hyundai,Sonata,Gray,San Francisco,70000,24000,Average,High
2,917,Toyota,Camry,Silver,Dallas,50000,20000,Low,Average
3,321,Chevrolet,Malibu,Blue,New York,35000,26000,High,Low
4,1614,Ford,Mustang,Yellow,Miami,40000,26000,High,Low
...,...,...,...,...,...,...,...,...,...
119995,1937,Chevrolet,Cruze,Yellow,San Francisco,25000,23000,Average,Below Average
119996,839,Toyota,Prius,Silver,Miami,50000,12000,Below Average,Average
119997,379,Chevrolet,Equinox,White,San Francisco,70000,23000,Average,High
119998,184,Ford,Fiesta,Yellow,Chicago,70000,27000,High,High


In [201]:
# Convert the categorical columns to integer codes with LabelEncoder.
# Each column gets its own fitted encoder, kept in `label_encoders`, so the codes
# can be mapped back later with label_encoders[column].inverse_transform(...).
# Refitting one shared encoder would keep only the mapping of the last column.
label_encoders = {}

for column in ['Brand', 'Model', 'Color', 'Location']:
    label_encoder = LabelEncoder()
    sample[column] = label_encoder.fit_transform(sample[column])
    label_encoders[column] = label_encoder

# After the loop `label_encoder` is the encoder fitted on 'Location'.

In [202]:
# 3. Modeling
# Independent variables (X) are the encoded brand, encoded model and mileage;
# the dependent variable (y) is the price.
X, y = sample[['Brand', 'Model', 'Mileage']], sample['Price']

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Hyperparameter search space.
# scipy's randint(low, high) draws integers from the half-open interval [low, high).
param_dist = {
    'n_estimators': randint(100, 500),        # number of trees
    'max_depth': randint(10, 50),             # maximum depth of each tree
    'min_samples_split': randint(2, 20),      # minimum samples required to split a node
    'min_samples_leaf': randint(1, 10),       # minimum samples required in a leaf
    'bootstrap': [True, False],               # sample rows with replacement or not
    # 'auto' is intentionally absent: scikit-learn deprecated it in 1.1 and removed
    # it in 1.3, so candidates that drew it failed to fit. For classifiers it meant
    # the same thing as 'sqrt'.
    'max_features': ['sqrt', 'log2'],         # how many features each split considers
    'class_weight': [None, 'balanced'],       # optionally reweight classes by frequency
}

# NOTE(review): y_train is the Price column chosen in the split cell, so this
# classifier treats every distinct price as its own class - confirm that
# classification rather than regression is intended.
rf_model = RandomForestClassifier(random_state=42)

random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist,           # distributions to sample candidates from
    n_iter=100,                               # number of random candidates to evaluate
    cv=5,                                     # cross-validation folds per candidate
    n_jobs=-1,                                # use every available CPU core
    verbose=2,                                # print progress for each fit
    random_state=42,
    scoring='accuracy'
)

# 100 candidates x 5 folds = 500 fits.
random_search.fit(X_train, y_train)

# Show the best hyperparameters found
print("Melhores hiperparâmetros:", random_search.best_params_)

# Evaluate the refitted best model on the held-out test data
best_model = random_search.best_estimator_
test_accuracy = best_model.score(X_test, y_test)
print(f'Acurácia do modelo otimizado: {test_accuracy:.2f}')

Fitting 5 folds for each of 100 candidates, totalling 500 fits
