# Installing required libraries

In [None]:
# %pip install numpy
# %pip install pandas
# %pip install matplotlib
# %pip install seaborn
# %pip install scipy

In [None]:
from preprocessor.DataPreprocessor import DataPreprocessor
from modelling.SplitDataset import SplitDataset
from preprocessor.sklearn_preprocessor import preprocess_data

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Directory that holds the CSV files; `path` is reused by later cells when
# writing their outputs.
path = './data/'

# Read the raw training set into a DataFrame.
train_csv = path + 'train.csv'
data = pd.read_csv(train_csv)

In [None]:
# Report how many rows were loaded, then list the column labels.
row_count = len(data)
print(f'Số lượng dòng của data: {row_count}')
print(data.columns)

In [None]:
# Preview the first rows of the raw data.
data.head()

In [None]:
# Print the frame summary (columns, dtypes, non-null counts).
data.info()

# I. Preprocessing Data

## 1. Cleaning our data

-

In [None]:
# dropna / drop_duplicates return a NEW frame, so the result has to be
# assigned back; otherwise the rows are never actually removed.
# The index is reset so later cells still see a contiguous 0..n-1 index.
data = data.dropna(how='all')
data = data.drop_duplicates().reset_index(drop=True)

# "1197 cc" -> 1197.0
data["Engine"] = data["Engine"].str.replace(' cc', '', regex=False).astype(float)

# "<power> bhp @ <rpm> rpm" -> two numeric columns.
# The power figure may carry a decimal part ("87.5 bhp"); a plain (\d+) would
# match only the digits after the dot and silently record 5 instead of 87.5.
# Rows that do not match the pattern become NaN.
data[['Max Power BHP', 'Max Power RPM']] = data['Max Power'].str.extract(
    r'(\d+(?:\.\d+)?)\s*bhp\s*@\s*(\d+)\s*rpm'
)
data['Max Power BHP'] = pd.to_numeric(data['Max Power BHP'], errors='coerce')
data['Max Power RPM'] = pd.to_numeric(data['Max Power RPM'], errors='coerce')

# "<torque> Nm @ <rpm> rpm" -> two numeric columns (same decimal handling).
data[['Max Torque Nm', 'Max Torque RPM']] = data['Max Torque'].str.extract(
    r'(\d+(?:\.\d+)?)\s*Nm\s*@\s*(\d+)\s*rpm'
)
data['Max Torque Nm'] = pd.to_numeric(data['Max Torque Nm'], errors='coerce')
data['Max Torque RPM'] = pd.to_numeric(data['Max Torque RPM'], errors='coerce')

# The raw text columns are no longer needed once their parts are extracted.
data = data.drop(columns=['Max Power', 'Max Torque'])

Split the data into train / validation / test sets.


In [None]:
# Frequency of each Drivetrain category.
data["Drivetrain"].value_counts()

In [None]:
# Frequency of each Fuel Type category.
data["Fuel Type"].value_counts()

In [None]:
# Show which fuel categories exist before anything is merged.
print("Unique values before:", data["Fuel Type"].unique())

# Collapse the listed infrequent fuel categories into one 'Others' bucket;
# every other value (including missing ones) is left untouched.
rare_fuel_types = ['Electric', 'LPG', 'Hybrid', 'CNG + CNG', 'Petrol + LPG']
is_rare = data["Fuel Type"].isin(rare_fuel_types)
data["Fuel Type"] = data["Fuel Type"].mask(is_rare, 'Others')

# Show the categories that remain after merging.
print("Unique values after:", data["Fuel Type"].unique())

In [None]:
# Run the project's preprocess_data helper on the cleaned frame.
# It yields four values, unpacked below as the three splits plus a
# preprocessor object (presumably the fitted pipeline — confirm in
# preprocessor.sklearn_preprocessor).
split_result = preprocess_data(data=data, save_path='./processed_data/')
train_df, val_df, test_df, preprocessor = split_result

In [None]:
# Write each split into the data directory, leaving out the index column.
for file_name, split_df in (
    ('train_.csv', train_df),
    ('val_.csv', val_df),
    ('test_.csv', test_df),
):
    split_df.to_csv(path + file_name, index=False)

## 2. Dealing with missing values

In [None]:
# Wrap the current frame in the project's DataPreprocessor helper; this
# rebinds `preprocessor`, which the following cells use for imputation.
preprocessor = DataPreprocessor(data)

# Number of missing values in each column.
data.isna().sum()

In [None]:
# Labels of the columns that contain at least one missing value.
has_missing = data.isnull().any()
data.columns[has_missing]

### 2.1. Filling with mean

In [None]:
# Numeric columns handed to DataPreprocessor.fill_mean.
mean_columns = [
    'Engine',
    'Length',
    'Width',
    'Height',
    'Seating Capacity',
    'Fuel Tank Capacity',
    'Max Power BHP',
    'Max Power RPM',
    'Max Torque Nm',
    'Max Torque RPM',
]
data = preprocessor.fill_mean(mean_columns)

# Missing values remaining per column.
data.isnull().sum()

### 2.2. Filling with median

In [None]:
# No columns are currently selected for median imputation (empty list).
median_columns = []
data = preprocessor.fill_median(median_columns)

# Missing values remaining per column.
data.isna().sum()

### 2.3. Filling with the most common value

In [None]:
# No columns are currently selected for most-common-value imputation (empty list).
mode_columns = []
data = preprocessor.fill_mode(mode_columns)

# Missing values remaining per column.
data.isna().sum()

### 2.4. Filling using KNN (K-nearest neighbour)

In [None]:
# Drivetrain is imputed through DataPreprocessor.fill_knn with k=5
# (presumably k nearest neighbours, per the section heading — confirm in
# DataPreprocessor).
knn_columns = ['Drivetrain']
data = preprocessor.fill_knn(columns=knn_columns, k=5)

# Missing values remaining per column.
data.isna().sum()

## 3. Encoding categorical values

### 3.1. Ordinal encoding

In [None]:
# Re-wrap the imputed frame so the encoder works on the current data.
preprocessor = DataPreprocessor(data)

# Map the ownership history onto an integer scale. "Fourth" and "4 or More"
# deliberately share the same rank.
# The result is assigned to `data` (previously the typo `ata`), matching how
# every other DataPreprocessor call in this notebook rebinds `data`, so the
# displayed frame below is the encoded one.
data = preprocessor.ordinal_encode(col='Owner', to={
    "UnRegistered Car": 0,
    "First": 1,
    "Second": 2,
    "Third": 3,
    "Fourth": 4,
    "4 or More": 4
})

# Display the encoded frame.
data

### 3.2. One-hot encoding

In [None]:
# Display the frame as it stands before one-hot encoding.
data

In [None]:
# Categorical columns handed to DataPreprocessor.one_hot_encode.
one_hot_columns = ['Drivetrain', 'Fuel Type']
threshold = 0 # should be an integer; its exact meaning is defined by one_hot_encode — TODO confirm
data = preprocessor.one_hot_encode(columns=one_hot_columns, threshold=threshold)

# Display the encoded frame.
data

## 4. Scaling our data

Plot each numeric feature against `Price` before scaling.

In [None]:
import math

def visualize_relationship_with_a_variable(df: pd.DataFrame, col: str=None):
    """Scatter-plot every numeric column of ``df`` against a target column.

    One subplot is drawn per numeric feature (three per row). Each title
    carries the feature's correlation with the target, taken from
    ``DataFrame.corr()``. Unused cells of the subplot grid are removed
    before the figure is shown.

    Args:
        df: Frame to visualise; only its numeric columns are used.
        col: Name of the numeric target column placed on the y-axis.
            Defaults to ``"Price"`` when omitted.

    Raises:
        KeyError: If the target column is not among the numeric columns.
    """
    # Keep the target under its own name so the loop below cannot shadow it.
    target = "Price" if col is None else col

    numeric_data = df.select_dtypes(include=['number'])
    if target not in numeric_data.columns:
        raise KeyError(f"{target!r} is not a numeric column of the frame")

    correlations = numeric_data.corr()[target]
    feature_cols = [c for c in numeric_data.columns if c != target]
    if not feature_cols:
        # Nothing to plot; an empty subplot grid would raise.
        return

    cols_per_row = 3
    num_rows = math.ceil(len(feature_cols) / cols_per_row)

    # squeeze=False always yields a 2-D array of axes, so flatten() is safe
    # whatever the grid shape.
    fig, axes = plt.subplots(
        num_rows,
        cols_per_row,
        figsize=(5 * cols_per_row, 5 * num_rows),
        squeeze=False,
    )
    axes = axes.flatten()

    for ax, feature in zip(axes, feature_cols):
        corr_value = correlations[feature]
        sns.scatterplot(x=numeric_data[feature], y=numeric_data[target], ax=ax)
        ax.set_title(f"{feature} vs {target} (corr={corr_value:.2f})")
        ax.set_xlabel(feature)
        ax.set_ylabel(target)

    # Remove the grid cells left over in the last row.
    for unused_ax in axes[len(feature_cols):]:
        fig.delaxes(unused_ax)

    plt.tight_layout()
    plt.show()

In [None]:
# Plot the numeric features against Price before any scaling is applied.
visualize_relationship_with_a_variable(data, 'Price')

### 4.1. Log transformation

In [None]:
# Columns handed to DataPreprocessor.log_norm.
log_column = ['Price', 'Max Torque RPM'] ## because data['Price'] contains exceptional (abnormally high) values
data = preprocessor.log_norm(log_column)

# Display the transformed frame.
data

### 4.2. Min-Max scaling

In [None]:
# Columns handed to DataPreprocessor.minmax_norm.
minmax_columns = ['Year', 'Kilometer']
data = preprocessor.minmax_norm(minmax_columns)

# Display the scaled frame.
data

### 4.3. Standardization

In [None]:
# Columns handed to DataPreprocessor.standard_norm.
std_column = ['Max Power RPM', 'Owner']
data = preprocessor.standard_norm(std_column)

# Display the scaled frame.
data

### 4.4 Robust scaling

In [None]:
# No columns are currently selected for robust scaling (empty list).
robust_column = []
# NOTE(review): the meaning of `focus` is defined by DataPreprocessor.robust_norm — confirm.
focus = 0.5
data = preprocessor.robust_norm(robust_column, focus)

# Display the resulting frame.
data

Visualize the relationships with `Price` again after scaling.

In [None]:
# Plot the numeric features against Price again, now on the transformed data.
visualize_relationship_with_a_variable(data, 'Price')

In [None]:
# Save the preprocessed frame into the data directory, without the index column.
data.to_csv(path + 'preprocessed_train_.csv', index=False)