# Installing required libraries

In [None]:
# Install the third-party packages into the environment of the running kernel.
# NOTE(review): scipy is installed here but not imported in the visible cells —
# presumably it is used by DataPreprocessor; confirm.
%pip install numpy
%pip install pandas
%pip install matplotlib
%pip install seaborn
%pip install scipy

In [None]:
from DataPreprocessor import DataPreprocessor

import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Directory holding the input files (kept with its trailing slash so file
# names can be appended directly).
path = './data/'

# Load the training set into a DataFrame.
data = pd.read_csv(f'{path}train.csv')


In [None]:
# Report how many rows were loaded (the message is Vietnamese for
# "Number of rows of data") and list the column names.
print(f'Số lượng dòng của data: {len(data)}')
print(data.columns)

In [None]:
# Preview the first few rows of the loaded DataFrame.
data.head()

In [None]:
# Print a summary of the DataFrame (columns, non-null counts, dtypes).
data.info()

# I. Preprocessing Data

## 1. Cleaning our data

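- Drop fully empty rows and duplicate rows, convert `Engine` to a number, and split `Max Power` / `Max Torque` into separate numeric value and RPM columns.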

In [None]:
# Remove rows that are entirely empty and exact duplicate rows.
# Both methods return a NEW DataFrame (they do not modify `data` in place),
# so the result must be assigned back. Resetting the index afterwards gives a
# fresh, contiguous RangeIndex and a standalone frame, which avoids
# SettingWithCopyWarning on the column assignments below.
data = data.dropna(how='all').drop_duplicates().reset_index(drop=True)

# Strip the ' cc' unit suffix from "Engine" and convert the column to float.
data["Engine"] = data["Engine"].str.replace(' cc', '', regex=False).astype(float)

# Split "Max Power" ("<value> bhp @ <rpm> rpm") into two numeric columns.
# The value group accepts an optional decimal part: with a plain `\d+`,
# a string such as "87.5 bhp @ 6000 rpm" would match from the "5" and yield 5.
# Rows that do not match the pattern become NaN in both columns.
power_pattern = r'(\d+(?:\.\d+)?)\s*bhp\s*@\s*(\d+)\s*rpm'
data[['Max Power BHP', 'Max Power RPM']] = data['Max Power'].str.extract(power_pattern)
data['Max Power BHP'] = pd.to_numeric(data['Max Power BHP'], errors='coerce')
data['Max Power RPM'] = pd.to_numeric(data['Max Power RPM'], errors='coerce')

# Same treatment for "Max Torque" ("<value> Nm @ <rpm> rpm").
torque_pattern = r'(\d+(?:\.\d+)?)\s*Nm\s*@\s*(\d+)\s*rpm'
data[['Max Torque Nm', 'Max Torque RPM']] = data['Max Torque'].str.extract(torque_pattern)
data['Max Torque Nm'] = pd.to_numeric(data['Max Torque Nm'], errors='coerce')
data['Max Torque RPM'] = pd.to_numeric(data['Max Torque RPM'], errors='coerce')

# The raw text columns are no longer needed once their parts are extracted.
data = data.drop(columns=['Max Power', 'Max Torque'])

In [None]:
# Count the missing values in each column after cleaning.
data.isna().sum()

## 2. Dealing with missing values

In [None]:
# Hand the cleaned DataFrame to the project's DataPreprocessor helper.
# NOTE(review): whether DataPreprocessor mutates `data` in place or works on
# its own copy is not visible here — the later `data` inspections assume the
# former; confirm.
preprocessor = DataPreprocessor(data)

### 2.1. Filling with mean

In [None]:
# Columns passed to fill_mean; the list is currently empty.
mean_columns = []
preprocessor.fill_mean(mean_columns)

# Re-count missing values per column.
# NOTE(review): this reads `data`, so it only reflects the fill if
# DataPreprocessor modifies the DataFrame in place — confirm.
data.isna().sum()

### 2.2. Filling with median

In [None]:
# Columns passed to fill_median; the list is currently empty.
median_columns = []
preprocessor.fill_median(median_columns)

# Re-count missing values per column.
# NOTE(review): only meaningful if DataPreprocessor edits `data` in place — confirm.
data.isna().sum()

### 2.3. Filling with the most common value

In [None]:
# Columns passed to fill_mode; the list is currently empty.
mode_columns = []
preprocessor.fill_mode(mode_columns)

# Re-count missing values per column.
# NOTE(review): only meaningful if DataPreprocessor edits `data` in place — confirm.
data.isna().sum()

### 2.4. Filling using KNN (K-nearest neighbour)

In [None]:
# Columns passed to fill_knn; the list is currently empty.
knn_columns = []
preprocessor.fill_knn(knn_columns)

# Re-count missing values per column.
# NOTE(review): only meaningful if DataPreprocessor edits `data` in place — confirm.
data.isna().sum()

## 3. Encoding categorical values

### 3.1. Ordinal encoding

In [None]:
# Columns passed to ordinal_encode; the list is currently empty.
ordinal_columns = []
preprocessor.ordinal_encode(ordinal_columns)

# Display the DataFrame.
# NOTE(review): shows the encoded result only if DataPreprocessor edits
# `data` in place — confirm.
data

### 3.2. One-hot encoding

In [None]:
# Columns passed to one_hot_encode; the list is currently empty.
one_hot_columns = []
# NOTE(review): the meaning of `threshold` is defined by
# DataPreprocessor.one_hot_encode and is not visible here — confirm.
threshold = 0 # should be an integer
preprocessor.one_hot_encode(one_hot_columns, threshold)

# Display the DataFrame.
# NOTE(review): shows the encoded result only if DataPreprocessor edits
# `data` in place — confirm.
data

## 4. Scaling our data

Plot every numeric feature against `Price` before scaling to see how they relate.

In [None]:
import math

def visualize_relationship_with_a_variable(df: pd.DataFrame, col: str=None):
    """Scatter-plot every numeric column of ``df`` against a target column.

    One subplot is drawn per numeric feature (three per row), with the feature
    on the x axis and the target on the y axis. Each title shows the
    correlation (``DataFrame.corr`` default) between the feature and the
    target. Unused grid cells are removed before the figure is shown.

    Args:
        df: DataFrame to visualize; only its numeric columns are used.
        col: Name of the target column. ``None`` falls back to ``"Price"``.

    Returns:
        None. The figure is displayed with ``plt.show()``. If the target is
        the only numeric column there is nothing to plot and the function
        returns without creating a figure.

    Raises:
        KeyError: If the target column is not a numeric column of ``df``.
    """
    target = "Price" if col is None else col

    numeric_data = df.select_dtypes(include=['number'])
    if target not in numeric_data.columns:
        raise KeyError(f"{target!r} is not a numeric column of the DataFrame")

    # Every numeric column except the target gets its own subplot.
    feature_cols = [c for c in numeric_data.columns if c != target]
    if not feature_cols:
        # plt.subplots cannot build a grid with zero rows.
        return

    correlations = numeric_data.corr()[target]

    cols_per_row = 3
    num_rows = math.ceil(len(feature_cols) / cols_per_row)

    # squeeze=False always yields a 2-D array of axes, whatever the grid shape.
    fig, axes = plt.subplots(
        num_rows,
        cols_per_row,
        squeeze=False,
        figsize=(5 * cols_per_row, 5 * num_rows),
    )
    axes = axes.flatten()

    for ax, feature in zip(axes, feature_cols):
        sns.scatterplot(x=numeric_data[feature], y=numeric_data[target], ax=ax)
        ax.set_title(f"{feature} vs {target} (corr={correlations[feature]:.2f})")
        ax.set_xlabel(feature)
        ax.set_ylabel(target)

    # Remove the empty cells left over in the last row of the grid.
    for ax in axes[len(feature_cols):]:
        fig.delaxes(ax)

    plt.tight_layout()
    plt.show()

In [None]:
# Plot the numeric columns against 'Price' before any scaling is applied.
visualize_relationship_with_a_variable(data, 'Price')

### 4.1. Log transformation

In [None]:
# Columns passed to log_norm; the list is currently empty.
log_column = []
preprocessor.log_norm(log_column)

# Display the DataFrame.
# NOTE(review): shows the transformed values only if DataPreprocessor edits
# `data` in place — confirm.
data

### 4.2. Min-Max scaling

In [None]:
# Columns passed to minmax_norm; the list is currently empty.
minmax_columns = []
preprocessor.minmax_norm(minmax_columns)

# Display the DataFrame.
# NOTE(review): shows the scaled values only if DataPreprocessor edits
# `data` in place — confirm.
data

### 4.3. Standardization

In [None]:
# Columns passed to standard_norm; the list is currently empty.
std_column = []
preprocessor.standard_norm(std_column)

# Display the DataFrame.
# NOTE(review): shows the standardized values only if DataPreprocessor edits
# `data` in place — confirm.
data

### 4.4. Robust scaling

In [None]:
# Columns passed to robust_norm; the list is currently empty.
robust_column = []
# NOTE(review): the meaning of `focus` is defined by
# DataPreprocessor.robust_norm and is not visible here — confirm.
focus = 0.5
preprocessor.robust_norm(robust_column, focus)

# Display the DataFrame.
# NOTE(review): shows the scaled values only if DataPreprocessor edits
# `data` in place — confirm.
data

Visualize the relationships with `Price` again after scaling.

In [None]:
# Plot the numeric columns against 'Price' again, after the scaling cells.
visualize_relationship_with_a_variable(data, 'Price')