# Load Required Packages

In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import pickle

# Define Variables

In [2]:
# Base directory for the input CSV/Parquet files and the Parquet outputs.
# Paths are built by plain string concatenation (f"{Folder}<name>"), so the
# trailing path separator is required.
Folder = "E:\\datasets\\car\\"

In [3]:
# Continuous features; standardised together with one StandardScaler.
NumericalColumns = ["mileage", "engine_displacement", "engine_power"]
# Columns whose non-null values are cast to str before encoding. Every name
# here also appears in StringColumns, i.e. they are handled as categories.
ConvertToStringColumns = ["stk_year", "door_count", "seat_count", "manufacture_year"]
# Target column. Kept as a one-element list so that TData[PriceColumn] is a
# DataFrame, which is what the scaler is fitted on.
PriceColumn = ["price_eur"]
# Categorical features that are one-hot encoded with pd.get_dummies.
StringColumns = [
    "type",  # Combination of Maker and Model
    "manufacture_year",
    "body_type",
    "stk_year",
    "transmission",
    "door_count",
    "seat_count",
    "fuel_type",
]

# Load & Convert to Parquet

In [4]:
# Prefer the Parquet copy of the cleaned data set. When it does not exist yet
# (first run), read the CSV instead and write the Parquet copy so later runs
# can use it.
try:
    Data = pd.read_parquet(f"{Folder}clean_df.parq")
except FileNotFoundError:
    # Only a missing Parquet file triggers the CSV fallback. Any other failure
    # (unreadable file, missing Parquet engine, Ctrl-C, ...) is allowed to
    # propagate instead of being silently hidden by a bare `except:`.
    Data = pd.read_csv(f"{Folder}clean_df.csv", low_memory=False)
    Data.to_parquet(f"{Folder}clean_df.parq")

# Make an Extra Copy

In [5]:
# Work on an independent (deep) copy so the loaded frame `Data` stays untouched.
TData = Data.copy(deep=True)

# Combine Maker and Model
A model name is only meaningful together with its maker, so maker and model are combined into a single feature instead of being modelled separately.

In [6]:
# Build the combined "<maker>_<model>" feature. Missing makers/models are
# replaced by empty strings first, so the concatenation never produces NaN
# (a row with both values missing becomes "_"). The vectorised fillna
# replaces the original per-element lambda.
TData["type"] = TData["maker"].fillna("") + "_" + TData["model"].fillna("")

# Delete Any Unwanted Variables

In [7]:
# Columns to discard: the two date fields, plus maker and model, which are
# already represented by the combined "type" column.
UnWantedVariables = ["date_created", "date_last_seen", "model", "maker"]
# errors="ignore" skips any listed column that is not present in the frame.
TData.drop(columns=UnWantedVariables, errors="ignore", inplace=True)

# Transform some columns to string

In [8]:
# Cast the year/count columns to text so they are encoded as categories.
# Null entries are passed through unchanged instead of becoming the
# literal string "nan".
for column_name in ConvertToStringColumns:
    TData[column_name] = TData[column_name].map(
        lambda value: value if pd.isnull(value) else str(value)
    )

# Keep only the modelling columns, in a fixed order: categorical features,
# numerical features, then the price target.
TData = TData[StringColumns + NumericalColumns + PriceColumn]

# View Result

In [21]:
# Count the rows per maker/model combination and copy the table to the
# system clipboard for inspection outside the notebook.
type_counts = TData["type"].value_counts()
type_counts.to_clipboard()

In [None]:
# Display the dtype of every remaining column.
TData.dtypes

# Scale Numerical Columns

In [None]:
# Standardise the numerical feature columns in place. The fitted scaler stays
# available as NumericalScaler.
NumericalScaler = StandardScaler()
TData[NumericalColumns] = NumericalScaler.fit_transform(TData[NumericalColumns])

In [None]:
# Persist the fitted feature scaler. The relative file name means it is
# written to the current working directory, not to Folder.
with open("NumericalScaler.obj", mode="wb") as scaler_file:
    pickle.dump(NumericalScaler, scaler_file)

In [None]:
# Standardise the price target with its own scaler, separate from the one
# used for the numerical features.
PriceScaler = StandardScaler()
TData[PriceColumn] = PriceScaler.fit_transform(TData[PriceColumn])

In [None]:
# Persist the fitted price scaler. The relative file name means it is
# written to the current working directory, not to Folder.
with open("PriceScaler.obj", mode="wb") as scaler_file:
    pickle.dump(PriceScaler, scaler_file)

# One Hot Encoding Categorical Variables

https://stackoverflow.com/questions/28465633/easy-way-to-apply-transformation-from-pandas-get-dummies-to-new-data

In [None]:
# One-hot encode the categorical columns listed in StringColumns. Columns not
# listed (numerical features and price) are carried over unchanged.
Encoded = pd.get_dummies(data=TData, columns=StringColumns)

In [None]:
# Pickle the first encoded row. Its columns record the layout produced by
# get_dummies (presumably used to align new data to the same columns, as in
# the Stack Overflow link above -- confirm against the consumer).
HeadObject = Encoded.head(n=1)
with open("HeadObject.obj", mode="wb") as head_file:
    pickle.dump(HeadObject, head_file)

# Fill remaining nans with 0

In [None]:
# Replace every remaining missing value with 0.
Encoded = Encoded.fillna(value=0)

# Store to Parquet

In [None]:
# Write the first five encoded rows as a small sample file.
sample_rows = Encoded.head(n=5)
sample_rows.to_parquet(f"{Folder}SampleEncoded.parq")

In [None]:
# Write the complete encoded table (features and price together).
encoded_path = f"{Folder}Encoded.parq"
Encoded.to_parquet(encoded_path)

In [None]:
# Split the encoded table into the feature matrix X (every column except the
# price) and the target y (the price column alone, kept as a one-column
# DataFrame), then store each as its own Parquet file.
y = Encoded[["price_eur"]]
X = Encoded.drop(columns=["price_eur"])
X.to_parquet(f"{Folder}X.parq")
y.to_parquet(f"{Folder}y.parq")