# Load Required Packages

In [1]:
import os
import pickle

import pandas as pd
from sklearn.preprocessing import StandardScaler

# Define Variables

In [2]:
# Root folder for the input CSV and the Parquet outputs of this notebook.
# Defaults to the original local path; set the CAR_DATA_DIR environment
# variable to run against another location without editing the notebook.
Folder = os.environ.get("CAR_DATA_DIR") or "E:\\datasets\\car\\"
# File names are appended directly (f"{Folder}name"), so the value must end
# with a path separator.
if not Folder.endswith(("\\", "/")):
    Folder += os.sep

In [3]:
# Continuous features; these are standardised further down.
NumericalColumns = [
    "mileage",
    "engine_displacement",
    "engine_power",
]

# Columns whose values are cast to strings before encoding.
ConvertToStringColumns = [
    "stk_year",
    "door_count",
    "seat_count",
    "manufacture_year",
]

# Target column, kept as a one-element list so it selects a DataFrame.
PriceColumn = ["price_eur"]

# Categorical features that are one-hot encoded.
# The combined maker/model "type" column is currently left out of this list.
StringColumns = [
    "maker",
    "model",
    "manufacture_year",
    "body_type",
    "stk_year",
    "transmission",
    "door_count",
    "seat_count",
    "fuel_type",
]

# Load & Convert to Parquet

In [4]:
# Read the cleaned listings CSV from the data folder. low_memory=False has
# pandas parse the file in one pass rather than in chunks.
Data = pd.read_csv(Folder + "clean_df.csv", low_memory=False)

# Make an Extra Copy

In [5]:
# Work on an independent deep copy so the frame loaded into `Data` stays untouched.
TData = Data.copy(deep=True)

# Combine Maker and Model
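A model name is only meaningful together with its maker, so the two should not be treated as independent features; here they are combined into a single `type` value.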

In [6]:
# Build a combined "<maker>_<model>" label. Missing makers/models are replaced
# by empty strings first so the concatenation never produces NaN.
# NOTE(review): "type" is not part of the column selection made later in this
# notebook, so it is currently not used downstream.
TData["type"] = TData["maker"].fillna("") + "_" + TData["model"].fillna("")

# Delete Any Unwanted Variables

In [7]:
# Drop bookkeeping columns that are not used as features. errors="ignore"
# skips names that are absent, so the cell can be re-run safely.
UnWantedVariables = ["date_created", "date_last_seen"]  # , "model", "maker"
TData = TData.drop(columns=UnWantedVariables, errors="ignore")

# Transform some columns to string

In [8]:
# Cast the year/count columns to strings so they are handled as categories.
# na_action="ignore" leaves missing values as they are instead of turning
# them into the literal string "nan".
for c in ConvertToStringColumns:
    TData[c] = TData[c].map(str, na_action="ignore")

# Keep only the modelling columns. The explicit .copy() makes TData an
# independent frame, so the column assignments in the scaling cells below do
# not trigger pandas' SettingWithCopyWarning.
TData = TData[StringColumns + NumericalColumns + PriceColumn].copy()

# View Result

In [9]:
# Preview the first five rows of the selected columns.
TData.head(n=5)

Unnamed: 0,maker,model,manufacture_year,body_type,stk_year,transmission,door_count,seat_count,fuel_type,mileage,engine_displacement,engine_power,price_eur
0,ford,galaxy,2011.0,compact,,man,5.0,7.0,diesel,151000.0,2000.0,138.12506,10584.75
1,skoda,octavia,2012.0,compact,,man,5.0,5.0,diesel,143476.0,2000.0,108.62262,8882.31
2,bmw,,2010.0,compact,,man,5.0,5.0,diesel,97676.0,1995.0,113.9867,12065.06
3,skoda,fabia,2004.0,compact,,man,5.0,5.0,gasoline,111970.0,1200.0,84.48426,2960.77
4,skoda,fabia,2004.0,compact,,man,5.0,5.0,gasoline,128886.0,1200.0,84.48426,2738.71


In [10]:
# Check the column dtypes after conversion: the categorical columns should be
# object and the numeric/price columns float64.
TData.dtypes

maker                   object
model                   object
manufacture_year        object
body_type               object
stk_year                object
transmission            object
door_count              object
seat_count              object
fuel_type               object
mileage                float64
engine_displacement    float64
engine_power           float64
price_eur              float64
dtype: object

# Scale Numerical Columns

In [11]:
# Fit a standard scaler on the numeric features and overwrite those columns
# with their standardised values; the fitted scaler stays in NumericalScaler.
# NOTE(review): TData is overwritten in place, so re-running this cell would
# fit on already-scaled values.
NumericalScaler = StandardScaler()
TData[NumericalColumns] = NumericalScaler.fit_transform(TData[NumericalColumns])

In [12]:
# Serialise the fitted feature scaler to NumericalScaler.obj.
# NOTE(review): the path is relative, so the file lands in the current working
# directory rather than in Folder.
with open("NumericalScaler.obj", mode="wb") as NS_File:
    pickle.dump(NumericalScaler, NS_File)

In [13]:
# Standardise the target column with its own scaler, kept separately in
# PriceScaler from the feature scaler.
# NOTE(review): TData is overwritten in place, so re-running this cell would
# fit on already-scaled prices.
PriceScaler = StandardScaler()
TData[PriceColumn] = PriceScaler.fit_transform(TData[PriceColumn])

In [14]:
# Serialise the fitted price scaler to PriceScaler.obj.
# NOTE(review): the path is relative, so the file lands in the current working
# directory rather than in Folder.
with open("PriceScaler.obj", mode="wb") as PS_File:
    pickle.dump(PriceScaler, PS_File)

# One Hot Encoding Categorical Variables

https://stackoverflow.com/questions/28465633/easy-way-to-apply-transformation-from-pandas-get-dummies-to-new-data

In [15]:
# One-hot encode the categorical columns listed in StringColumns; the
# remaining columns are carried over unchanged.
Encoded = pd.get_dummies(data=TData, columns=StringColumns)

In [16]:
# Pickle a one-row slice of the encoded frame to HeadObject.obj.
# Presumably this preserves the dummy-column layout so new data can be aligned
# to it (see the linked StackOverflow question) — TODO confirm with the consumer.
HeadObject = Encoded.iloc[:1]
with open("HeadObject.obj", mode="wb") as HO_File:
    pickle.dump(HeadObject, HO_File)

# Store to Parquet

In [17]:
# Write the first five encoded rows as a small sample file.
Encoded.head(n=5).to_parquet(Folder + "SampleEncoded.parq")

In [18]:
# Write the complete encoded frame (features and target) to Parquet.
Encoded.to_parquet(Folder + "Encoded.parq")

In [19]:
# Split the encoded frame into features (X) and target (y) and store each as
# its own Parquet file. The target is selected through PriceColumn rather than
# a repeated "price_eur" literal, so the split stays in sync with the column
# configuration at the top of the notebook.
X = Encoded.drop(columns=PriceColumn)
X.to_parquet(f"{Folder}X.parq")
y = Encoded[PriceColumn]
y.to_parquet(f"{Folder}y.parq")