# Load Required Packages

In [24]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import pickle
from scipy import sparse
from scipy.sparse import hstack
from DummyMaker import GetDummies

# Define Variables

In [25]:
# Directory holding the input CSV and the saved .npz outputs.
# Keep the trailing separator: file names are appended to this string directly
# (e.g. f"{Folder}clean_df.csv").
# NOTE(review): machine-specific absolute path - adjust before running elsewhere.
Folder = "E:\\datasets\\car\\"

In [26]:
# Continuous features; these are standardized with StandardScaler further down.
NumericalColumns = ["mileage", "engine_displacement", "engine_power"]
# Numeric-looking columns that are cast to str so they are treated as categories.
ConvertToStringColumns = ["stk_year", "door_count", "seat_count", "manufacture_year"]
# Regression target, kept as a one-element list so it can be used for column selection.
PriceColumn = ["price_eur"]
# Categorical features handed to the one-hot encoder (order fixes the column order of TData).
StringColumns = [
    "type",  # Combination of Maker and Model ("<maker>_<model>")
    "manufacture_year",
    "body_type",
    "stk_year",
    "transmission",
    "door_count",
    "seat_count",
    "fuel_type",
]

# Load the Cleaned CSV Data

In [27]:
# Read the cleaned dataset; low_memory=False makes pandas parse the file in one
# pass instead of in chunks.
Data = pd.read_csv(Folder + "clean_df.csv", low_memory=False)

# Make an Extra Copy

In [28]:
# Work on an independent (deep) copy so the loaded frame `Data` stays untouched.
TData = Data.copy(deep=True)

# Combine Maker and Model
A model only has meaning together with its maker, so the two are combined into a single `type` feature instead of encoding the maker separately.

In [29]:
# Build the combined "<maker>_<model>" key. A missing maker or model contributes
# an empty string (so a row without a model yields e.g. "bmw_").
# fillna("") is the vectorized equivalent of the per-element null check.
TData["type"] = TData["maker"].fillna("") + "_" + TData["model"].fillna("")

# Delete Any Unwanted Variables

In [30]:
# Drop the raw columns that are no longer needed ("maker" and "model" are already
# folded into "type"). errors="ignore" skips names that are absent, so the cell
# can be re-run without failing.
UnWantedVariables = ["date_created", "date_last_seen", "model", "maker"]
TData = TData.drop(columns=UnWantedVariables, errors="ignore")

# Transform some columns to string

In [31]:
# Stringify the listed columns so they are handled as categories; missing values
# are left as they are rather than turned into the text "nan".
for column_name in ConvertToStringColumns:
    TData[column_name] = TData[column_name].map(
        lambda value: value if pd.isnull(value) else str(value)
    )

# Keep only the modelling columns, ordered: categorical, numerical, target.
TData = TData[[*StringColumns, *NumericalColumns, *PriceColumn]]

# View Result

In [32]:
# Preview the first five rows of the prepared frame.
TData.head(n=5)

Unnamed: 0,type,manufacture_year,body_type,stk_year,transmission,door_count,seat_count,fuel_type,mileage,engine_displacement,engine_power,price_eur
0,ford_galaxy,2011.0,compact,,man,5.0,7.0,diesel,151000.0,2000.0,138.12506,10584.75
1,skoda_octavia,2012.0,compact,,man,5.0,5.0,diesel,143476.0,2000.0,108.62262,8882.31
2,bmw_,2010.0,compact,,man,5.0,5.0,diesel,97676.0,1995.0,113.9867,12065.06
3,skoda_fabia,2004.0,compact,,man,5.0,5.0,gasoline,111970.0,1200.0,84.48426,2960.77
4,skoda_fabia,2004.0,compact,,man,5.0,5.0,gasoline,128886.0,1200.0,84.48426,2738.71


In [33]:
# Check the column dtypes after the string conversion.
TData.dtypes

type                    object
manufacture_year        object
body_type               object
stk_year                object
transmission            object
door_count              object
seat_count              object
fuel_type               object
mileage                float64
engine_displacement    float64
engine_power           float64
price_eur              float64
dtype: object

# Scale Numerical Columns

In [34]:
# Standardize the numerical features. The fitted scaler object is kept under its
# own name so it can be persisted and reused.
NumericalScaler = StandardScaler()
NumericalScaler.fit(TData[NumericalColumns])
TData[NumericalColumns] = NumericalScaler.transform(TData[NumericalColumns])

In [35]:
# Persist the fitted feature scaler (written to the current working directory).
with open("NumericalScaler.obj", mode="wb") as scaler_file:
    pickle.dump(NumericalScaler, scaler_file)

In [36]:
# Standardize the target with its own scaler, separate from the feature scaler.
PriceScaler = StandardScaler()
PriceScaler.fit(TData[PriceColumn])
TData[PriceColumn] = PriceScaler.transform(TData[PriceColumn])

In [37]:
# Persist the fitted target scaler (written to the current working directory).
with open("PriceScaler.obj", mode="wb") as scaler_file:
    pickle.dump(PriceScaler, scaler_file)

# One Hot Encoding Categorical Variables
https://dantegates.github.io/2018/05/04/a-fast-one-hot-encoder-with-sklearn-and-pandas.html

In [38]:
# Fit the project-local GetDummies encoder on the categorical columns, then
# encode those same columns. `tran` is read through its `.sparse` accessor in a
# later cell, so it is presumably a pandas object with sparse columns - confirm
# against DummyMaker.
Encoder = GetDummies().fit(TData[StringColumns])
tran = Encoder.transform(TData[StringColumns])

In [39]:
# Persist the fitted one-hot encoder (written to the current working directory).
with open("Encoder.obj", mode="wb") as encoder_file:
    pickle.dump(Encoder, encoder_file)

# Convert Everything to Sparse Format

In [40]:
# Underlying function returns sparse data
X = tran.sparse.to_coo().tocsr()
X_WithOutNA = tran.sparse.to_coo().tocsr()

In [41]:
# Add Numerical Columns too
for x in NumericalColumns:
    column = sparse.csr_matrix(TData[x]).T
    columnWithoutNA =sparse.csr_matrix(TData[x].fillna(0)).T
    X=hstack([X,column])
    X_WithOutNA=hstack([X_WithOutNA,columnWithoutNA])

In [42]:
# Target as a sparse column: the Series is loaded as a single row and transposed.
y = sparse.csr_matrix(TData["price_eur"]).transpose()

# Store the Sparse Data

In [43]:
# Column labels in feature order: one-hot columns first, then the numerical ones.
# NOTE(review): "price_eur" is appended as well, although the target is stored in
# y rather than in X - presumably consumers account for the extra name; confirm.
ColumnNames = [*Encoder.final_columns, *NumericalColumns, "price_eur"]
with open("ColumnNames.obj", mode="wb") as names_file:
    pickle.dump(ColumnNames, names_file)

In [44]:
# Write the NaN-free feature matrix next to the input data (compressed .npz).
sparse.save_npz(f"{Folder}X_WithOutNA.npz", X_WithOutNA, compressed=True)

In [22]:
# Write the feature matrix that still contains missing values (compressed .npz).
sparse.save_npz(f"{Folder}X.npz", X, compressed=True)

In [23]:
# Write the scaled target column (compressed .npz).
sparse.save_npz(f"{Folder}y.npz", y, compressed=True)