# Load Required Packages

In [1]:
# from sklearn.decomposition import PCA


In [2]:
import pandas as pd
import numpy as np
import gc

In [3]:
from sklearn.preprocessing import StandardScaler
from scipy import sparse
from scipy.sparse import hstack
from DummyMaker import GetDummies
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.decomposition import TruncatedSVD

from sklearn.linear_model import SGDRegressor



In [5]:
from sklearn.neighbors import KNeighborsRegressor

from sklearn.metrics import r2_score



In [6]:
import matplotlib.pyplot as plt

# Load Data

In [7]:
# Configuration: where the data lives and which role each column plays.
Folder = "E:\\datasets\\car\\"
# Continuous features; standardised later and appended to the one-hot matrix.
NumericalColumns = ["mileage", "engine_displacement", "engine_power"]
# Columns that are cast to strings so they can be treated as categories.
ConvertToStringColumns = ["stk_year", "door_count", "seat_count", "manufacture_year"]
# Regression target (kept as a list so that selecting it yields a 2-D frame).
PriceColumn = ["price_eur"]
# Categorical features that get one-hot encoded.
StringColumns = [
    "type",  # Combination of Maker and Model
    "manufacture_year",
    "body_type",
    "stk_year",
    "transmission",
    "door_count",
    "seat_count",
    "fuel_type",
]

# Prefer the Parquet cache for a fast read. If it cannot be read for any
# ordinary reason (missing file, unreadable cache, ...), fall back to the CSV
# and (re)write the cache. `except Exception` is used instead of a bare
# `except` so that KeyboardInterrupt / SystemExit are no longer swallowed.
try:
    Data = pd.read_parquet(f"{Folder}clean_df.parq")
except Exception:
    Data = pd.read_csv(f"{Folder}clean_df.csv", low_memory=False)
    Data.to_parquet(f"{Folder}clean_df.parq")

# Prepare

In [8]:
# Work on a copy so the loaded frame itself is never modified.
TData = Data.copy()

# Build the combined "<maker>_<model>" key; a missing part becomes "".
maker_part = TData["maker"].apply(lambda v: "" if pd.isnull(v) else v)
model_part = TData["model"].apply(lambda v: "" if pd.isnull(v) else v)
TData["type"] = maker_part + "_" + model_part

# Drop the columns that are not used as features (only those actually present).
UnWantedVariables = ["date_created", "date_last_seen", "model", "maker"]
TData = TData.drop(
    columns=[name for name in UnWantedVariables if name in TData.columns]
)

# Cast the listed columns to strings so they are handled as categories;
# missing values are left as they are.
for column in ConvertToStringColumns:
    TData[column] = TData[column].apply(lambda v: v if pd.isnull(v) else str(v))

# Keep only the modelling columns, ordered: categorical, numerical, target.
TData = TData[[*StringColumns, *NumericalColumns, *PriceColumn]]

# Train Test Split

In [9]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
FeatureColumns = StringColumns + NumericalColumns
X_train, X_test, y_train, y_test = train_test_split(
    TData[FeatureColumns], TData[PriceColumn], test_size=0.20, random_state=1991
)

# Scaling

In [10]:
# Standardise the numerical features. The scaler is fitted on the training
# rows only and then re-used, unchanged, on the test rows.
NumericalScaler = StandardScaler()
X_train[NumericalColumns] = NumericalScaler.fit_transform(X_train[NumericalColumns])
X_test[NumericalColumns] = NumericalScaler.transform(X_test[NumericalColumns])

# Standardise the price the same way and collapse the single-column result
# into a 1-D array for the regressors.
PriceScaler = StandardScaler()
y_train = PriceScaler.fit_transform(y_train).ravel()
y_test = PriceScaler.transform(y_test).ravel()

# One Hot Encoding

In [11]:
# Fit the dummy-variable encoder on the training categoricals only, so the
# encoded column layout is decided without looking at the test rows.
# Approach: https://dantegates.github.io/2018/05/04/a-fast-one-hot-encoder-with-sklearn-and-pandas.html
Encoder = GetDummies().fit(X_train[StringColumns])

# Names of the final design-matrix columns: encoded dummies, then numericals.
ColumnNames = [*Encoder.final_columns, *NumericalColumns]

# Encode the training split first, then the test split, with the same encoder.
Coded_X_train, Coded_X_test = (
    Encoder.transform(split[StringColumns]) for split in (X_train, X_test)
)

# Convert Everything to Sparse Format

In [12]:
# The encoder output is accessed through the pandas `.sparse` accessor;
# convert each split to a SciPy CSR matrix.
Coded_X_train = Coded_X_train.sparse.to_coo().tocsr()
Coded_X_test = Coded_X_test.sparse.to_coo().tocsr()

# Append the numerical columns on the right, one sparse column per feature.
# Two variants are built for each split:
#   *_NAN : missing numerical values are kept as NaN
#   plain : missing numerical values are replaced by 0
# The *_NAN variants are built first, from the dummy-only matrices.
Coded_X_train_NAN = hstack(
    [Coded_X_train]
    + [sparse.csr_matrix(X_train[col]).T for col in NumericalColumns]
)
Coded_X_test_NAN = hstack(
    [Coded_X_test]
    + [sparse.csr_matrix(X_test[col]).T for col in NumericalColumns]
)

Coded_X_train = hstack(
    [Coded_X_train]
    + [sparse.csr_matrix(X_train[col].fillna(0)).T for col in NumericalColumns]
)
Coded_X_test = hstack(
    [Coded_X_test]
    + [sparse.csr_matrix(X_test[col].fillna(0)).T for col in NumericalColumns]
)

# Free Memory

In [13]:
# Drop the references to the large intermediate frames; from here on only
# the encoded matrices and the scaled targets are used.
del Data
del TData
del X_test
del X_train

In [14]:
# Run a garbage-collection pass now that the large frames have been deleted.
# As the last expression of the cell, the value returned by gc.collect() is
# what gets displayed as the cell output.
gc.collect()

52845

# Online Learning with SGDRegressor

In [22]:
gc.collect()

# Reduce the sparse design matrix to 6 dense components with Truncated SVD
# (fitted on the training split only, then applied to both splits).
tsvd = TruncatedSVD(n_components=6, random_state=1991).fit(Coded_X_train)
display(tsvd.explained_variance_)

ReducedTrain = tsvd.transform(Coded_X_train)
ReducedTest = tsvd.transform(Coded_X_test)

# Feed the training data to the model in 8 consecutive chunks via partial_fit.
# np.array_split is used instead of np.vsplit because vsplit raises unless the
# number of rows is an exact multiple of 8; array_split produces the same
# chunks in that case and near-equal chunks otherwise. Features and targets
# have the same length, so their chunks line up one-to-one.
X_parts = np.array_split(ReducedTrain, 8)
y_parts = np.array_split(y_train, 8)

sgd = SGDRegressor(random_state=1991)
for X_part, y_part in zip(X_parts, y_parts):
    sgd.partial_fit(X_part, y_part)

y_pred = sgd.predict(ReducedTest)

# Map predictions and true targets back to the original price scale. Each
# vector is inverse-transformed on its own as a single column, which matches
# how PriceScaler was fitted (one feature) instead of relying on the scaler
# broadcasting over a two-column frame.
Comparison = pd.DataFrame(
    {
        "Predicted": PriceScaler.inverse_transform(y_pred.reshape(-1, 1)).ravel(),
        "Price": PriceScaler.inverse_transform(y_test.reshape(-1, 1)).ravel(),
    }
)
print(len(Comparison))
display(Comparison.head())
# R^2 on the original price scale (shown as the cell output).
r2_score(Comparison["Price"], Comparison["Predicted"])

array([1.40080402, 0.5965793 , 0.98412273, 0.36599672, 0.32208589,
       0.30134911, 0.24730033, 0.20844189, 0.18130061, 0.14121844,
       0.11738998, 0.10872461, 0.08431483, 0.08180115, 0.07915188,
       0.07037344, 0.0699454 , 0.06670425, 0.06286014, 0.05917537,
       0.05349643, 0.05108214, 0.04964976, 0.04850583, 0.04705939])

566895


Unnamed: 0,Predicted,Price
0,8340.832875,1900.0
1,13476.299437,13851.04
2,8579.668409,8393.19
3,19235.647413,16494.0
4,13577.392446,6250.0


0.6628859643206464

# K Neighbors Regressor With Truncated SVD

In [20]:
gc.collect()

# Reduce the sparse design matrix to 6 dense components with Truncated SVD
# (fitted on the training split only, then applied to both splits).
tsvd = TruncatedSVD(n_components=6, random_state=1991).fit(Coded_X_train)
display(tsvd.explained_variance_)

ReducedTrain = tsvd.transform(Coded_X_train)
ReducedTest = tsvd.transform(Coded_X_test)

# 25 nearest neighbours in the reduced space, weighted by inverse distance.
# NOTE(review): with distance weighting, a test row that coincides with a
# training row in the reduced space is predicted from that row alone; the
# exact matches visible in the displayed head() suggest such duplicates exist
# across the split -- worth confirming before trusting the score.
kn = KNeighborsRegressor(n_neighbors=25, weights="distance")
kn.fit(ReducedTrain, y_train)
y_pred = kn.predict(ReducedTest)

# Map predictions and true targets back to the original price scale. Each
# vector is inverse-transformed on its own as a single column, which matches
# how PriceScaler was fitted (one feature) instead of relying on the scaler
# broadcasting over a two-column frame. (The previously duplicated concat
# statement has been removed; it had no effect.)
Comparison = pd.DataFrame(
    {
        "Predicted": PriceScaler.inverse_transform(y_pred.reshape(-1, 1)).ravel(),
        "Price": PriceScaler.inverse_transform(y_test.reshape(-1, 1)).ravel(),
    }
)
print(len(Comparison))
display(Comparison.head())
# R^2 on the original price scale (shown as the cell output).
r2_score(Comparison["Price"], Comparison["Predicted"])

array([1.40225702, 0.59461437, 0.98411756, 0.36609849, 0.32189105,
       0.30154424, 0.24753535, 0.20815263, 0.18138634, 0.14140372])

566895


Unnamed: 0,Predicted,Price
0,1900.0,1900.0
1,12674.47361,13851.04
2,7672.058044,8393.19
3,16494.0,16494.0
4,6515.383281,6250.0


0.8884038388144765

# References

## Scientific

https://stats.stackexchange.com/questions/256172/why-is-dimensionality-reduction-always-done-before-clustering

https://community.databricks.com/t5/machine-learning/do-one-hot-encoding-ohe-before-or-after-split-data-to-train-and/td-p/17888#:~:text=%22If%20you%20perform%20the%20encoding,scores%20but%20poor%20in%20deployment).

https://datascience.stackexchange.com/questions/107714/encoding-before-vs-after-train-test-split

https://stats.stackexchange.com/questions/599508/do-we-one-hot-encode-create-dummy-variables-before-or-after-train-test-split

https://stats.stackexchange.com/questions/142216/zero-centering-the-testing-set-after-pca-on-the-training-set

https://stats.stackexchange.com/questions/55718/pca-and-the-train-test-split

https://stackoverflow.com/questions/55441022/how-to-aply-the-same-pca-to-train-and-test-set

## Coding

https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html

https://stackoverflow.com/questions/33603787/performing-pca-on-large-sparse-matrix-by-using-sklearn

https://stackoverflow.com/questions/26576524/how-do-i-transform-a-scipy-sparse-matrix-to-a-numpy-matrix

https://stackoverflow.com/questions/20459536/convert-pandas-dataframe-to-sparse-numpy-matrix-directly

https://dantegates.github.io/2018/05/04/a-fast-one-hot-encoder-with-sklearn-and-pandas.html

https://pandas.pydata.org/docs/reference/api/pandas.SparseDtype.html

https://pandas.pydata.org/docs/reference/api/pandas.arrays.SparseArray.html



# PCA Vs Truncated SVD on KNeighborsRegressor(25, weights="distance")

PCA doesn't work on sparse Data.

2 Components
PCA yielded a higher R² score with 2 components: 0.8178726424680244 vs 0.623305478122284 for Truncated SVD (seed 1991).

6 Components
With 6 components the R² scores are nearly identical: 0.8832974168967355 for PCA vs 0.8830232140933794 for Truncated SVD (seed 1991).

PCA Explained Variance ([1.79120925, 0.98414093, 0.389263  , 0.33659156, 0.30127252,
       0.27814113]) in 1m 26s

TSVD explained Variance ([1.39979887, 0.5979397 , 0.98411531, 0.36582389, 0.32243568,
       0.30120099]) in 22s

Adding more than 6 components yields very low improvement.

Isomap too slow.

In [17]:
# # K Neighbors Regressor With PCA
# NOTE: this cell is intentionally disabled and kept for reference only.
# To re-enable it, the `from sklearn.decomposition import PCA` import (also
# commented out in the first cell) must be restored. It converts the sparse
# matrices to dense arrays with .toarray() before fitting PCA.

# gc.collect()

# DenseTraining =Coded_X_train.toarray()

# pca = PCA(n_components=6,random_state=1991).fit(DenseTraining)
# display(pca.explained_variance_)

# ReducedTrain = pca.transform(DenseTraining)
# ReducedTest = pca.transform(Coded_X_test.toarray())

# kn = KNeighborsRegressor(25, weights="distance")
# kn.fit(ReducedTrain, y_train)
# y_pred = kn.predict(ReducedTest)


# Comparison = pd.concat(
#     [pd.DataFrame(y_pred), pd.DataFrame(y_test)],
#     axis=1,
# )

# Comparison = pd.concat(
#     [pd.DataFrame(y_pred), pd.DataFrame(y_test)],
#     axis=1,
# )
# Comparison = pd.DataFrame(
#     PriceScaler.inverse_transform(Comparison), columns=["Predicted", "Price"]
# )
# print(len(Comparison))
# display(Comparison.head())
# r2_score(Comparison["Price"], Comparison["Predicted"])

# Clustering

DBSCAN, HDBSCAN (Very slow)

# SGD Regressor

R² score of 0.7273975600580911 on raw data using seed 1991.

R² score of 0.6312 on Truncated SVD data with 6 components.