# Load Required Packages

In [42]:
import gc
import os
import pickle

import numpy as np
import pandas as pd
from scipy import sparse

In [43]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cluster import HDBSCAN
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LinearRegression

# Load Helpers

In [44]:
def _load_pickle(path):
    """Return the object unpickled from the binary file at ``path``."""
    # pickle can run arbitrary code while loading, so only open trusted files.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Preprocessing artefacts read from relative paths (resolved against the
# current working directory).
# NOTE(review): the names suggest fitted scalers, an encoder and a column
# list -- confirm against whatever produced these files.
NumericalScaler = _load_pickle("NumericalScaler.obj")
PriceScaler = _load_pickle("PriceScaler.obj")
Encoder = _load_pickle("Encoder.obj")
ColumnNames = _load_pickle("ColumnNames.obj")

# Load Data

In [45]:
# Directory holding the .npz matrices loaded by the next cell.  Paths are
# built there as f"{Folder}<name>.npz", so the value must end with a path
# separator.
# Set the CAR_DATA_DIR environment variable to point the notebook at another
# location; when it is unset or empty the original hard-coded path is used.
_data_dir_override = os.environ.get("CAR_DATA_DIR")
Folder = (
    # Joining with "" appends a trailing separator only when one is missing.
    os.path.join(_data_dir_override, "")
    if _data_dir_override
    else "E:\\datasets\\car\\"
)

In [46]:
# Read the three SciPy sparse matrices stored under Folder, in the order
# X, X_WithOutNA, y.  Folder is expected to end with a path separator.
X, X_WithOutNA, y = (
    sparse.load_npz(Folder + stem + ".npz")
    for stem in ("X", "X_WithOutNA", "y")
)

# Dimensionality Reduction

In [56]:
# Fit a 4-component truncated SVD on the sparse feature matrix.
# TruncatedSVD's default solver is randomized, so random_state is pinned to
# make the projection (and every score computed from it) repeatable; 1991 is
# the seed already used for the train/test split in this notebook.
# NOTE(review): the SVD is fitted on all rows before the train/test split, so
# held-out rows influence the projection -- consider fitting on the training
# rows only.
tsvd = TruncatedSVD(n_components=4, random_state=1991).fit(X_WithOutNA)

In [57]:
# Project X_WithOutNA with the fitted tsvd; DR is the reduced feature matrix
# that is split into train/test sets below.
DR = tsvd.transform(X_WithOutNA)

# Split into Training & Testing

In [58]:
# Hold out 20% of the rows for evaluation.  The sparse target y is densified
# and flattened to 1-D so it lines up row-for-row with DR; the fixed seed
# keeps the partition repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    DR,
    y.toarray().ravel(),
    test_size=0.2,
    random_state=1991,
)

# K Neighbors Regressor

In [59]:
# k-nearest-neighbours regressor on the reduced training features:
# 25 neighbours, each weighted by distance rather than uniformly.
kn = KNeighborsRegressor(n_neighbors=25, weights="distance")
kn.fit(X_train, y_train)

In [60]:
# Predict targets for the held-out rows.  The values are on the same scale as
# y_train; the next cell maps them back through PriceScaler.
y_pred = kn.predict(X_test)

In [61]:
# Put predictions and held-out targets side by side as two columns, map both
# back through PriceScaler in a single call, and label the result.
# NOTE(review): this relies on PriceScaler.inverse_transform accepting a
# two-column input -- confirm how PriceScaler was fitted.
Comparison = pd.DataFrame(
    PriceScaler.inverse_transform(np.column_stack((y_pred, y_test))),
    columns=["Predicted", "Price"],
)
print(len(Comparison))
display(Comparison.head())
# NOTE(review): the saved output shows rows where Predicted equals Price
# exactly; with distance weighting that is what a zero-distance training
# neighbour would give -- check for duplicate rows across the split.
# The last expression is the cell's displayed result: R^2 on the original scale.
r2_score(y_true=Comparison["Price"], y_pred=Comparison["Predicted"])

566895


Unnamed: 0,Predicted,Price
0,1900.0,1900.0
1,12674.580907,13851.04
2,7718.470066,8393.19
3,16494.0,16494.0
4,7034.818321,6250.0


0.8676254771510906

# Scan for Outliers

In [62]:
# Disabled outlier-scan / baseline experiments, kept for reference only.
# NOTE(review): EncodedData is not defined and DBSCAN is not imported in the
# cells shown in this notebook, so these lines will not run if simply
# uncommented.
# dbs = HDBSCAN(min_cluster_size=5, leaf_size=100)
# dbs = dbs.fit(EncodedData.iloc[0:10000])
# dbs.labels_
# db = DBSCAN(min_samples=100)
# db.fit(X)
# reg = DecisionTreeRegressor().fit(X, y)
# y_pred = reg.predict(X)
# r2_score(y, y_pred)  # r2_score takes (y_true, y_pred)