# Project 2 — Content-Based Recommendation & Market Segmentation (Consolidated)
This notebook consolidates **Problem 1** (Content-based Recommendation using **Cosine similarity** and **Gensim**) and **Problem 2** (Market Segmentation / **Clustering**) in **two environments**:
- **Scikit-learn / Python (local ML)**
- **PySpark (distributed ML)**

Data source: `data_motobikes.xlsx` (or `data_motorbikes.xlsx` fallback).

> Notes: Run the install cell if your environment lacks the required libraries (e.g. `gensim`, `pyvi`, `pyspark`).


In [None]:

# If running on a fresh environment, uncomment to install packages
# !pip install gensim pyvi pyspark scikit-learn seaborn matplotlib pandas numpy

import os, warnings, re, json
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.decomposition import PCA

# Gensim (content-based alt)
from gensim import corpora, models, similarities

# Vietnamese tokenization (optional)
try:
    from pyvi.ViTokenizer import tokenize as vi_tokenize
except Exception:
    vi_tokenize = None

# Vietnamese stop words: read them from disk when the file is available,
# otherwise use a small built-in list.
STOP_WORD_FILE = "vietnamese-stopwords.txt"
if not os.path.exists(STOP_WORD_FILE):
    # Minimal fallback (extend as needed)
    stop_words = ['và', 'là', 'của', 'những', 'các', 'một', 'như', 'với', 'cho', 'đã', 'đang']
else:
    with open(STOP_WORD_FILE, 'r', encoding='utf-8') as stop_file:
        # One term per line; blank lines are skipped.
        stripped_lines = (line.strip() for line in stop_file.read().splitlines())
        stop_words = [term for term in stripped_lines if term]
print(f"Loaded stop words: {len(stop_words)} terms from {STOP_WORD_FILE} (exists={os.path.exists(STOP_WORD_FILE)})")


In [None]:

# Locate the Chợ Tốt motorbike workbook (~7000+ rows) and read it into `df`.
excel_paths = [
    'data_motobikes.xlsx',  # common name from prior work
    'data_motorbikes.xlsx'  # name in topic PDF
]
# First candidate that exists in the working directory, or None if neither does.
path = next((candidate for candidate in excel_paths if os.path.exists(candidate)), None)
if path is None:
    raise FileNotFoundError("Neither data_motobikes.xlsx nor data_motorbikes.xlsx found in working directory.")

print(f"Using dataset: {path}")
df = pd.read_excel(path, engine='openpyxl')
print(df.shape)
df.head()


In [None]:

# --- Basic cleaning/typing ---
import numpy as np

def clean_price(x):
    """Convert a raw price cell such as ``"1.200.000 đ"`` to a float.

    For strings, the thousands-separator dots and the `` đ`` currency marker
    are removed before conversion. Non-string values are returned unchanged.

    Returns:
        The parsed price as a float, ``np.nan`` when the string cannot be
        parsed as a number, or ``x`` itself when it is not a string.
    """
    if not isinstance(x, str):
        return x
    digits = x.replace('.', '').replace(' đ', '').strip()
    try:
        return float(digits)
    except ValueError:
        # Only a malformed number is expected here; anything else should surface.
        return np.nan

def clean_range(x):
    """Convert a price-range cell expressed in millions to a plain amount.

    Example: ``"72.53 tr"`` -> ``72.53 * 1e6``. A decimal comma is accepted
    as well (``"72,53 tr"``). Non-string values are returned unchanged.

    Returns:
        The amount as a float, ``np.nan`` when the string cannot be parsed as
        a number, or ``x`` itself when it is not a string.
    """
    if not isinstance(x, str):
        return x
    millions = x.replace(' tr', '').replace(',', '.').strip()
    try:
        return float(millions) * 1_000_000
    except ValueError:
        # Only a malformed number is expected here; anything else should surface.
        return np.nan

def clean_year(x):
    """Convert a registration-year cell to an integer year.

    The text bucket "trước năm 1980" (matched case-insensitively) is mapped
    to 1980. Any other value is passed to ``int``.

    Returns:
        The year as an int, or ``np.nan`` when the value cannot be converted
        (e.g. ``None``, NaN, infinity, or non-numeric text).
    """
    if isinstance(x, str) and 'trước năm 1980' in x.lower():
        return 1980
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None/unsupported type; ValueError: bad text or NaN;
        # OverflowError: +/- infinity.
        return np.nan

# Apply the per-column cleaners; header matching is case-insensitive.
column_cleaners = {
    'giá': clean_price,
    'khoảng giá min': clean_range,
    'khoảng giá max': clean_range,
    'năm đăng ký': clean_year,
}
for column in df.columns:
    cleaner = column_cleaners.get(column.lower())
    if cleaner is not None:
        df[column] = df[column].apply(cleaner)

# Remove price outliers by IQR rule
Q1 = df['Giá'].quantile(0.25)
Q3 = df['Giá'].quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Rows whose price is NaN fail both comparisons and are dropped here as well.
# NOTE: the original row labels are kept, so df_clean's index has gaps.
df_clean = df[(df['Giá'] >= lower_bound) & (df['Giá'] <= upper_bound)].copy()

# Impute: median for numeric columns, the placeholder 'Không rõ' for text columns.
categorical_cols = [c for c in df_clean.columns if df_clean[c].dtype == 'object']
numeric_cols = [c for c in df_clean.columns if pd.api.types.is_numeric_dtype(df_clean[c])]
for column in numeric_cols:
    df_clean[column] = df_clean[column].fillna(df_clean[column].median())
for column in categorical_cols:
    df_clean[column] = df_clean[column].fillna('Không rõ')

# Vehicle age in years. pandas is used for the current date because the
# notebook never imports `datetime`.
CURRENT_YEAR = pd.Timestamp.now().year
if 'Năm đăng ký' in df_clean.columns:
    df_clean['Tuổi của xe'] = CURRENT_YEAR - df_clean['Năm đăng ký']
else:
    df_clean['Tuổi của xe'] = np.nan

# --- Text normalization ---
text_cols_base = ['Tiêu đề', 'Mô tả chi tiết', 'Thương hiệu', 'Địa chỉ', 'Dòng xe']

def normalize_text(text):
    """Return a lower-cased, tokenized, stop-word-free version of ``text``.

    Non-string input yields an empty string. When the optional pyvi tokenizer
    was imported it is applied first; runs of non-word characters are then
    collapsed to single spaces and tokens listed in ``stop_words`` are dropped.
    """
    if isinstance(text, str):
        lowered = text.lower()
        # tokenize with pyvi if available
        segmented = vi_tokenize(lowered) if vi_tokenize else lowered
        # remove non-word chars
        spaced = re.sub(r'\W+', ' ', segmented)
        # remove stop words
        kept = [token for token in spaced.split() if token not in stop_words]
        return ' '.join(kept).strip()
    return ''

# Add a "<column>_normalized" companion for every base text column present.
for base_col in text_cols_base:
    if base_col not in df_clean.columns:
        continue
    df_clean[f'{base_col}_normalized'] = df_clean[base_col].apply(normalize_text)

# Combined text feature: the normalized columns joined by single spaces.
# A missing column contributes an empty string.
normalized_parts = [
    'Tiêu đề_normalized',
    'Mô tả chi tiết_normalized',
    'Thương hiệu_normalized',
    'Địa chỉ_normalized',
    'Dòng xe_normalized',
]
combined_text = df_clean.get(normalized_parts[0], '')
for part in normalized_parts[1:]:
    combined_text = combined_text + ' ' + df_clean.get(part, '')
df_clean['Text_combined'] = combined_text

# Feature engineering (numerical)
df_fe = df_clean.copy()
df_fe['Giá trung bình'] = (df_fe['Khoảng giá min'] + df_fe['Khoảng giá max']) / 2
# A zero midpoint becomes NaN so the ratio below never divides by zero.
den = df_fe['Giá trung bình'].replace(0, np.nan)
df_fe['Tỷ lệ giá'] = df_fe['Giá'] / den
# km/year; the small epsilon keeps a zero age from dividing by zero.
df_fe['Km per year'] = (
    df_fe['Số Km đã đi'] / (df_fe['Tuổi của xe'] + 1e-5)
    if 'Số Km đã đi' in df_fe.columns
    else np.nan
)

# log transforms
for raw_col in ['Giá', 'Số Km đã đi']:
    if raw_col not in df_fe.columns:
        continue
    df_fe[f'{raw_col}_log'] = np.log1p(df_fe[raw_col])

# Scale numeric features in place (zero mean, unit variance).
numeric_candidates = [
    'Giá', 'Khoảng giá min', 'Khoảng giá max', 'Năm đăng ký', 'Số Km đã đi',
    'Tuổi của xe', 'Tỷ lệ giá', 'Km per year', 'Giá_log', 'Số Km đã đi_log',
]
num_cols = [name for name in numeric_candidates if name in df_fe.columns]
scaler = StandardScaler()
df_fe[num_cols] = scaler.fit_transform(df_fe[num_cols])

print("Cleaned shape:", df_fe.shape)



## Problem 1 — Content-Based Recommendation (Scikit-learn & Gensim)
We build item embeddings from textual attributes using **TF–IDF** and compute **cosine similarity** for top-N similar bikes. In parallel, we also build a **Gensim** TF–IDF + **SparseMatrixSimilarity** index as an alternative.


In [None]:

# --- TF–IDF with scikit-learn ---
# Hyper-parameter grid explored by the search loop further down in this cell.
max_features_grid = [3000, 5000]  # values tried for TfidfVectorizer(max_features=...)
ngram_grid = [(1,1), (1,2)]  # values tried for TfidfVectorizer(ngram_range=...)
svd_components = 200  # reduce dim for efficiency

# Heuristic metric to tune text params (no interaction data):
# For each item, compute top-10 similar items and measure brand consistency (% same brand)
def brand_consistency(sim_matrix, brands, topk=10):
    """Return the share of top-k neighbours that have the same brand as the item.

    Args:
        sim_matrix: Square array where ``sim_matrix[i][j]`` is the similarity
            between items ``i`` and ``j``.
        brands: Brand label per item, in the same row order as ``sim_matrix``
            (list, array or pandas Series), or ``None``.
        topk: Number of nearest neighbours inspected per item. Clamped to
            ``n - 1`` when there are fewer other items.

    Returns:
        A float in ``[0, 1]``; ``0.0`` when ``brands`` is ``None`` or when no
        item has any neighbour.
    """
    n = sim_matrix.shape[0]
    if brands is None:
        return 0.0
    # Work on a positional array: a Series whose index has gaps (e.g. after
    # row filtering) would otherwise be looked up by label, not by position.
    labels = np.asarray(brands)
    k = min(topk, n - 1)
    if k <= 0:
        return 0.0
    same = 0
    total = 0
    for i in range(n):
        sims = np.array(sim_matrix[i], dtype=float)  # copy; the row is modified below
        # Exclude the item itself explicitly instead of assuming it sorts first,
        # which does not hold when duplicates tie at the maximum similarity.
        sims[i] = -np.inf
        idx = np.argpartition(-sims, k - 1)[:k]
        total += k
        same += int(np.count_nonzero(labels[idx] == labels[i]))
    return same / max(total, 1)

# Grid-search state: the best heuristic score so far and what produced it.
best_score, best_cfg, best_sim, best_vectorizer = -1, None, None, None

brands = None
if 'Thương hiệu' in df_fe.columns:
    brands = df_fe['Thương hiệu']

for mf in max_features_grid:
    for ng in ngram_grid:
        vect = TfidfVectorizer(
            max_features=mf,
            ngram_range=ng,
            analyzer='word',
            stop_words=stop_words,
            min_df=2,
        )
        X = vect.fit_transform(df_fe['Text_combined'].fillna(''))
        if not svd_components or X.shape[1] <= svd_components:
            # Vocabulary already small enough: cosine on the raw TF-IDF matrix.
            sim = cosine_similarity(X)
        else:
            # Optional dimensionality reduction, then cosine on the reduced matrix.
            svd = TruncatedSVD(n_components=svd_components, random_state=42)
            Xr = svd.fit_transform(X)
            sim = cosine_similarity(Xr)
        score = brand_consistency(sim, brands)
        # Strict comparison: on equal scores the earlier configuration is kept.
        if score > best_score:
            best_score, best_cfg = score, (mf, ng)
            best_sim, best_vectorizer = sim, vect

print(f"Best TF-IDF cfg: max_features={best_cfg[0]}, ngram={best_cfg[1]} | brand_consistency={best_score:.3f}")

# Recommendation function using best_sim
def recommend_by_id(df_source, sim_matrix, id_col='id', item_id=None, topn=5):
    """Return the ``topn`` rows of ``df_source`` most similar to ``item_id``.

    Args:
        df_source: DataFrame whose row order matches ``sim_matrix``.
        sim_matrix: Square array of pairwise similarities.
        id_col: Name of the column holding item ids.
        item_id: Id of the query item; the first matching row is used.
        topn: Number of recommendations to return.

    Returns:
        The recommended rows of ``df_source``, most similar first, excluding
        the query row itself.

    Raises:
        ValueError: If ``item_id`` is ``None``, ``id_col`` is not a column, or
            no row has the requested id.
    """
    if item_id is None:
        raise ValueError('Provide item_id')
    if id_col not in df_source.columns:
        raise ValueError(f'{id_col} not in DataFrame')
    # Positional lookup: sim_matrix and .iloc are position-based, while the
    # DataFrame index may have gaps (e.g. after rows were filtered out).
    positions = np.flatnonzero((df_source[id_col] == item_id).to_numpy())
    if positions.size == 0:
        raise ValueError(f'Item id {item_id} not found')
    i = int(positions[0])
    sims = np.asarray(sim_matrix[i])
    # Top-N excluding itself
    order = np.argsort(-sims)
    order = order[order != i][:topn]
    return df_source.iloc[order]

# Example: recommend items similar to the first listing.
if 'id' in df_fe.columns:
    # Use the id exactly as stored. Coercing it (e.g. with int()) or falling
    # back to an index label can produce a value that no longer equals any
    # entry of the 'id' column, which makes the lookup fail.
    example_id = df_fe['id'].iloc[0]
    recs = recommend_by_id(df_fe, best_sim, item_id=example_id, topn=5)
    # An expression inside an `if` block is not rendered by the notebook,
    # so print the preview explicitly.
    print(recs[['id','Tiêu đề','Thương hiệu','Giá']].head())


In [None]:

# --- Gensim TF–IDF + SparseMatrixSimilarity ---
# Text_combined is already normalized (pyvi tokenization applied in
# normalize_text), so whitespace splitting recovers the tokens.
texts = [str(x) for x in df_fe['Text_combined'].fillna('')]

# Tokenize and drop stop words in one pass. A set gives constant-time
# membership tests instead of scanning the stop-word list for every token.
stop_word_set = set(stop_words)
content_tokens = [[w for w in t.split() if w not in stop_word_set] for t in texts]

# Dictionary & corpus (bag-of-words per document)
dictionary = corpora.Dictionary(content_tokens)
corpus = [dictionary.doc2bow(doc) for doc in content_tokens]

# TF-IDF model & similarity index over the whole corpus
model_tfidf = models.TfidfModel(corpus)
index = similarities.SparseMatrixSimilarity(model_tfidf[corpus], num_features=len(dictionary))

# The full pairwise similarity matrix is never materialized (it may be large for big datasets); the function below queries the index for one item at a time.

def gensim_recommend(df_source, item_id, topn=5):
    """Return the ``topn`` rows of ``df_source`` most similar to ``item_id``.

    Similarities come from the module-level Gensim objects (``dictionary``,
    ``model_tfidf``, ``index``) built over ``content_tokens``, so
    ``df_source`` is assumed to have the same row order as ``content_tokens``.

    Args:
        df_source: DataFrame with an ``'id'`` column.
        item_id: Id of the query item; the first matching row is used.
        topn: Number of recommendations to return.

    Returns:
        The recommended rows, most similar first, excluding the query row.

    Raises:
        ValueError: If no row has the requested id.
    """
    # Positional lookup: content_tokens and .iloc are position-based, while
    # the DataFrame index may have gaps (e.g. after rows were filtered out).
    positions = np.flatnonzero((df_source['id'] == item_id).to_numpy())
    if positions.size == 0:
        raise ValueError(f'Item id {item_id} not found')
    i = int(positions[0])
    vec = dictionary.doc2bow(content_tokens[i])
    sims = index[model_tfidf[vec]]  # similarities against all docs
    # Top-N excluding itself
    order = np.argsort(-sims)
    order = order[order != i][:topn]
    return df_source.iloc[order]

# Example usage: recommend items similar to the first listing.
if 'id' in df_fe.columns:
    # Use the id exactly as stored. Coercing it (e.g. with int()) or falling
    # back to an index label can produce a value that no longer equals any
    # entry of the 'id' column, which makes the lookup fail.
    example_id = df_fe['id'].iloc[0]
    recs_gem = gensim_recommend(df_fe, item_id=example_id, topn=5)
    # An expression inside an `if` block is not rendered by the notebook,
    # so print the preview explicitly.
    print(recs_gem[['id','Tiêu đề','Thương hiệu','Giá']].head())



### Problem 1 — Content-Based Recommendation (PySpark)
Vectorize content with Spark ML (Tokenizer → StopWordsRemover → HashingTF → IDF → Normalizer), then use **BucketedRandomProjectionLSH** (approximate nearest neighbours on the normalized vectors) to retrieve top-N similar items. For unit-length vectors, $\lVert a-b\rVert^2 = 2 - 2\cos(a,b)$, so ranking by ascending Euclidean distance is equivalent to ranking by descending cosine similarity.


In [None]:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, StringType, DoubleType
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF, Normalizer
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import BucketedRandomProjectionLSH

# Start (or reuse) the Spark session used by all PySpark cells.
spark = SparkSession.builder.appName("ContentBasedRecommendation").getOrCreate()

# Create Spark DataFrame with necessary columns
# Only the columns that actually exist in df_fe are transferred.
use_cols = ['id','Tiêu đề','Thương hiệu','Giá','Text_combined']
use_cols = [c for c in use_cols if c in df_fe.columns]
sdf = spark.createDataFrame(df_fe[use_cols])

# Tokenize (content is already normalized; still break into tokens)
tok = Tokenizer(inputCol='Text_combined', outputCol='tokens')
sdf_tok = tok.transform(sdf)

# Stop words (Vietnamese)
# Uses the same stop_words list as the scikit-learn / Gensim sections.
remover = StopWordsRemover(inputCol='tokens', outputCol='filtered', stopWords=stop_words)
sdf_sw = remover.transform(sdf_tok)

# HashingTF → IDF
# 1<<14 = 16384 hash buckets for the term-frequency vector.
htf = HashingTF(inputCol='filtered', outputCol='rawFeatures', numFeatures=1<<14)
sdf_htf = htf.transform(sdf_sw)
idf = IDF(inputCol='rawFeatures', outputCol='features')
idf_model = idf.fit(sdf_htf)
sdf_tfidf = idf_model.transform(sdf_htf)

# Normalize to unit length (approximate cosine via euclidean)
norm = Normalizer(inputCol='features', outputCol='normFeatures')
sdf_norm = norm.transform(sdf_tfidf)

# LSH for approximate nearest neighbors
# lsh_model and sdf_norm are read by spark_recommend below.
lsh = BucketedRandomProjectionLSH(inputCol='normFeatures', outputCol='hashes', bucketLength=2.0, numHashTables=5)
lsh_model = lsh.fit(sdf_norm)

# Function to recommend top-N similar for a given id
def spark_recommend(item_id, topn=5):
    """Return up to ``topn`` items closest to ``item_id`` as a pandas DataFrame.

    Uses the module-level ``sdf_norm`` DataFrame and ``lsh_model``.

    Args:
        item_id: Value of the ``id`` column identifying the query item.
        topn: Maximum number of recommendations to return.

    Returns:
        pandas DataFrame with columns ``id``, ``Tiêu đề``, ``Thương hiệu``,
        ``Giá`` and ``distCol``, sorted by ascending distance, with the query
        item excluded.

    Raises:
        ValueError: If no row has the requested id.
    """
    # first() fetches the row in a single Spark job and yields None when the
    # filter matches nothing (instead of a count() followed by a collect()).
    query_row = sdf_norm.filter(col('id') == item_id).first()
    if query_row is None:
        raise ValueError(f'Item id {item_id} not found')
    # approxNearestNeighbors uses Euclidean distance; on normalized vectors it correlates with cosine similarity
    # One extra neighbour is requested because the query item is normally returned too.
    result = lsh_model.approxNearestNeighbors(sdf_norm, query_row['normFeatures'], topn+1)
    # Exclude itself, then cap at topn: the search is approximate, so the query
    # item is not guaranteed to be among the topn+1 rows that came back.
    result = result.filter(col('id') != item_id).orderBy(col('distCol').asc()).limit(topn)
    return result.select('id','Tiêu đề','Thương hiệu','Giá','distCol').toPandas()

# Example: query with the first listing's id, or the first index label when
# the DataFrame has no 'id' column.
if 'id' in df_fe.columns:
    example_id = int(df_fe['id'].iloc[0])
else:
    example_id = int(df_fe.index[0])
spark_recs = spark_recommend(example_id, topn=5)
spark_recs.head()



## Problem 2 — Market Segmentation (Clustering)
We cluster items in both environments (**Scikit-learn** and **PySpark**). The scikit-learn models are compared using **Silhouette** and **Davies–Bouldin** scores; the PySpark models are compared using **Silhouette** only.


In [None]:

# Build text embeddings from best TF-IDF (re-use best_vectorizer)
X_text = best_vectorizer.transform(df_fe['Text_combined'].fillna(''))
svd_components = 100
if X_text.shape[1] > svd_components:
    svd2 = TruncatedSVD(n_components=svd_components, random_state=42)
    X_text_r = svd2.fit_transform(X_text)
else:
    X_text_r = X_text.toarray()

# Gather categorical encodings.
# The loop variable is deliberately not named `col`: that name is bound to
# pyspark.sql.functions.col by the PySpark cell, and rebinding it to a string
# here would break spark_recommend on any later call.
label_cols = []
for cat_col in ['Thương hiệu','Dòng xe','Loại xe','Dung tích xe','Xuất xứ']:
    if cat_col in df_fe.columns:
        le = LabelEncoder()
        df_fe[f'{cat_col}_label'] = le.fit_transform(df_fe[cat_col].astype(str))
        label_cols.append(f'{cat_col}_label')

num_cols = [c for c in ['Giá','Khoảng giá min','Khoảng giá max','Năm đăng ký','Số Km đã đi','Tuổi của xe','Tỷ lệ giá','Km per year','Giá_log','Số Km đã đi_log'] if c in df_fe.columns]
X_others = df_fe[label_cols + num_cols].values
# Final feature matrix: text embedding, label-encoded categoricals, numerics.
X = np.hstack([X_text_r, X_others])
# Remaining NaN/inf values (e.g. from a NaN price ratio) are replaced by finite numbers.
X = np.nan_to_num(X)

# Evaluate KMeans/GMM/Agglomerative
k_range = range(2, 11)

# KMeans: pick the K with the highest silhouette score.
sil_scores = []
for k in k_range:
    km = KMeans(n_clusters=k, random_state=42, n_init=10)
    km.fit(X)
    sil_scores.append(silhouette_score(X, km.labels_))
opt_k = k_range[np.argmax(sil_scores)]
print("Optimal K (KMeans):", opt_k)

kmeans = KMeans(n_clusters=opt_k, random_state=42, n_init=10)
kmeans_labels = kmeans.fit_predict(X)
km_sil = silhouette_score(X, kmeans_labels)
km_db  = davies_bouldin_score(X, kmeans_labels)

# GMM: same selection rule over the number of components.
gmm_sils = []
for n in k_range:
    gmm = GaussianMixture(n_components=n, random_state=42)
    gmm.fit(X)
    gmm_sils.append(silhouette_score(X, gmm.predict(X)))
opt_g = k_range[np.argmax(gmm_sils)]
gmm = GaussianMixture(n_components=opt_g, random_state=42)
gmm_labels = gmm.fit_predict(X)
gmm_sil = silhouette_score(X, gmm_labels)
gmm_db  = davies_bouldin_score(X, gmm_labels)

# Agglomerative (Ward linkage), using the K selected for KMeans.
agg = AgglomerativeClustering(n_clusters=opt_k, linkage='ward')
agg_labels = agg.fit_predict(X)
agg_sil = silhouette_score(X, agg_labels)
agg_db  = davies_bouldin_score(X, agg_labels)

# Comparison table, best silhouette first.
pd.DataFrame({
    'Model': ['KMeans','GMM','Agglomerative'],
    'Silhouette': [km_sil, gmm_sil, agg_sil],
    'Davies-Bouldin': [km_db, gmm_db, agg_db]
}).sort_values('Silhouette', ascending=False)


In [None]:

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans as SKMeans, GaussianMixture as SGMM, BisectingKMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Numeric and label-encoded columns to cluster on, next to the text features.
extra_cols = []
for c in ['Giá','Khoảng giá min','Khoảng giá max','Năm đăng ký','Số Km đã đi','Tuổi của xe','Tỷ lệ giá','Km per year','Giá_log','Số Km đã đi_log']:
    if c in df_fe.columns:
        extra_cols.append(c)
for c in ['Thương hiệu_label','Dòng xe_label','Loại xe_label','Dung tích xe_label','Xuất xứ_label']:
    if c in df_fe.columns:
        extra_cols.append(c)

# Kept so that any later cell relying on this name keeps working; it is no
# longer used here.
from pyspark.sql.functions import monotonically_increasing_id

# Build text and tabular features from ONE Spark DataFrame by re-applying the
# fitted transformers from the recommendation section (tok, remover, htf,
# idf_model, norm). Joining two separately created DataFrames on
# monotonically_increasing_id() is avoided: those ids depend on partitioning
# and are not guaranteed to line up row-for-row across DataFrames.
sdf_all = spark.createDataFrame(df_fe[['Text_combined'] + extra_cols])
for stage in (tok, remover, htf, idf_model, norm):
    sdf_all = stage.transform(sdf_all)
# Keep only the assembler inputs; this also drops the IDF output column named
# 'features', which would clash with the assembler's output column.
sdf_join = sdf_all.select('normFeatures', *extra_cols).withColumnRenamed('normFeatures', 'textFeat')

# VectorAssembler (requires numeric vector cols)
# textFeat already vector; assembler can combine via inputCols with textFeat and numeric columns
assembler = VectorAssembler(inputCols=['textFeat'] + extra_cols, outputCol='features')
# Cached because every fit and evaluation below re-reads this DataFrame.
sdf_vec = assembler.transform(sdf_join).cache()

# Evaluate K range
# ClusteringEvaluator's default metric is the silhouette score.
evaluator = ClusteringEvaluator(predictionCol='prediction', featuresCol='features')
sil_scores = []
for k in range(2, 11):
    km = SKMeans(k=k, seed=42)
    model = km.fit(sdf_vec)
    preds = model.transform(sdf_vec)
    sil = evaluator.evaluate(preds)
    sil_scores.append((k, sil))
opt_k = max(sil_scores, key=lambda x: x[1])[0]
print('Optimal K (Spark KMeans):', opt_k)

km_model = SKMeans(k=opt_k, seed=42).fit(sdf_vec)
preds_km = km_model.transform(sdf_vec)
km_sil = evaluator.evaluate(preds_km)

# Bisecting KMeans
bkm_scores = []
for k in range(2,11):
    bkm = BisectingKMeans(k=k, seed=42)
    m = bkm.fit(sdf_vec)
    p = m.transform(sdf_vec)
    bkm_scores.append((k, evaluator.evaluate(p)))
opt_b = max(bkm_scores, key=lambda x: x[1])[0]
bkm_model = BisectingKMeans(k=opt_b, seed=42).fit(sdf_vec)
preds_b = bkm_model.transform(sdf_vec)
bkm_sil = evaluator.evaluate(preds_b)

# GMM often yields negative silhouette on sparse text; still include for completeness
opt_g = 2
try:
    gmm_model = SGMM(k=opt_g, seed=42).fit(sdf_vec)
    preds_g = gmm_model.transform(sdf_vec)
    gmm_sil = evaluator.evaluate(preds_g)
except Exception as e:
    # Best effort: report the failure instead of hiding it, and keep the
    # comparison table usable with a NaN entry.
    print(f'Spark GMM failed; silhouette set to NaN: {e}')
    gmm_sil = float('nan')

pd.DataFrame({'Model':['KMeans','BisectingKMeans','GMM'], 'Silhouette':[km_sil, bkm_sil, gmm_sil]})



### Summary & Next Steps
- **Problem 1 (Content-based)**: Implemented **scikit-learn cosine** with parameter tuning (brand-consistency heuristic), a **Gensim** TF–IDF + similarity index, and a **PySpark** pipeline with LSH for scalable nearest-neighbor retrieval.

- **Problem 2 (Clustering)**: Benchmarked **KMeans**, **GMM**, **Agglomerative** (scikit-learn) and **KMeans / BisectingKMeans / GMM** (PySpark).

> You can now package the top-N similar items as an API or integrate into a UI. For segmentation, consider labeling clusters and mapping them to pricing/actions.
