### Data preparation

In [1]:
import pandas as pd
from tqdm import tqdm
import unidecode
import re

# Register `progress_apply` on pandas objects, with a labelled progress bar.
tqdm.pandas(desc="Processing")

# Load the input frame that every later cell transforms.
df = pd.read_parquet('../data/raw_data.parquet')

# One stop word per line; surrounding whitespace (including the newline) is trimmed.
with open('../data/stop_words_french.txt', 'r', encoding='utf-8') as file:
    french_stop_words = list(map(str.strip, file))

In [2]:
# Labels that `clean_label` discards outright (compared after stripping whitespace).
blacklisted_labels = [
    'Accueil',
    'Collège',
    'Lycée',
    'Livres',
    'Littérature',
]

# One row per individual label; each row keeps the index of the record it came from,
# which is what allows the labels to be regrouped per record later on.
exploded_labels = df['labels'].explode()

def normalize_text(text):
    """Return a lowercase, ASCII, underscore-separated version of ``text``.

    Strings are lowercased, transliterated with ``unidecode``, and every run
    of whitespace is replaced by a single ``_``; leading and trailing
    underscores are removed. Any non-string value is returned unchanged.
    """
    if not isinstance(text, str):
        return text

    ascii_lowered = unidecode.unidecode(text.lower())
    return re.sub(r'\s+', '_', ascii_lowered).strip('_')

def clean_label(label):
    """Return the normalized form of ``label``, or ``None`` if it must be dropped.

    A label is dropped when it:
      * is not a string (``explode()`` yields NaN/None for empty or missing
        label lists, which previously crashed on ``.strip()``),
      * is empty once surrounding whitespace is stripped,
      * contains a digit, a newline or a tab,
      * is listed in ``blacklisted_labels``.

    Every other label is passed through ``normalize_text``.
    """
    if not isinstance(label, str):
        return None

    label = label.strip()
    if not label:
        return None
    # A single character class covers the digit / newline / tab rejections.
    if re.search(r'[0-9\n\t]', label):
        return None
    if label in blacklisted_labels:
        return None
    return normalize_text(label)

# Clean each label individually; rejected labels come back as None and are dropped.
cleaned_labels = exploded_labels.apply(clean_label).dropna()

# Regroup the surviving labels per original row (the exploded index repeats the row index).
cleaned_labels_grouped = cleaned_labels.groupby(level=0).agg(list)

# Rows whose labels were all rejected are absent from the grouped result and would
# otherwise receive NaN, which cannot be iterated by the label binarizer later on.
# Give those rows an empty list so the column holds lists throughout.
df['labels'] = (
    cleaned_labels_grouped
    .reindex(df.index)
    .apply(lambda kept: kept if isinstance(kept, list) else [])
)

In [3]:
# Expand the per-row `information` mappings into one column per key.
df_info = pd.json_normalize(df['information'].tolist())

# Built from a plain list, the normalized frame has a fresh 0..n-1 index.
# Pin it to df's index so the column-wise concat below pairs rows correctly
# even when df does not use a default RangeIndex.
df_info.index = df.index

df = pd.concat([df.drop(columns=['information']), df_info], axis=1)

In [4]:
# Publication date is parsed as day/month/year.
df['Date de parution'] = pd.to_datetime(df['Date de parution'], format='%d/%m/%Y')

# First run of digits in the page-count text; rows without digits get the sentinel -1.
# expand=False makes str.extract return a Series instead of a one-column DataFrame.
df['Nb. de pages'] = (
    df['Nb. de pages']
    .str.extract(r'(\d+)', expand=False)
    .astype(float)
    .fillna(-1)
    .astype(int)
)

# First run of digits/dots in the weight text, as a float.
df['Poids'] = df['Poids'].str.extract(r'([\d.]+)', expand=False).astype(float)

# 13-digit EANs do not fit in 32 bits; request int64 explicitly because plain `int`
# maps to a 32-bit integer on some platforms (Windows with NumPy < 2) and overflows.
df['EAN'] = df['EAN'].astype('int64')

In [5]:
categorical_columns = ['author', 'Collection', 'Editeur', 'Format', 'Présentation']

# For each categorical column: normalize the text, convert to a pandas category,
# and store the integer category codes in a sibling `<column>_label` column.
for column in categorical_columns:
    normalized = df[column].apply(normalize_text).astype('category')
    df[column] = normalized
    df[column + '_label'] = normalized.cat.codes

# Column names go through the same normalization as the values.
df.columns = list(map(normalize_text, df.columns))

In [6]:
# Three comma-decimal numbers, each followed by " cm" and separated by " × ".
dimensions_pattern = r'(\d+,\d+) cm × (\d+,\d+) cm × (\d+,\d+) cm'
dimension_parts = df['dimensions'].str.extract(dimensions_pattern)

# Capture groups 0, 1 and 2 become width, height and depth respectively;
# the decimal comma is swapped for a dot before the float conversion.
for position, dimension_name in enumerate(['width', 'height', 'depth']):
    df[dimension_name] = dimension_parts[position].str.replace(',', '.').astype(float)

df.drop(columns=['dimensions'], inplace=True)

In [7]:
# Persist the frame as it stands after the preparation cells above.
df.to_parquet('../data/cleaned_data.parquet')

### Embeddings

In [8]:
import torch
from transformers import CamembertModel, CamembertTokenizer

In [9]:
# The tokenizer and the encoder are both loaded from the same checkpoint name.
model_name = 'camembert-base'
model = CamembertModel.from_pretrained(model_name)
tokenizer = CamembertTokenizer.from_pretrained(model_name)

def get_embedding(text, tokenizer, model, max_length):
    """Encode ``text`` and return the hidden state of its first token position.

    Args:
        text: Input passed to ``tokenizer``.
        tokenizer: Callable returning a mapping of PyTorch tensors
            (called with ``return_tensors='pt'``).
        model: Callable accepting that mapping as keyword arguments and
            returning an object exposing ``last_hidden_state``.
        max_length: Truncation limit forwarded to the tokenizer.

    Returns:
        A NumPy array holding ``last_hidden_state[:, 0, :]``, i.e. one row
        per encoded sequence.
    """
    encoded = tokenizer(
        text,
        return_tensors='pt',
        truncation=True,
        padding=True,
        max_length=max_length,
    )

    # Gradients are never needed here; disabling them skips graph construction.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state

    return hidden_states[:, 0, :].numpy()

def apply_embedding(column, tokenizer, model, max_length):
    """Embed every value of ``column`` with ``get_embedding``, showing a progress bar.

    Returns the result of ``column.progress_apply``: one embedding per element.
    """
    def embed(text):
        return get_embedding(text, tokenizer, model, max_length)

    # (Re-)register `progress_apply` on pandas objects before using it.
    tqdm.pandas()
    return column.progress_apply(embed)

# (source column, tokenizer max_length) pairs; results land in `<source>_embedding`.
for source_column, token_limit in (('resume', 512), ('product_title', 64)):
    df[source_column + '_embedding'] = apply_embedding(df[source_column], tokenizer, model, token_limit)

100%|██████████████████████████████████████████████████| 3323/3323 [05:50<00:00,  9.49it/s]
100%|██████████████████████████████████████████████████| 3323/3323 [01:53<00:00, 29.34it/s]


In [10]:
# Turn each embedding array into a flat Python list before writing the frame out.
for embedding_column in ('resume_embedding', 'product_title_embedding'):
    df[embedding_column] = df[embedding_column].map(lambda vector: vector.flatten().tolist())

df.to_parquet('../data/vectorized_data.parquet')

### TF-IDF

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
# Both text fields feed one corpus so they share a single vocabulary.
combined_text = df['resume'].tolist()
combined_text.extend(df['product_title'].tolist())

# `fit` returns the vectorizer itself, so construction and fitting can be chained.
tfidf_vectorizer = TfidfVectorizer(
    max_features=1024,
    stop_words=french_stop_words,
).fit(combined_text)

def apply_tfidf(column, vectorizer):
    """Transform ``column`` with an already-fitted vectorizer.

    Args:
        column: Iterable of documents accepted by ``vectorizer.transform``.
        vectorizer: Fitted object whose ``transform`` returns a matrix
            exposing ``toarray()``.

    Returns:
        A list with one dense row (a list of floats) per document.
    """
    # The whole column is transformed in a single vectorized call, so no
    # progress-bar registration is needed here.
    return vectorizer.transform(column).toarray().tolist()

# Both text columns are projected with the same fitted vectorizer into `<column>_tfidf`.
for text_column in ('resume', 'product_title'):
    df[text_column + '_tfidf'] = apply_tfidf(df[text_column], tfidf_vectorizer)

df.to_parquet('../data/tfidf_data.parquet')



### Recommendations

In [13]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder

In [14]:
def compute_similarity(embeddings):
    """Return the pairwise cosine-similarity matrix of the rows of ``embeddings``."""
    return cosine_similarity(embeddings)

def get_top_n_similar_items(index, similarity_matrix, n=5):
    """Return the positions of the ``n`` items most similar to item ``index``.

    Args:
        index: Row of ``similarity_matrix`` describing the query item.
        similarity_matrix: Square matrix where entry ``[i][j]`` is the
            similarity between items ``i`` and ``j``.
        n: Maximum number of neighbours to return.

    Returns:
        A NumPy array of positions sorted by decreasing similarity. The query
        item itself is never included.
    """
    scores = np.asarray(similarity_matrix[index])

    # Map a negative index to its positive position so the self-filter matches.
    own_position = index % scores.shape[0]

    # Stable sort keeps tie ordering deterministic across runs.
    ranked = np.argsort(scores, kind='stable')[::-1]

    # Remove the query item explicitly instead of assuming it sorts first:
    # duplicated items (similarity 1.0) or an all-zero row break that assumption.
    return ranked[ranked != own_position][:n]

def compute_combined_similarity(df, embedding_weights, categorical_weights, labels_weight, n=5):
    """Build one weighted feature vector per row and rank the rows closest to row 0.

    Feature blocks, concatenated in this order:
      * ``resume_embedding``, ``product_title_embedding``, ``resume_tfidf`` and
        ``product_title_tfidf``, each multiplied by its entry in
        ``embedding_weights`` (keys ``resume``, ``product_title``,
        ``resume_tfidf``, ``product_title_tfidf``);
      * the one-hot encoding of ``author_label``, ``collection_label`` and
        ``editeur_label``, each column's indicators multiplied by the entry of
        the same name in ``categorical_weights``;
      * the multi-label binarization of ``labels`` multiplied by ``labels_weight``.

    Rows are then compared with cosine similarity over the concatenated vector.

    Args:
        df: Frame holding the columns listed above. It is modified in place:
            a ``combined_vector`` column is added (or overwritten).
        embedding_weights: Mapping with the four text-block weights.
        categorical_weights: Mapping from categorical code column name to weight.
        labels_weight: Scalar weight applied to the binarized labels.
        n: Number of neighbours to return for row 0.

    Returns:
        ``(df_cleaned, top_n_similar_items)`` where ``df_cleaned`` is ``df``
        without the embedding/TF-IDF columns and without any column whose name
        contains ``_label``, and ``top_n_similar_items`` holds the positions of
        the rows most similar to the row at position 0.

    Raises:
        KeyError: If a required weight or column is missing.
    """
    # Scale each text block by its weight *before* concatenation. The weights
    # used to be applied to per-block similarity matrices that were computed
    # and then overwritten, so they had no effect on the result.
    text_blocks = [
        np.array(df[column].tolist()) * embedding_weights[weight_key]
        for column, weight_key in (
            ('resume_embedding', 'resume'),
            ('product_title_embedding', 'product_title'),
            ('resume_tfidf', 'resume_tfidf'),
            ('product_title_tfidf', 'product_title_tfidf'),
        )
    ]

    categorical_columns_with_labels = ['author_label', 'collection_label', 'editeur_label']
    encoder = OneHotEncoder(sparse_output=False)
    categorical_labels_one_hot = encoder.fit_transform(df[categorical_columns_with_labels].values)

    # One weight per one-hot output column. Weights are looked up by column
    # name rather than by dict iteration order, so a differently ordered
    # mapping cannot silently swap them.
    weight_vector = np.concatenate([
        np.full(len(categories), categorical_weights[column])
        for column, categories in zip(categorical_columns_with_labels, encoder.categories_)
    ])
    scaled_categorical_labels = categorical_labels_one_hot * weight_vector

    # Rows without a usable label collection (NaN/None) contribute no labels
    # instead of crashing the binarizer, which needs an iterable per row.
    label_lists = [
        list(labels) if isinstance(labels, (list, tuple, set, np.ndarray)) else []
        for labels in df['labels']
    ]
    scaled_labels_binary_matrix = MultiLabelBinarizer().fit_transform(label_lists) * labels_weight

    combined_features = np.hstack((*text_blocks, scaled_categorical_labels, scaled_labels_binary_matrix))

    combined_similarity_matrix = compute_similarity(combined_features)

    # Only the neighbours of the first row are reported.
    top_n_similar_items = get_top_n_similar_items(0, combined_similarity_matrix, n=n)

    df['combined_vector'] = combined_features.tolist()

    columns_to_drop = ['resume_embedding', 'product_title_embedding', 'resume_tfidf', 'product_title_tfidf'] + [col for col in df.columns if '_label' in col]
    df_cleaned = df.drop(columns=columns_to_drop)

    return df_cleaned, top_n_similar_items

In [15]:
# Weights for the four text feature blocks.
embedding_weights = dict(
    resume=0.3,
    product_title=0.3,
    resume_tfidf=0.2,
    product_title_tfidf=0.2,
)

# Weights for the categorical code columns, keyed by column name.
categorical_weights = dict(
    author_label=0.5,
    collection_label=0.3,
    editeur_label=0.2,
)

labels_weight = 1

df_cleaned, top_n_similar_items = compute_combined_similarity(
    df,
    embedding_weights,
    categorical_weights,
    labels_weight,
    n=5,
)

In [16]:
# Persist the frame returned by compute_combined_similarity.
df_cleaned.to_parquet('../data/combined_data.parquet')