In [1]:
import os
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

# Sentence Embeddings

The goal of this notebook is to explore the use of sentence embeddings to extract features from the 'title' field of the dataset. The `sentence-transformers` library (`SentenceTransformer`) is used to compute the embeddings. The embeddings are then used to train a classifier to predict the 'category' field of the dataset.

In [2]:
# Folder holding the raw ml-100k data files, relative to this notebook.
DATA_FOLDER = '../data/raw/ml-100k'

# u.item is read without a header row, so the column names are supplied here:
# five metadata fields followed by the 0/1 genre indicator columns.
FILM_COLUMNS = [
    'id', 'title', 'release_date', 'video_release_date', 'imdb_url',
    'unknown', 'action', 'adventure', 'animation', 'childrens', 'comedy',
    'crime', 'documentary', 'drama', 'fantasy', 'film-noir', 'horror',
    'musical', 'mystery', 'romance', 'sci-fi', 'thriller', 'war', 'western',
]

films = pd.read_csv(
    os.path.join(DATA_FOLDER, 'u.item'),
    sep='|',
    names=FILM_COLUMNS,
    # NOTE(review): the file is assumed to be ISO-8859-1 text - confirm.
    # 'unicode_escape' decodes high bytes the same way but is meant for Python
    # string-literal escapes: it also interprets backslash sequences, so a
    # title containing a backslash would be corrupted or fail to decode.
    encoding='latin-1',
)
films.describe()

Unnamed: 0,id,video_release_date,unknown,action,adventure,animation,childrens,comedy,crime,documentary,...,fantasy,film-noir,horror,musical,mystery,romance,sci-fi,thriller,war,western
count,1682.0,0.0,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0,...,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0,1682.0
mean,841.5,,0.001189,0.149227,0.080262,0.02497,0.072533,0.300238,0.064804,0.029727,...,0.01308,0.014269,0.054697,0.033294,0.036266,0.146849,0.060048,0.149227,0.042212,0.016052
std,485.695893,,0.034473,0.356418,0.271779,0.156081,0.259445,0.458498,0.246253,0.169882,...,0.11365,0.118632,0.227455,0.179456,0.187008,0.354061,0.237646,0.356418,0.201131,0.125714
min,1.0,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,421.25,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,841.5,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1261.75,,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1682.0,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [3]:
# Transform release_date to a UNIX timestamp: whole seconds since 1970-01-01 (int64).
#
# Casting datetime64[ns] straight to int64 would give *nanoseconds* and would turn
# any missing date (NaT) into the int64 minimum sentinel, so the conversion is
# done explicitly below.
# The guard keeps the cell safe to re-run: once the column holds integers,
# pd.to_datetime would re-read them as nanoseconds.
if not pd.api.types.is_integer_dtype(films['release_date']):
    release_dt = pd.to_datetime(films['release_date'])

    # Impute missing release dates with the median known date so that they do
    # not become an extreme outlier in the numeric feature matrix.
    release_dt = release_dt.fillna(release_dt.median())

    # Epoch seconds; astype raises if a date is still missing (all dates absent).
    films['release_date'] = (
        (release_dt - pd.Timestamp('1970-01-01')) // pd.Timedelta(seconds=1)
    ).astype('int64')

films.dtypes


id                      int64
title                  object
release_date            int64
video_release_date    float64
imdb_url               object
unknown                 int64
action                  int64
adventure               int64
animation               int64
childrens               int64
comedy                  int64
crime                   int64
documentary             int64
drama                   int64
fantasy                 int64
film-noir               int64
horror                  int64
musical                 int64
mystery                 int64
romance                 int64
sci-fi                  int64
thriller                int64
war                     int64
western                 int64
dtype: object

In [4]:
# Load the sentence encoder by its model name and embed every film title.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L12-v2')

# One embedding row per title, returned as a tensor rather than a NumPy array.
titles = films['title'].tolist()
embeddings = model.encode(titles, convert_to_tensor=True)
embeddings.shape

torch.Size([1682, 384])

In [5]:
# Film feature matrix: every remaining column after removing the row identifier,
# the title and URL text columns, and video_release_date.
numeric_films = films.drop(columns=['id', 'video_release_date', 'imdb_url', 'title'])
vals = numeric_films.to_numpy()

In [7]:
# Append the title embeddings to the film features as extra columns.
# The tensor is moved to the CPU and converted to NumPy first.
# NOTE: `vals` is reassigned here, so re-running this cell on its own appends
# the embedding columns a second time.
title_vectors = embeddings.cpu().numpy()
vals = np.concatenate([vals, title_vectors], axis=1)
vals.shape

(1682, 404)

In [8]:
# Load the user table; the file has no header row, so column names are supplied.
user_data = pd.read_csv(
    os.path.join(DATA_FOLDER, 'u.user'),
    sep='|',
    names=['id', 'age', 'gender', 'occupation', 'zip_code'],
)

# One-hot encode the two categorical columns.
occupation = pd.get_dummies(user_data['occupation'])
gender = pd.get_dummies(user_data['gender'])

# Map each zip code to a stable non-negative integer.
# Python's built-in hash() is deliberately not used: for str it is salted per
# interpreter process (unless PYTHONHASHSEED is fixed), so the feature would
# change on every kernel restart. pd.util.hash_array is deterministic.
# Shifting the uint64 digest right by 17 bits keeps 47 bits, which fits in int64.
zip_digest = pd.util.hash_array(user_data['zip_code'].astype(str).to_numpy())
zip_hash = pd.Series(
    (zip_digest >> np.uint64(17)).astype(np.int64),
    index=user_data.index,
    name='zip_code',
)

# Assemble the user feature frame: age, hashed zip code, occupation and gender
# indicators. The user id is left out; rows stay aligned on the shared index.
u = pd.concat([user_data[['age']], zip_hash, occupation, gender], axis=1)
u

Unnamed: 0,age,zip_code,administrator,artist,doctor,educator,engineer,entertainment,executive,healthcare,...,other,programmer,retired,salesman,scientist,student,technician,writer,F,M
0,24,97081439203953,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,True
1,53,123731536730185,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,True,False
2,23,110497903662201,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,True
3,24,34528754647581,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,False,False,True
4,33,89062497916106,False,False,False,False,False,False,False,False,...,True,False,False,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,26,75839534160777,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False
939,32,50504639481037,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,True
940,20,53170602604131,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,True
941,48,121824543812520,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,True,False


In [9]:
# Export the user features as a dense float32 matrix (booleans become 0.0 / 1.0).
# Integers above 2**24, such as the hashed zip codes, are rounded by float32.
un = u.astype(np.float32).to_numpy()
un.shape

(943, 25)

### Remark

In the end, I did not use the sentence embeddings to train the models and simply dropped the 'title' field, because the models' performance did not improve when the embeddings were included. My guess is that a title alone is not descriptive enough to be a useful feature.