## Transformers

Although we have GPU power available, our text fields are quite short, so we use a compact sentence-transformer model rather than a larger transformer; this also keeps experiments quick.

In [28]:
from sentence_transformers import SentenceTransformer
import torch

# Use the GPU when one is present, but fall back to CPU so this cell still
# runs on a machine without CUDA instead of failing at model construction.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
emb_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device=device)

# Cast the weights to half precision only on the GPU; on CPU the model is left
# in its default precision.
if device == 'cuda':
    emb_model.half()

# Last expression: show the module stack as the cell output.
emb_model


SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False, 'architecture': 'BertModel'})
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [None]:
import polars as pl
import numpy as np
# Read the movie table from the CSV under ../data/filtered.
df = pl.read_csv(source="../data/filtered/movies.csv")

In [4]:
# Combine tagline and description into one text field.
# By default concat_str yields null as soon as any input is null, which would
# discard the description of every movie that has no tagline (and vice versa).
# ignore_nulls=True skips the missing part instead, so the remaining text is
# kept and no stray separator is added.
df = df.with_columns(
    pl.concat_str(
        pl.col('tagline'),
        pl.col('description'),
        separator=' ',
        ignore_nulls=True,
    ).alias('text')
)

In [29]:
# Embed every movie text with a single encode call, then attach the vectors
# to the frame as a list-valued column named "text_embeddings".
text_list = df.get_column('text').to_list()
embeddings = emb_model.encode(text_list)
embedding_column = pl.Series(name="text_embeddings", values=embeddings.tolist())
df = df.with_columns(embedding_column)

In [7]:
# Spot-check a single record (row index 1), including its embedding vector.
df.row(index=1)

(1,
 1000004,
 'Fight Club',
 1999,
 'Mischief. Mayhem. Soap.',
 'A ticking-time-bomb insomniac and a slippery soap salesman channel primal male aggression into a shocking new form of therapy. Their concept catches on, with underground "fight clubs" forming in every town, until an eccentric gets in the way and ignites an out-of-control spiral toward oblivion.',
 139,
 'R',
 4.27,
 'Mischief. Mayhem. Soap. A ticking-time-bomb insomniac and a slippery soap salesman channel primal male aggression into a shocking new form of therapy. Their concept catches on, with underground "fight clubs" forming in every town, until an eccentric gets in the way and ignites an out-of-control spiral toward oblivion.',
 [-0.06121826171875,
  -0.00634765625,
  -0.004726409912109375,
  0.008453369140625,
  0.01248931884765625,
  -0.0006728172302246094,
  0.1322021484375,
  0.0238189697265625,
  -0.027557373046875,
  -0.0201873779296875,
  0.0129547119140625,
  -0.0163726806640625,
  0.0278778076171875,
  0.04

In [23]:
# Temporal split: movies dated before VAL_START_YEAR are used for fitting,
# movies from that year onward for validation.
VAL_START_YEAR = 2015
df_train = df.filter(pl.col('date') < VAL_START_YEAR)
df_val = df.filter(pl.col('date') >= VAL_START_YEAR)

# Features are still a single-column array here (one embedding per row);
# they are unpacked into a dense matrix in the following cell.
X_train = df_train.select('text_embeddings').to_numpy()
# Take the target from the Series so it is a 1-D vector of shape (n,).
# DataFrame.select(...).to_numpy() would give an (n, 1) column instead, and the
# fitted regressors would then return 2-D predictions as well.
y_train = df_train['rating'].to_numpy()

X_val = df_val.select('text_embeddings').to_numpy()
y_val = df_val['rating'].to_numpy()



In [24]:
# Build the dense (n_movies, embedding_dim) matrices directly from the split
# frames. Deriving them from df_train / df_val, rather than rewriting
# X_train / X_val from their own previous value, keeps this cell idempotent:
# running it a second time yields the same matrices.
X_train = np.array(df_train['text_embeddings'].to_list())
X_val = np.array(df_val['text_embeddings'].to_list())


X_train.shape

(9094, 384)

In [26]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
import numpy as np
from sklearn.metrics import mean_squared_error


# Linear baselines to compare, keyed by the label printed in the report.
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=0.1),
}

# Fit each baseline on the training split and print its validation RMSE.
for model_name in models:
    model = models[model_name]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)
    print(f"{model_name} RMSE: {rmse}")

# Keep the fitted ridge model for later cells. Ridge is the last entry in
# `models`, so `y_pred` also still holds its validation predictions.
regressor = models['Ridge Regression']

Linear Regression RMSE: 0.5764623933833773
Ridge Regression RMSE: 0.5763471346635302


In [32]:
# Range of the validation predictions in `y_pred`.
# np.min / np.max reduce over every element, so a plain scalar is printed even
# if the predictions are shaped (n, 1); the builtin min/max iterate over rows
# and would print one-element arrays in that case.
print(f" min {np.min(y_pred)}:  max {np.max(y_pred)}")


 min 2.42595460005531:  max 3.685950085988353


# Conclusion:
The text-only model performs worse than the numerical model (it lacks broader context about the movie) and struggles at the extremes of the rating distribution: its validation predictions only span roughly 2.4 to 3.7.

## Extra: validating embeddings with cosine similarity search

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def search_movies_by_keyword(keyword, df, model, regressor, top_k=10):
    """Return the movies whose text embedding is most similar to a keyword.

    Args:
        keyword: Query string; it is embedded with ``model.encode``.
        df: Polars DataFrame holding a ``text_embeddings`` list column plus
            the ``name``, ``date``, ``rating`` and ``text`` columns that are
            selected for the result.
        model: Encoder exposing ``encode(list_of_str)``. Presumably the same
            model that produced ``text_embeddings``; this is not checked here.
        regressor: Fitted estimator exposing ``predict``; used to attach a
            predicted rating to each hit.
        top_k: Maximum number of rows to return.

    Returns:
        A DataFrame with the columns ``name``, ``date``, ``rating``,
        ``predicted_rating``, ``text`` and ``similarity_score``, ordered from
        the most to the least similar movie.
    """
    keyword_embedding = model.encode([keyword])
    movie_embeddings = np.array(df['text_embeddings'].to_list())

    # cosine_similarity returns a (1, n_movies) matrix; keep its single row.
    similarities = cosine_similarity(keyword_embedding, movie_embeddings)[0]
    # argsort is ascending, so reverse it before taking the best top_k.
    top_indices = np.argsort(similarities)[::-1][:top_k]

    # Flatten the predictions: a regressor fitted on an (n, 1) target returns
    # (k, 1) output, which would not become a plain float column.
    predicted_ratings = np.ravel(regressor.predict(movie_embeddings[top_indices]))

    result_df = df[top_indices].with_columns(
        pl.Series("similarity_score", similarities[top_indices]),
        pl.Series("predicted_rating", predicted_ratings),
    )
    return result_df.select(['name', 'date', 'rating', 'predicted_rating', 'text', 'similarity_score'])

# Sanity check of the embeddings: list the ten movies closest to "killer".
killer_movies = search_movies_by_keyword(
    keyword='killer',
    df=df,
    model=emb_model,
    regressor=regressor,
    top_k=10,
)
print("Top 10 movies matching 'killer':")
print(killer_movies)


Top 10 movies matching 'killer':
shape: (10, 6)
┌──────────────────────┬──────┬────────┬──────────────────┬─────────────────────┬──────────────────┐
│ name                 ┆ date ┆ rating ┆ predicted_rating ┆ text                ┆ similarity_score │
│ ---                  ┆ ---  ┆ ---    ┆ ---              ┆ ---                 ┆ ---              │
│ str                  ┆ i64  ┆ f64    ┆ f64              ┆ str                 ┆ f64              │
╞══════════════════════╪══════╪════════╪══════════════════╪═════════════════════╪══════════════════╡
│ Shadowless Sword     ┆ 2005 ┆ 3.23   ┆ 3.144223         ┆ The killer blades   ┆ 0.565168         │
│                      ┆      ┆        ┆                  ┆ versus the k…       ┆                  │
│ The Threat           ┆ 1949 ┆ 3.32   ┆ 3.253464         ┆ KILLER IN JAILBREAK ┆ 0.492794         │
│                      ┆      ┆        ┆                  ┆ on vengean…         ┆                  │
│ Most Likely to Die   ┆ 2015 ┆ 1.85   ┆ 3.