# Component 1: Transforming documents to dense vectors (Word2Vec)

In [3]:
%pip install Tokenizer

import pandas as pd
import sys
sys.path.append('Components')
from Tokenizer import tokenize



Note: you may need to restart the kernel to use updated packages.


## Loading the Dataset and Data Summary

In [4]:
# Read one CSV per decade and tag every row with the decade it came from,
# so the origin is still known once the frames are concatenated.
print("\n[1/5] Loading datasets...")
decades = ('1970s', '1980s', '1990s', '2000s', '2010s', '2020s')
dataframes = []
for decade in decades:
    decade_df = pd.read_csv(f'data/{decade}-movies.csv').assign(decade=decade)
    dataframes.append(decade_df)
    print(f"  ✓ Loaded {len(decade_df)} movies from {decade}")



[1/5] Loading datasets...
  ✓ Loaded 1770 movies from 1970s
  ✓ Loaded 2338 movies from 1980s
  ✓ Loaded 3105 movies from 1990s
  ✓ Loaded 4416 movies from 2000s
  ✓ Loaded 4960 movies from 2010s
  ✓ Loaded 1241 movies from 2020s


In [5]:
# Stack the per-decade frames into one frame with a fresh 0..n-1 index.
all_movies = pd.concat(dataframes, ignore_index=True)
total_loaded = len(all_movies)
print(f"\nTotal movies loaded: {total_loaded}")



Total movies loaded: 17830


In [6]:
# Structural overview of the combined frame, then a peek at the first rows.
print("\n[2/5] Data Exploration...")
print("-" * 80)
print("\nDataset Info:")
column_names = list(all_movies.columns)
missing_per_column = all_movies.isnull().sum().to_dict()
print(f"  - Columns: {column_names}")
print(f"  - Shape: {all_movies.shape}")
print(f"  - Missing values: {missing_per_column}")

print("\n\nFirst 3 movies:")
print("-" * 80)
for rank, position in enumerate(range(3), start=1):
    record = all_movies.iloc[position]
    print(f"\n{rank}. {record['title']} ({record['decade']})")
    # Show at most 150 characters of the plot; mark truncation with an ellipsis.
    plot_text = record['plot']
    if len(plot_text) > 150:
        plot_preview = plot_text[:150] + "..."
    else:
        plot_preview = plot_text
    print(f"   Plot: {plot_preview}")

print("\n\nMovies per decade:")
decade_counts = all_movies['decade'].value_counts().sort_index()
print(decade_counts)


[2/5] Data Exploration...
--------------------------------------------------------------------------------

Dataset Info:
  - Columns: ['title', 'image', 'plot', 'decade']
  - Shape: (17830, 4)
  - Missing values: {'title': 0, 'image': 0, 'plot': 0, 'decade': 0}


First 3 movies:
--------------------------------------------------------------------------------

1. 'Gator Bait (1970s)
   Plot: The film follows a barefoot poacher named Desiree Thibodeau who lives deep in the swampland. Ben Bracken and Deputy Billy Boy find Desiree trapping al...

2. ...And Justice for All (film) (1970s)
   Plot: Arthur Kirkland, a Baltimore defense attorney, is in jail on a contempt of court charge after punching Judge Henry T. Fleming while arguing the case o...

3. 10 (1979 film) (1970s)
   Plot: During a surprise 42nd birthday party for the wealthy and famous composer George Webber thrown by his actress girlfriend Samantha Taylor, George finds...


Movies per decade:
decade
1970s    1770
1980s    2338

In [7]:
# Character length of every plot, stored as a new column and summarised.
print("\n\nPlot length statistics:")
all_movies['plot_length'] = all_movies['plot'].str.len()
plot_lengths = all_movies['plot_length']
length_summary = (
    ("Mean", plot_lengths.mean()),
    ("Median", plot_lengths.median()),
    ("Min", plot_lengths.min()),
    ("Max", plot_lengths.max()),
)
for label, value in length_summary:
    print(f"  - {label}: {value:.0f} characters")




Plot length statistics:
  - Mean: 2700 characters
  - Median: 2867 characters
  - Min: 3 characters
  - Max: 66145 characters


## Tokenization

In [8]:
# Tokenize all documents
print("\n[3/5] Tokenizing documents...")
print("Processing movie titles and plots...")
print("Using NLTK tokenization with stemming (Porter) for optimal IR performance...")


def _tokenize_movie(row):
    """Tokenize one movie row: title and plot joined by a single space.

    Stopword removal and stemming are both enabled, as this output is used
    for indexing.
    """
    document_text = f"{row['title']} {row['plot']}"
    return tokenize(document_text, remove_stopwords=True, apply_stemming=True)


# One token list per movie, stored alongside the original columns.
all_movies['tokens'] = all_movies.apply(_tokenize_movie, axis=1)


[3/5] Tokenizing documents...
Processing movie titles and plots...
Using NLTK tokenization with stemming (Porter) for optimal IR performance...


In [9]:
# Corpus-level totals: number of documents and number of tokens across all of them.
total_tokens = sum(map(len, all_movies['tokens']))
document_count = len(all_movies)
print(f"  ✓ Processed {document_count} documents")
print(f"  ✓ Total tokens: {total_tokens:,}")

# Banner for the analysis cells that follow.
print("\n[4/5] Tokenization Analysis...")
print("-" * 80)

  ✓ Processed 17830 documents
  ✓ Total tokens: 4,577,616

[4/5] Tokenization Analysis...
--------------------------------------------------------------------------------


In [10]:
# Number of tokens in each document, stored as a new column and summarised.
all_movies['token_count'] = all_movies['tokens'].map(len)
token_counts = all_movies['token_count']
print("\nTokens per document statistics:")
print(f"  - Mean: {token_counts.mean():.1f} tokens")
print(f"  - Median: {token_counts.median():.1f} tokens")
print(f"  - Min: {token_counts.min()} tokens")
print(f"  - Max: {token_counts.max()} tokens")



Tokens per document statistics:
  - Mean: 256.7 tokens
  - Median: 271.0 tokens
  - Min: 4 tokens
  - Max: 6243 tokens


In [11]:
# Vocabulary = set of distinct tokens seen in any document.
print("\nBuilding vocabulary...")
vocabulary = {token for tokens in all_movies['tokens'] for token in tokens}
print(f"  ✓ Unique tokens in vocabulary: {len(vocabulary):,}")

# Corpus-wide token frequencies, built from one flat list of every token.
from collections import Counter
all_tokens_flat = []
for tokens in all_movies['tokens']:
    all_tokens_flat.extend(tokens)
token_freq = Counter(all_tokens_flat)

print("\nTop 20 most frequent tokens:")
top_tokens = token_freq.most_common(20)
for token, count in top_tokens:
    print(f"  {token:20s} : {count:6,} occurrences")



Building vocabulary...
  ✓ Unique tokens in vocabulary: 65,429

Top 20 most frequent tokens:
  kill                 : 24,252 occurrences
  find                 : 22,815 occurrences
  film                 : 19,636 occurrences
  take                 : 18,517 occurrences
  tell                 : 17,799 occurrences
  one                  : 17,778 occurrences
  get                  : 17,059 occurrences
  leav                 : 16,972 occurrences
  back                 : 14,523 occurrences
  return               : 13,432 occurrences
  two                  : 13,174 occurrences
  friend               : 12,973 occurrences
  home                 : 12,717 occurrences
  tri                  : 12,515 occurrences
  hous                 : 12,427 occurrences
  father               : 12,423 occurrences
  new                  : 12,150 occurrences
  later                : 12,125 occurrences
  make                 : 12,038 occurrences
  go                   : 11,718 occurrences


In [12]:
# Print the first three documents next to the start of their token lists.
print("\n[5/5] Sample tokenized documents:")
print("-" * 80)
for rank, position in enumerate(range(3), start=1):
    sample = all_movies.iloc[position]
    print(f"\n{rank}. {sample['title']} ({sample['decade']})")
    print(f"   Original plot length: {len(sample['plot'])} chars")
    sample_tokens = sample['tokens']
    # Only the first 15 tokens are shown to keep the output short.
    print(f"   Tokens ({len(sample_tokens)}): {sample_tokens[:15]}...")



[5/5] Sample tokenized documents:
--------------------------------------------------------------------------------

1. 'Gator Bait (1970s)
   Original plot length: 521 chars
   Tokens (59): ['bait', 'film', 'follow', 'barefoot', 'poacher', 'name', 'desire', 'thibodeau', 'live', 'deep', 'swampland', 'ben', 'bracken', 'deputi', 'billi']...

2. ...And Justice for All (film) (1970s)
   Original plot length: 5112 chars
   Tokens (483): ['justic', 'film', 'arthur', 'kirkland', 'baltimor', 'defens', 'attorney', 'jail', 'contempt', 'court', 'charg', 'punch', 'judg', 'henri', 'fleme']...

3. 10 (1979 film) (1970s)
   Original plot length: 3070 chars
   Tokens (284): ['10', '1979', 'film', 'surpris', '42nd', 'birthday', 'parti', 'wealthi', 'famou', 'compos', 'georg', 'webber', 'thrown', 'actress', 'girlfriend']...


## Converting Documents to Dense Vectors with Word2Vec Embeddings

In [14]:
# Train a Word2Vec model, using each movie's token list as one training "sentence".
print("\n[6/6] Training Word2Vec model on tokenized documents...")
from gensim.models import Word2Vec

# Hyper-parameters (reasonable choices for the assignment)
VECTOR_SIZE = 200   # embedding dimension, reused for the document vectors (worth citing in the report)
WINDOW = 5          # local context window
MIN_COUNT = 5       # ignore words that are too rare
WORKERS = 4         # number of threads (adjust to your machine)
SG = 1              # 1 = Skip-gram, 0 = CBOW
# NOTE(review): with WORKERS > 1 the training is presumably not bit-for-bit
# reproducible across runs — confirm against the gensim documentation.

# List of token lists, one entry per movie.
sentences = all_movies['tokens'].tolist()

training_config = dict(
    vector_size=VECTOR_SIZE,
    window=WINDOW,
    min_count=MIN_COUNT,
    workers=WORKERS,
    sg=SG,
)
w2v_model = Word2Vec(sentences=sentences, **training_config)

trained_vocab_size = len(w2v_model.wv.key_to_index)
print("  ✓ Word2Vec model trained.")
print(f"  - Vocabulary size: {trained_vocab_size:,} tokens")
print(f"  - Embedding dimension: {w2v_model.vector_size}")



[6/6] Training Word2Vec model on tokenized documents...


Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_float'
Exception ignored in: 'gensim.models.word2vec_inner.our_dot_fl

  ✓ Word2Vec model trained.
  - Vocabulary size: 29,086 tokens
  - Embedding dimension: 200


In [None]:
import numpy as np
from numpy.linalg import norm

print("\n[7/6] Computing document-level embeddings...")

def document_embedding(tokens, model, vector_size):
    """
    Average the word vectors of a document's tokens into one document vector.

    Tokens that are not keys of ``model.wv.key_to_index`` are skipped.

    Args:
        tokens: iterable of string tokens for the document
        model: object exposing the word vectors as ``model.wv``
            (a trained gensim Word2Vec model in this notebook)
        vector_size: length of the all-zero fallback vector; assumed to
            equal the model's embedding dimension

    Returns:
        1D float32 numpy array: the element-wise mean of the known token
        vectors, or ``vector_size`` zeros when no token is known.
    """
    keyed_vectors = model.wv
    # Look up vectors only for tokens present in the model vocabulary.
    known_vectors = [
        keyed_vectors[token]
        for token in tokens
        if token in keyed_vectors.key_to_index
    ]

    # Guard clause: a document with no known token maps to the zero vector.
    if len(known_vectors) == 0:
        return np.zeros(vector_size, dtype=np.float32)

    stacked = np.vstack(known_vectors)
    return np.mean(stacked, axis=0).astype(np.float32)


# One embedding per movie, in the same row order as all_movies.
doc_vectors = [
    document_embedding(doc_tokens, w2v_model, VECTOR_SIZE)
    for doc_tokens in all_movies['tokens']
]

# Stack the per-document vectors into a matrix of shape (n_docs, VECTOR_SIZE).
doc_matrix = np.vstack(doc_vectors)
print("  ✓ Document embeddings computed.")
print(f"  - Shape of doc_matrix: {doc_matrix.shape}")



[7/6] Computing document-level embeddings...
  ✓ Document embeddings computed.
  - Shape of doc_matrix: (17830, 200)


In [None]:
# Scale every document vector to unit L2 norm.
print("\n[8/6] L2-normalizing document embeddings (for cosine similarity)...")

# Per-row norms, kept as a column so they broadcast across each row.
norms = norm(doc_matrix, axis=1, keepdims=True)
# Avoid division by zero: all-zero rows are divided by 1 and so stay all-zero.
zero_rows = norms == 0.0
norms[zero_rows] = 1.0

doc_matrix_normalized = np.divide(doc_matrix, norms)

norm_before = norm(doc_matrix[0])
norm_after = norm(doc_matrix_normalized[0])
print("  ✓ Normalization done.")
print(f"  - Example norm before: {norm_before:.4f}")
print(f"  - Example norm after:  {norm_after:.4f}")



[8/6] L2-normalizing document embeddings (for cosine similarity)...
  ✓ Normalization done.
  - Example norm before: 1.5168
  - Example norm after:  1.0000


In [None]:
# Save for later components (Vector Quantization, LSH, etc.)
import os

print("\n[9/6] Saving document embeddings to disk...")

# Neither np.save nor DataFrame.to_csv creates missing parent directories,
# so make sure the output folder exists; otherwise a fresh checkout fails
# here with FileNotFoundError. exist_ok=True makes re-running the cell safe.
os.makedirs("data/processed", exist_ok=True)

# Row i of the .npy matrix corresponds to row i of the metadata CSV:
# both are written in all_movies row order.
np.save("data/processed/doc_vectors_w2v.npy", doc_matrix_normalized)
all_movies[['title', 'decade']].to_csv("data/processed/doc_metadata.csv", index=False)

print("  ✓ Saved:")
print("    - data/processed/doc_vectors_w2v.npy (document vectors)")
print("    - data/processed/doc_metadata.csv (titles and metadata)")



[9/6] Saving document embeddings to disk...
  ✓ Saved:
    - data/processed/doc_vectors_w2v.npy (document vectors)
    - data/processed/doc_metadata.csv (titles and metadata)
