In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import os
import numpy as np
from scipy.sparse import save_npz, load_npz
import pickle

# Bag-of-Words

In [2]:
def load_docs(category_path):
    documents = []
    filenames = []

    for filename in sorted(os.listdir(category_path)):
        if filename.endswith('.txt'):
            with open(os.path.join(category_path, filename), 'r', encoding='utf-8', errors='ignore') as f:
                documents.append(f.read().lower())  
                filenames.append(filename)
    return documents, filenames

In [3]:
romantic_docs, romantic_files = load_docs('corpus/romantic_blocks')
scifi_docs, scifi_files = load_docs('corpus/scifi_blocks')

In [4]:
all_docs = romantic_docs + scifi_docs
labels = ['romantic'] * len(romantic_docs) + ['scifi'] * len(scifi_docs)

In [5]:
print(f"Total doc: {len(all_docs)}")
print(f"Romantic: {len(romantic_docs)}")
print(f"Sci-Fi: {len(scifi_docs)}")

Total doc: 11420
Romantic: 5382
Sci-Fi: 6038


In [6]:
vectorizer = CountVectorizer(
    max_features= 5000,
    stop_words= 'english',
    min_df= 5,
    max_df= 0.8,
    token_pattern= r'\b[a-zA-Z]{2,}\b'
)

In [7]:
bow_matrix = vectorizer.fit_transform(all_docs) 
feature_names = vectorizer.get_feature_names_out()

In [10]:
print(f"\nBag-of-Words shape: {bow_matrix.shape}")
print(f"Words count: {len(feature_names)}")
print(f"Sparse matrix type: {type(bow_matrix)}")


Bag-of-Words shape: (11420, 2547)
Words count: 2547
Sparse matrix type: <class 'scipy.sparse._csr.csr_matrix'>


In [11]:
os.makedirs('processed_data', exist_ok=True)

In [12]:
save_npz('processed_data/bow_matrix.npz', bow_matrix)

In [13]:
with open('processed_data/feature_names.pkl', 'wb') as f:
    pickle.dump(feature_names, f)

with open('processed_data/labels.pkl', 'wb') as f:
    pickle.dump(labels, f)

with open('processed_data/vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [14]:
print(f"The most common 10 words:")
word_counts = np.array(bow_matrix.sum(axis=0)).flatten()
top_indices = word_counts.argsort()[-10:][::-1]
for idx in top_indices:
    print(f"  {feature_names[idx]}: {word_counts[idx]}")

The most common 10 words:
  looks: 593
  jake: 486
  charlie: 415
  cooper: 407
  protagonist: 365
  like: 364
  rose: 348
  hazel: 326
  door: 277
  room: 275


# Naive Bayes

In [16]:
labels_array = np.array(labels)

In [21]:
romantics= np.where(labels_array == 'romantic')[0]

In [22]:
scifis= np.where(labels_array == 'scifi')[0]

In [27]:
romantic_bow = bow_matrix[romantics]  
romantic_word_counts = np.array(romantic_bow.sum(axis=0)).flatten()

In [26]:
scifi_bow = bow_matrix[scifis]
scifi_word_counts = np.array(scifi_bow.sum(axis=0)).flatten()

In [28]:
vocabulary_size = len(feature_names)

In [29]:
total_romantic_words = romantic_word_counts.sum()
print(f"Total romantic words: {total_romantic_words}")

Total romantic words: 24644


In [30]:
P_w_romantic = (romantic_word_counts + 1) / (total_romantic_words + vocabulary_size)

In [35]:
print(f"P_w_romantic shape: {P_w_romantic.shape}")
print(f"First 5 prob: {P_w_romantic[:5]}")

P_w_romantic shape: (2547,)
First 5 prob: [1.47107499e-04 2.20661248e-04 3.67768747e-05 1.83884374e-04
 1.47107499e-04]


In [36]:
total_scifi_words = scifi_word_counts.sum()
print(f"Total scifi words: {total_scifi_words}")

Total scifi words: 26906


In [37]:
P_w_scifi = (scifi_word_counts + 1) / (total_scifi_words + vocabulary_size)

In [38]:
print(f"P_w_scifi shape: {P_w_scifi.shape}")
print(f"First 5 prob: {P_w_scifi[:5]}")

P_w_scifi shape: (2547,)
First 5 prob: [0.00013581 0.00010186 0.00020371 0.00020371 0.00033952]


In [39]:
llr_romantic = np.log(P_w_romantic) - np.log(P_w_scifi)

In [41]:
print(f"llr_romantic shape: {llr_romantic.shape}")
print(f"First 5 LLR: {llr_romantic[:5]}")

llr_romantic shape: (2547,)
First 5 LLR: [ 0.07990974  0.77305692 -1.71184973 -0.10241182 -0.836381  ]


In [42]:
top_10_romantic_indices = np.argsort(llr_romantic)[-10:][::-1]

In [58]:
print("TOP 10 ROMANTIC WORDS:")
for idx in top_10_romantic_indices:
    word = feature_names[idx]
    llr_value = llr_romantic[idx]
    print(f"  {word}: {llr_value:.4f}")

TOP 10 ROMANTIC WORDS:
  charlie: 6.1106
  rose: 5.9350
  hazel: 5.8699
  jack: 5.6930
  elizabeth: 5.5180
  patrick: 5.3832
  gus: 5.2332
  sam: 5.0635
  cal: 4.7620
  mr: 4.7146


In [59]:
top_10_scifi_indices = np.argsort(llr_romantic)[:10]  

In [61]:
print("TOP 10 SCI-FI WORDS:")
for idx in top_10_scifi_indices:
    word = feature_names[idx]
    llr_value = llr_romantic[idx]
    print(f"  {word}: {llr_value:.4f}")

TOP 10 SCI-FI WORDS:
  jake: -6.1084
  cooper: -5.9314
  protagonist: -5.8227
  korben: -5.1130
  neytiri: -4.8968
  neil: -4.6475
  sator: -4.5645
  mark: -4.4896
  leeloo: -4.4634
  quaritch: -4.2895


In [118]:
character_names = {
    # Romantic
    'charlie', 'rose', 'hazel', 'jack', 'elizabeth', 'patrick', 'gus', 'sam', 'cal', 'mr', 'mrs',
    'darcy', 'bennet', 'bianca', 'jane', 'fabrizio', 'lovejoy', 'bingley', 'frannie',
    'wickham', 'lydia', 'kitty', 'collins', 'catherine', 'georgiana', 'lizzy', 'fitzwilliam',
    'trudy', 'ruth', 'ismay', 'murdoch', 'wilde', 'molly', 'tommy', 'caledon', 'hockley',
    'isaac', 'peter', 'brad', 'candace', 'mary', 'bob', 'bill', 'kate', 'craig', 'alice',
    'michael', 'cameron', 'joey', 'andrews', 'lizzie', 'lovett', 'mandella',
    'smith', 'charlotte', 'brock', 'houten', 'lightoller',
    'titanic', 'helen', 'derek', 'max', 'grove','thomas',
    # Sci-Fi  
    'jake', 'cooper', 'protagonist', 'korben', 'neytiri', 'neil', 'sator', 'mark', 'leeloo', 'quaritch',
    'grace', 'norm', 'tsu', 'eytukan', 'selfridge', 'martinez', 'munro', 'arepo',
    'tey', 'murph', 'doyle', 'roth', 'trudy', 'spellman', 'wainfleet', 'chacon',
    'brand', 'romilly', 'case', 'tars', 'mann', 'johanssen', 'vogel', 'beck', 'lewis',
    'kat', 'priya', 'mahir', 'crosby', 'wheeler', 'ives', 'rohan', 'steward',
    'ruby', 'zorg', 'cornelius', 'vito', 'dallas', 'david', 'priest', 'lindberg', 'mcquarrie',
    'vincent', 'rhod', 'loc', 'volkov', 'vi', 'na',
    'avatar', 'samson'  
}

In [119]:
character_indices = []
for name in character_names:
    if name in feature_names:
        idx = np.where(feature_names == name)[0]
        if len(idx) > 0:
            character_indices.append(idx[0])

In [120]:
mask = np.ones(len(feature_names), dtype=bool)
mask[character_indices] = False

In [121]:
filtered_feature_names = feature_names[mask]
filtered_log_odds = log_odds_romantic[mask]

In [122]:
top_10_filtered_romantic = np.argsort(filtered_log_odds)[-10:][::-1]

In [123]:
print("TOP 10 ROMANTIC WORDS :")
for idx in top_10_filtered_romantic:
    word = filtered_feature_names[idx]
    log_odds_value = filtered_log_odds[idx]
    print(f"  {word}: {log_odds_value:.4f}")

TOP 10 ROMANTIC WORDS :
  bedroom: 8.2767
  aft: 8.0605
  women: 7.9122
  steerage: 7.8288
  letter: 7.7379
  aunt: 7.7379
  afternoon: 7.6891
  sir: 7.5838
  chastity: 7.5838
  movie: 7.5267


In [124]:
top_10_filtered_scifi = np.argsort(filtered_log_odds)[:10]

In [125]:
print("TOP 10 SCI-FI WORDS :")
for idx in top_10_filtered_scifi:
    word = filtered_feature_names[idx]
    log_odds_value = filtered_log_odds[idx]
    print(f"  {word}: {log_odds_value:.4f}")

TOP 10 SCI-FI WORDS :
  probe: -8.3463
  forest: -8.2789
  airlock: -8.2313
  banshee: -8.1286
  link: -8.1286
  hunters: -8.1012
  troopers: -7.8500
  hab: -7.8500
  president: -7.7367
  mask: -7.6959


# Topic Modeling

In [126]:
from sklearn.decomposition import LatentDirichletAllocation

In [139]:
n_topics = 10

In [140]:
lda_model = LatentDirichletAllocation(
    n_components=n_topics,
    random_state=42,
    max_iter=20,
    learning_method='online'
)

In [141]:
lda_model.fit(bow_matrix)

0,1,2
,n_components,10
,doc_topic_prior,
,topic_word_prior,
,learning_method,'online'
,learning_decay,0.7
,learning_offset,10.0
,max_iter,20
,batch_size,128
,evaluate_every,-1
,total_samples,1000000.0


In [142]:
print(f"Topics: {n_topics}")

Topics: 10


In [143]:
def display_topics(model, feature_names, no_top_words=15):
    topics = []
    for topic_idx, topic in enumerate(model.components_):
        
        top_word_indices = topic.argsort()[-no_top_words:][::-1]
        top_words = [feature_names[i] for i in top_word_indices]
        top_probs = [topic[i] for i in top_word_indices]
        
        topics.append({
            'topic_id': topic_idx,
            'words': top_words,
            'probs': top_probs
        })
        
        print(f"\nTopic {topic_idx}:")
        for word, prob in zip(top_words, top_probs):
            print(f"  {word:15s}: {prob:.4f}")
    
    return topics

In [144]:
topics = display_topics(lda_model, feature_names, no_top_words=15)


Topic 0:
  elizabeth      : 195.7857
  hand           : 155.4375
  opens          : 132.5817
  way            : 113.8671
  goes           : 111.1198
  past           : 100.0400
  darcy          : 86.1419
  window         : 75.9207
  run            : 73.3999
  white          : 71.6257
  camera         : 69.2896
  red            : 66.1942
  bow            : 64.4657
  mary           : 61.9589
  planet         : 61.2053

Topic 1:
  protagonist    : 374.2566
  kat            : 223.9074
  away           : 204.6693
  eyes           : 202.7061
  turns          : 170.9503
  just           : 163.8363
  watches        : 163.5840
  sator          : 142.8649
  look           : 140.5029
  black          : 138.2274
  neil           : 123.3118
  small          : 95.0876
  stares         : 89.3679
  forward        : 87.2630
  stops          : 84.1634

Topic 2:
  room           : 258.0809
  water          : 212.5804
  sits           : 155.9062
  walks          : 133.8429
  neytiri        : 110.7835
  j

In [145]:
topic_labels = {
    0: "Pride & Prejudice Scenes",
    1: "Tenet - Action & Time",
    2: "Avatar - Pandora Life", 
    3: "Fifth Element & Charlie",
    4: "Interstellar - Space Mission",
    5: "Mixed Romantic Moments",
    6: "General Action Scenes",
    7: "Titanic & Emotional Scenes",
    8: "The Martian & Avatar Mix",
    9: "Titanic - Ship & Sea"
}

In [146]:
topic_table_data = []

In [147]:
for topic_idx in range(10):
    topic = lda_model.components_[topic_idx]
    top_indices = topic.argsort()[-10:][::-1]  
    
    row = {
        'Topic ID': topic_idx,
        'Topic Label': topic_labels[topic_idx]
    }
    
    for rank, idx in enumerate(top_indices, 1):
        word = feature_names[idx]
        prob = topic[idx]
        row[f'Word {rank}'] = f"{word} ({prob:.4f})"
    
    topic_table_data.append(row)


In [149]:
import pandas as pd

In [150]:
topic_table = pd.DataFrame(topic_table_data)

In [152]:
print(topic_table.to_string(index=False))

 Topic ID                  Topic Label                 Word 1             Word 2             Word 3           Word 4             Word 5            Word 6             Word 7             Word 8           Word 9           Word 10
        0     Pride & Prejudice Scenes   elizabeth (195.7857)    hand (155.4375)   opens (132.5817)   way (113.8671)    goes (111.1198)   past (100.0400)    darcy (86.1419)   window (75.9207)    run (73.3999)   white (71.6257)
        1        Tenet - Action & Time protagonist (374.2566)     kat (223.9074)    away (204.6693)  eyes (202.7061)   turns (170.9503)   just (163.8363) watches (163.5840)   sator (142.8649)  look (140.5029)  black (138.2274)
        2        Avatar - Pandora Life        room (258.0809)   water (212.5804)    sits (155.9062) walks (133.8429) neytiri (110.7835)   jake (107.3246) suddenly (97.2025)     gets (92.0037)     mr (88.3556)    puts (82.5973)
        3      Fifth Element & Charlie        sees (178.6797) charlie (176.2401)     int (15

In [153]:
topic_table.to_csv('topic_modeling_results.csv', index=False)

In [154]:
doc_topic_dist = lda_model.transform(bow_matrix)

In [156]:
print(f"doc_topic_dist shape: {doc_topic_dist.shape}")
print(f"First doc's topic dist:")
print(doc_topic_dist[0])
print(f"Total: {doc_topic_dist[0].sum()}")  

doc_topic_dist shape: (11420, 10)
First doc's topic dist:
[0.03333333 0.03333333 0.03333333 0.7        0.03333333 0.03333333
 0.03333333 0.03333333 0.03333333 0.03333333]
Total: 1.0


In [157]:
romantic_indices = np.where(labels_array == 'romantic')[0]
scifi_indices = np.where(labels_array == 'scifi')[0]

In [158]:
romantic_topic_dist = doc_topic_dist[romantic_indices].mean(axis=0)
scifi_topic_dist = doc_topic_dist[scifi_indices].mean(axis=0)

In [164]:
print("ROMANTIC CATEGORY - Average Topic Distribution:")
for topic_idx in range(10):
    print(f"Topic {topic_idx} ({topic_labels[topic_idx]:30s}): {romantic_topic_dist[topic_idx]:.4f}")

print("-"*50)

print("SCI-FI CATEGORY - Average Topic Distribution:")
for topic_idx in range(10):
    print(f"Topic {topic_idx} ({topic_labels[topic_idx]:30s}): {scifi_topic_dist[topic_idx]:.4f}")


ROMANTIC CATEGORY - Average Topic Distribution:
Topic 0 (Pride & Prejudice Scenes      ): 0.0997
Topic 1 (Tenet - Action & Time         ): 0.1054
Topic 2 (Avatar - Pandora Life         ): 0.0956
Topic 3 (Fifth Element & Charlie       ): 0.0992
Topic 4 (Interstellar - Space Mission  ): 0.0910
Topic 5 (Mixed Romantic Moments        ): 0.1090
Topic 6 (General Action Scenes         ): 0.1000
Topic 7 (Titanic & Emotional Scenes    ): 0.1198
Topic 8 (The Martian & Avatar Mix      ): 0.0759
Topic 9 (Titanic - Ship & Sea          ): 0.1043
--------------------------------------------------
SCI-FI CATEGORY - Average Topic Distribution:
Topic 0 (Pride & Prejudice Scenes      ): 0.0829
Topic 1 (Tenet - Action & Time         ): 0.1185
Topic 2 (Avatar - Pandora Life         ): 0.0917
Topic 3 (Fifth Element & Charlie       ): 0.0847
Topic 4 (Interstellar - Space Mission  ): 0.1257
Topic 5 (Mixed Romantic Moments        ): 0.0984
Topic 6 (General Action Scenes         ): 0.0894
Topic 7 (Titanic & Emo

In [166]:
print("TOP 5 TOPICS PER CATEGORY:")
top_5_romantic = romantic_topic_dist.argsort()[-5:][::-1]
print("\nROMANTIC - Top 5 Topics:")
for rank, topic_idx in enumerate(top_5_romantic, 1):
    print(f"  {rank}. Topic {topic_idx} ({topic_labels[topic_idx]}): {romantic_topic_dist[topic_idx]:.4f}")

top_5_scifi = scifi_topic_dist.argsort()[-5:][::-1]
print("\nSCI-FI - Top 5 Topics:")
for rank, topic_idx in enumerate(top_5_scifi, 1):
    print(f"  {rank}. Topic {topic_idx} ({topic_labels[topic_idx]}): {scifi_topic_dist[topic_idx]:.4f}")

TOP 5 TOPICS PER CATEGORY:

ROMANTIC - Top 5 Topics:
  1. Topic 7 (Titanic & Emotional Scenes): 0.1198
  2. Topic 5 (Mixed Romantic Moments): 0.1090
  3. Topic 1 (Tenet - Action & Time): 0.1054
  4. Topic 9 (Titanic - Ship & Sea): 0.1043
  5. Topic 6 (General Action Scenes): 0.1000

SCI-FI - Top 5 Topics:
  1. Topic 4 (Interstellar - Space Mission): 0.1257
  2. Topic 8 (The Martian & Avatar Mix): 0.1189
  3. Topic 1 (Tenet - Action & Time): 0.1185
  4. Topic 9 (Titanic - Ship & Sea): 0.1057
  5. Topic 5 (Mixed Romantic Moments): 0.0984


# Experimentation

### Experiment 1: Stemming/Lemmatization

In [173]:
!pip install nltk

Collecting nltk
  Downloading nltk-3.9.2-py3-none-any.whl.metadata (3.2 kB)
Collecting regex>=2021.8.3 (from nltk)
  Downloading regex-2026.1.15-cp313-cp313-win_amd64.whl.metadata (41 kB)
Downloading nltk-3.9.2-py3-none-any.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 1.5/1.5 MB 19.6 MB/s  0:00:00
Downloading regex-2026.1.15-cp313-cp313-win_amd64.whl (277 kB)
Installing collected packages: regex, nltk

   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   -------------------- ------------------- 1/2 [nltk]
   ---------------


[notice] A new release of pip is available: 25.3 -> 26.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [174]:
import nltk
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\merve\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [175]:
stemmer = PorterStemmer()

In [176]:
def stem_text(text):
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    return ' '.join(stemmed_words)

In [177]:
romantic_docs_stemmed = [stem_text(doc.lower()) for doc in romantic_docs]
scifi_docs_stemmed = [stem_text(doc.lower()) for doc in scifi_docs]
all_docs_stemmed = romantic_docs_stemmed + scifi_docs_stemmed

In [179]:
vectorizer_stemmed = CountVectorizer(
    max_features=5000,
    stop_words='english',
    min_df=5,
    max_df=0.8,
    token_pattern=r'\b[a-zA-Z]{2,}\b'
)

In [180]:
bow_matrix_stemmed = vectorizer_stemmed.fit_transform(all_docs_stemmed)
feature_names_stemmed = vectorizer_stemmed.get_feature_names_out()

In [182]:
print(f"Stemmed BOW shape: {bow_matrix_stemmed.shape}")
print(f"Words count: {len(feature_names_stemmed)}")

Stemmed BOW shape: (11420, 2301)
Words count: 2301


In [183]:
lda_model_stemmed = LatentDirichletAllocation(
    n_components=10,
    random_state=42,
    max_iter=20,
    learning_method='online'
)

In [184]:
lda_model_stemmed.fit(bow_matrix_stemmed)

0,1,2
,n_components,10
,doc_topic_prior,
,topic_word_prior,
,learning_method,'online'
,learning_decay,0.7
,learning_offset,10.0
,max_iter,20
,batch_size,128
,evaluate_every,-1
,total_samples,1000000.0


In [185]:
print("\nTop 10 words per topic (STEMMED):")
for topic_idx, topic in enumerate(lda_model_stemmed.components_):
    top_indices = topic.argsort()[-10:][::-1]
    top_words = [feature_names_stemmed[i] for i in top_indices]
    print(f"\nTopic {topic_idx}: {', '.join(top_words)}")


Top 10 words per topic (STEMMED):

Topic 0: look, room, charli, kat, elizabeth, sit, moment, hi, patrick, mr

Topic 1: turn, hi, face, smile, feet, follow, girl, look, jake, littl

Topic 2: door, open, ha, mark, water, tri, close, hi, grace, way

Topic 3: come, thi, light, ship, look, time, hit, cooper, wa, quaritch

Topic 4: hi, like, pull, hand, int, black, deck, push, sator, cut

Topic 5: jake, hold, eye, begin, look, right, reach, start, stop, gun

Topic 6: hazel, stare, fall, roll, jake, hear, point, screen, gu, look

Topic 7: hi, protagonist, head, neil, arm, boat, drop, abov, glass, ive

Topic 8: rose, jack, run, watch, man, cal, past, know, hole, crew

Topic 9: cooper, hi, away, brand, look, case, step, check, just, wall


In [186]:
doc_topic_dist_stemmed = lda_model_stemmed.transform(bow_matrix_stemmed)

In [187]:
romantic_indices = np.where(labels_array == 'romantic')[0]
scifi_indices = np.where(labels_array == 'scifi')[0]

In [188]:
romantic_topic_dist_stemmed = doc_topic_dist_stemmed[romantic_indices].mean(axis=0)
scifi_topic_dist_stemmed = doc_topic_dist_stemmed[scifi_indices].mean(axis=0)

In [190]:
print("ROMANTIC - Top 5 Topics (STEMMED):")
top_5_romantic_stemmed = romantic_topic_dist_stemmed.argsort()[-5:][::-1]
for rank, topic_idx in enumerate(top_5_romantic_stemmed, 1):
    print(f"  {rank}. Topic {topic_idx}: {romantic_topic_dist_stemmed[topic_idx]:.4f}")

print("\nSCI-FI - Top 5 Topics (STEMMED):")
top_5_scifi_stemmed = scifi_topic_dist_stemmed.argsort()[-5:][::-1]
for rank, topic_idx in enumerate(top_5_scifi_stemmed, 1):
    print(f"  {rank}. Topic {topic_idx}: {scifi_topic_dist_stemmed[topic_idx]:.4f}")

ROMANTIC - Top 5 Topics (STEMMED):
  1. Topic 0: 0.1372
  2. Topic 8: 0.1197
  3. Topic 4: 0.1083
  4. Topic 2: 0.1046
  5. Topic 9: 0.0969

SCI-FI - Top 5 Topics (STEMMED):
  1. Topic 9: 0.1238
  2. Topic 2: 0.1134
  3. Topic 4: 0.1059
  4. Topic 5: 0.1054
  5. Topic 1: 0.0966


### Experiment 2: TF-IDF Representation

In [191]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [192]:
tfidf_vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
    min_df=5,
    max_df=0.8,
    token_pattern=r'\b[a-zA-Z]{2,}\b'
)

In [193]:
tfidf_matrix = tfidf_vectorizer.fit_transform(all_docs)
feature_names_tfidf = tfidf_vectorizer.get_feature_names_out()

In [195]:
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"Words count: {len(feature_names_tfidf)}")

TF-IDF matrix shape: (11420, 2547)
Words count: 2547


In [196]:
lda_model_tfidf = LatentDirichletAllocation(
    n_components=10,
    random_state=42,
    max_iter=20,
    learning_method='online'
)

In [197]:
lda_model_tfidf.fit(tfidf_matrix)

0,1,2
,n_components,10
,doc_topic_prior,
,topic_word_prior,
,learning_method,'online'
,learning_decay,0.7
,learning_offset,10.0
,max_iter,20
,batch_size,128
,evaluate_every,-1
,total_samples,1000000.0


In [198]:
print("\nTop 10 words per topic (TF-IDF):")
for topic_idx, topic in enumerate(lda_model_tfidf.components_):
    top_indices = topic.argsort()[-10:][::-1]
    top_words = [feature_names_tfidf[i] for i in top_indices]
    print(f"\nTopic {topic_idx}: {', '.join(top_words)}")


Top 10 words per topic (TF-IDF):

Topic 0: case, opens, black, door, window, white, camera, going, planet, red

Topic 1: turns, protagonist, pulls, away, watches, kat, look, sees, takes, room

Topic 2: rose, eyes, light, goes, suddenly, gets, mr, puts, reaches, large

Topic 3: water, int, run, air, high, gun, young, come, picks, hits

Topic 4: elizabeth, mark, begins, man, steps, looking, moves, grace, right, wall

Topic 5: night, time, close, feet, little, old, floor, pushes, looks, walks

Topic 6: hand, face, way, starts, holds, tries, hole, small, arms, trying

Topic 7: brand, moment, grabs, cooper, jake, hands, head, stands, ice, checks

Topic 8: table, ground, screen, blue, runs, standing, falls, later, control, points

Topic 9: like, deck, watch, comes, boat, inside, ship, truck, tiny, massive


In [199]:
doc_topic_dist_tfidf = lda_model_tfidf.transform(tfidf_matrix)

In [200]:
romantic_topic_dist_tfidf = doc_topic_dist_tfidf[romantic_indices].mean(axis=0)
scifi_topic_dist_tfidf = doc_topic_dist_tfidf[scifi_indices].mean(axis=0)

In [201]:
print("ROMANTIC - Top 5 Topics (TF-IDF):")
top_5_romantic_tfidf = romantic_topic_dist_tfidf.argsort()[-5:][::-1]
for rank, topic_idx in enumerate(top_5_romantic_tfidf, 1):
    print(f"  {rank}. Topic {topic_idx}: {romantic_topic_dist_tfidf[topic_idx]:.4f}")

print("\nSCI-FI - Top 5 Topics (TF-IDF):")
top_5_scifi_tfidf = scifi_topic_dist_tfidf.argsort()[-5:][::-1]
for rank, topic_idx in enumerate(top_5_scifi_tfidf, 1):
    print(f"  {rank}. Topic {topic_idx}: {scifi_topic_dist_tfidf[topic_idx]:.4f}")

ROMANTIC - Top 5 Topics (TF-IDF):
  1. Topic 1: 0.1230
  2. Topic 9: 0.1113
  3. Topic 6: 0.1110
  4. Topic 5: 0.1089
  5. Topic 2: 0.1007

SCI-FI - Top 5 Topics (TF-IDF):
  1. Topic 1: 0.1216
  2. Topic 7: 0.1105
  3. Topic 5: 0.1057
  4. Topic 4: 0.1046
  5. Topic 0: 0.0999


### Experiment 3: Agresif Stopword Filtering

In [204]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [203]:
character_stopwords = list(character_names) 

In [205]:
custom_stopwords = list(ENGLISH_STOP_WORDS) + character_stopwords

In [206]:
vectorizer_aggressive = CountVectorizer(
    max_features=5000,
    stop_words=custom_stopwords,
    min_df=10,        
    max_df=0.7,       
    token_pattern=r'\b[a-zA-Z]{3,}\b'  
)

In [207]:
bow_matrix_aggressive = vectorizer_aggressive.fit_transform(all_docs)
feature_names_aggressive = vectorizer_aggressive.get_feature_names_out()

In [208]:
print(f"Aggressive BOW shape: {bow_matrix_aggressive.shape}")
print(f"Word count: {len(feature_names_aggressive)}")

Aggressive BOW shape: (11420, 1209)
Word count: 1209


In [209]:
lda_model_aggressive = LatentDirichletAllocation(
    n_components=10,
    random_state=42,
    max_iter=20,
    learning_method='online'
)

In [210]:
lda_model_aggressive.fit(bow_matrix_aggressive)

0,1,2
,n_components,10
,doc_topic_prior,
,topic_word_prior,
,learning_method,'online'
,learning_decay,0.7
,learning_offset,10.0
,max_iter,20
,batch_size,128
,evaluate_every,-1
,total_samples,1000000.0


In [211]:
print("\nTop 10 words per topic (AGGRESSIVE FILTERING):")
for topic_idx, topic in enumerate(lda_model_aggressive.components_):
    top_indices = topic.argsort()[-10:][::-1]
    top_words = [feature_names_aggressive[i] for i in top_indices]
    print(f"\nTopic {topic_idx}: {', '.join(top_words)}")


Top 10 words per topic (AGGRESSIVE FILTERING):

Topic 0: pulls, black, man, small, people, smiles, truck, nods, gives, huge

Topic 1: hand, hands, begins, starts, stands, comes, tries, gun, screen, glass

Topic 2: head, deck, walks, night, wall, old, little, arm, run, white

Topic 3: grabs, goes, time, house, checks, puts, trying, reaches, high, backwards

Topic 4: room, water, int, just, boat, suddenly, table, long, ground, day

Topic 5: sees, gets, stares, standing, pushes, phone, young, falls, later, sits

Topic 6: like, face, looking, look, turns, moment, ice, runs, stops, window

Topic 7: looks, away, watches, open, inside, moves, blue, going, holds, arms

Topic 8: door, opens, steps, right, past, watch, forward, come, hole, car

Topic 9: ship, eyes, takes, way, light, feet, close, large, crew, floor


In [212]:
doc_topic_dist_aggressive = lda_model_aggressive.transform(bow_matrix_aggressive)

In [213]:
romantic_topic_dist_aggressive = doc_topic_dist_aggressive[romantic_indices].mean(axis=0)
scifi_topic_dist_aggressive = doc_topic_dist_aggressive[scifi_indices].mean(axis=0)

In [214]:
print("ROMANTIC - Top 5 Topics (AGGRESSIVE):")
top_5_romantic_aggressive = romantic_topic_dist_aggressive.argsort()[-5:][::-1]
for rank, topic_idx in enumerate(top_5_romantic_aggressive, 1):
    print(f"  {rank}. Topic {topic_idx}: {romantic_topic_dist_aggressive[topic_idx]:.4f}")

print("\nSCI-FI - Top 5 Topics (AGGRESSIVE):")
top_5_scifi_aggressive = scifi_topic_dist_aggressive.argsort()[-5:][::-1]
for rank, topic_idx in enumerate(top_5_scifi_aggressive, 1):
    print(f"  {rank}. Topic {topic_idx}: {scifi_topic_dist_aggressive[topic_idx]:.4f}")

ROMANTIC - Top 5 Topics (AGGRESSIVE):
  1. Topic 4: 0.1149
  2. Topic 7: 0.1114
  3. Topic 2: 0.1035
  4. Topic 9: 0.1024
  5. Topic 6: 0.1018

SCI-FI - Top 5 Topics (AGGRESSIVE):
  1. Topic 9: 0.1167
  2. Topic 7: 0.1137
  3. Topic 6: 0.1126
  4. Topic 0: 0.1005
  5. Topic 1: 0.0979


In [220]:
comparison_data = {
    'Experiment': [
        'BASELINE (Counts)',
        'STEMMING',
        'TF-IDF',
        'AGGRESSIVE FILTERING'
    ],
    'Vocabulary Size': [
        len(feature_names),
        len(feature_names_stemmed),
        len(feature_names_tfidf),
        len(feature_names_aggressive)
    ],
    'Preprocessing': [
        'lowercase, stopwords, min_df=5, max_df=0.8',
        'stemming + baseline',
        'TF-IDF weights',
        'character names removed, min_df=10, max_df=0.7, min_length=3'
    ]
}

comparison_df = pd.DataFrame(comparison_data)
print(comparison_df.to_string(index=False))

print("\nTOP 3 TOPICS PER CATEGORY - COMPARISON")

experiments = {
    'BASELINE': (romantic_topic_dist, scifi_topic_dist),
    'STEMMING': (romantic_topic_dist_stemmed, scifi_topic_dist_stemmed),
    'TF-IDF': (romantic_topic_dist_tfidf, scifi_topic_dist_tfidf),
    'AGGRESSIVE': (romantic_topic_dist_aggressive, scifi_topic_dist_aggressive)
}

for exp_name, (rom_dist, sci_dist) in experiments.items():
    print(f"\n{exp_name}:")
    
    top_3_rom = rom_dist.argsort()[-3:][::-1]
    print(f"  ROMANTIC Top 3: {top_3_rom} ({rom_dist[top_3_rom[0]]:.3f}, {rom_dist[top_3_rom[1]]:.3f}, {rom_dist[top_3_rom[2]]:.3f})")
    
    top_3_sci = sci_dist.argsort()[-3:][::-1]
    print(f"  SCI-FI Top 3:   {top_3_sci} ({sci_dist[top_3_sci[0]]:.3f}, {sci_dist[top_3_sci[1]]:.3f}, {sci_dist[top_3_sci[2]]:.3f})")



          Experiment  Vocabulary Size                                                Preprocessing
   BASELINE (Counts)             2547                   lowercase, stopwords, min_df=5, max_df=0.8
            STEMMING             2301                                          stemming + baseline
              TF-IDF             2547                                               TF-IDF weights
AGGRESSIVE FILTERING             1209 character names removed, min_df=10, max_df=0.7, min_length=3

TOP 3 TOPICS PER CATEGORY - COMPARISON

BASELINE:
  ROMANTIC Top 3: [7 5 1] (0.120, 0.109, 0.105)
  SCI-FI Top 3:   [4 8 1] (0.126, 0.119, 0.119)

STEMMING:
  ROMANTIC Top 3: [0 8 4] (0.137, 0.120, 0.108)
  SCI-FI Top 3:   [9 2 4] (0.124, 0.113, 0.106)

TF-IDF:
  ROMANTIC Top 3: [1 9 6] (0.123, 0.111, 0.111)
  SCI-FI Top 3:   [1 7 5] (0.122, 0.111, 0.106)

AGGRESSIVE:
  ROMANTIC Top 3: [4 7 2] (0.115, 0.111, 0.103)
  SCI-FI Top 3:   [9 7 6] (0.117, 0.114, 0.113)
