# Pip Installs I Made

In [None]:
!pip install numpy
!pip install matplotlib
!pip install pandas
!pip install seaborn
!pip install scikit-learn
!pip install wordcloud
!pip install transformers
!pip install textblob
!pip install spacy
!pip install imblearn
!python -m spacy download en_core_web_sm

# Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

import nltk
nltk.download('stopwords')
nltk.download('wordnet')

from collections import Counter


# Load dataset

In [None]:
# Read the tab-separated training file; it has no header row, so the column names are supplied here.
column_names = ['Title', 'Origin', 'Genre', 'Director', 'Plot']
data = pd.read_csv('datasets/train.txt', sep='\t', header=None, names=column_names)

# Preview the first rows.
print(data.head())

# Null count per column, followed by the number of rows.
null_mask = data.isnull()
print(f"{null_mask.sum()}, {len(null_mask)}")

# Number of movies per genre.
print("Genre value counts")
print(data['Genre'].value_counts())


# Stop Words

In [None]:
# adds stop words from nltk
#stop_words = stopwords.words('english')

stop_words = ["the", "to", "of", "a", 'and', 'is', 'his', 'in', 'he', 
            'that', 'her', "with", "by", "for", "him", "the", "as", "who",
            "on", "she", "but", "from", "has", "they", "an", "at", "their", "are",
            "into", "he", "out", "it", "up", "be", "was", "when", "not", "them", "which",
            "then", "after", "about", "where", "one", "have", "When", "After", "tells", "him.",
            "back", "She", "will", "while", "all", "two", "In", "had", "been", "They",
            "get", "only", "also", "before", "off", "being", "As", "goes", "takes",
            "this", "other", "take", "tries", "A", "her.", "go", "gets", "can", "man", "so",
            "over", "through", "down", "help", "new", "him,", "now", "comes", "next", "himself",
            "later", "however", "away", "there", "during", "both", "first", "again", "no", "way", "own",
            "some", "another", "more", "becomes", "make", "does", "what", "begins", "meanwhile", "just",
            "asks", "if", "because", "soon", "having", "its", "eventually", "come", "still", "between", "father",
            "finds", "house", "home", "find"# TODO check if father should be here or not
            ]

# TODO keep adding from list below 
# also, add words from the top 10 most frequent words in each genre
'''
SHOULD WE CONSIDER THESE:
('finds', 3769), ('find', 3721), ('help', 2395), ('film', 2240)], ('leave', 2411), ('leaves', 2400), ('decides', 2220)
('meets', 1932), ('arrives', 1918), ('room', 1887), ('girl', 1877), ('return', 1874), ('group', 1859), ('sees', 1857), 
('dead', 1842), ('old', 1837), ('story', 1832), ('see', 1832), ('brother', 1806), ('each', 1791), ('three', 1775), 
('body', 1760), ('falls', 1754), ('finally', 1738), ('fight', 1738), ('reveals', 1690), ('school', 1684), 
('gang', 1662), ('wants', 1655), ('head', 1618), ('local', 1612), ('attempts', 1593), 
('gives', 1588), ('against', 1583), ('work', 1581), ('same', 1565), ('discovers', 1565), ('together', 1548)]
]'''

# Data Preprocessing

In [None]:
def preprocess_text(text):
    """Normalise a plot and drop stop words.

    The text is lower-cased, common English contractions are expanded, every
    character that is not an ASCII letter is replaced by a space, and the
    remaining whitespace-separated tokens that are not in the module-level
    ``stop_words`` list are kept.

    Args:
        text: Raw plot string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    text = text.lower()
    # Applied in order: the specific forms ("what's", "can't") must run before
    # the generic "'s" / "n't" rules that would otherwise consume them.
    contractions = (
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "can not "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
    )
    for pattern, replacement in contractions:
        text = re.sub(pattern, replacement, text)
    # Whitespace characters are non-letters too, and split() ignores runs of
    # spaces, so no separate whitespace-collapsing pass is needed here.
    text = re.sub(r"[^a-zA-Z]", " ", text)
    # Set lookup instead of scanning the stop-word list for every token.
    excluded = set(stop_words)
    return ' '.join(word for word in text.split() if word not in excluded)


def clean_text(text):
    """Lower-case ``text``, delete ASCII punctuation and digits, and collapse whitespace.

    Args:
        text: Input string.

    Returns:
        The cleaned string with single spaces between words and no leading or
        trailing whitespace.
    """
    text = text.lower()
    # Character class built from the escaped punctuation and digit characters.
    text = re.sub('[%s]' % re.escape(string.punctuation + string.digits), '', text)
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def clean_text2(text):
    """Lower-case ``text``, expand common contractions and blank out non-letters.

    Unlike ``preprocess_text`` this does not remove stop words. Whitespace is
    collapsed before non-letters are replaced by spaces, so the result can
    still contain runs of several spaces.

    Args:
        text: Input string.

    Returns:
        The cleaned string with leading and trailing spaces removed.
    """
    text = text.lower()
    # Applied in order: the specific forms ("what's", "can't") must run before
    # the generic "'s" / "n't" rules that would otherwise consume them.
    contractions = (
        (r"what's", "what is "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "can not "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
    )
    for pattern, replacement in contractions:
        text = re.sub(pattern, replacement, text)
    # Raw string: '\s' in a normal string literal is an invalid escape sequence.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r"[^a-zA-Z]", " ", text)
    return text.strip(' ')

def tokenize_and_remove_stopwords(text):
    """Split ``text`` on whitespace, drop tokens listed in ``stop_words`` and re-join with spaces."""
    return ' '.join(token for token in text.split() if token not in stop_words)

def remove_stopwords(text):
    """Return the whitespace-separated tokens of ``text`` that are not in ``stop_words``, as a list."""
    kept = []
    for token in text.split():
        if token in stop_words:
            continue
        kept.append(token)
    return kept

def lemmatize_text(text):
    """Lemmatize each whitespace-separated token of ``text`` with WordNetLemmatizer and re-join with spaces."""
    wordnet = WordNetLemmatizer()
    return ' '.join(map(wordnet.lemmatize, text.split()))

# Build the cleaned plot column; preprocess_text does the cleaning, the whitespace
# tokenization and the stop-word removal in one go.
# TODO does it make sense to tokenize? And even if so, is splitting on whitespace the right way?
cleaned_plots = data['Plot'].apply(preprocess_text)
data['Clean_Plot'] = cleaned_plots

#data['Clean_Plot'] = data['Clean_Plot'].apply(tokenize_and_remove_stopwords)

# lemmatization -- TODO does it make sense to lemmatize? I think not
#data['Clean_Plot'] = data['Clean_Plot'].apply(lemmatize_text)

# Writes the Clean Plot to a file for later use

In [None]:
# Write the Clean_Plot column (without the index) to disk for later use.
clean_plot_path = 'datasets/clean_plot_with_nonalpha.txt'
data['Clean_Plot'].to_csv(clean_plot_path, index=False)

print(data.head())

# Check genre distribution

In [None]:
# Horizontal bar chart of movies per genre, most frequent genre first.
genre_order = data['Genre'].value_counts().index
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(y='Genre', data=data, order=genre_order, ax=ax)
ax.set_title('Distribution of Movie Genres')
ax.set_xlabel('Count')
ax.set_ylabel('Genre')
plt.show()

# Word Cloud for Genres

In [8]:
#from wordcloud import WordCloud
#
#def generate_wordcloud(key):
#    genres = data['Genre'].unique()
#    for genre in genres:
#        text = ' '.join(data[data['Genre'] == genre][key])
#        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)
#        plt.figure(figsize=(15, 7.5))
#        plt.imshow(wordcloud, interpolation='bilinear')
#        plt.title(f'Word Cloud for {genre} Movies in {key}')
#        plt.axis('off')
#        plt.show()
#
#generate_wordcloud('Plot')
#generate_wordcloud('Clean_Plot')

# Most Frequent Words for Genre (Not Word Cloud)

In [None]:

# def plot_top_words_by_genre(key, num_top_words=20):
#     genres = data['Genre'].unique()
#     plt.figure(figsize=(12, len(genres) * 5))

#     for idx, genre in enumerate(genres):
#         genre_data = data[data['Genre'] == genre][key]

#         vectorizer = CountVectorizer(stop_words='english')
#         genre_matrix = vectorizer.fit_transform(genre_data)

#         word_freq = pd.DataFrame(genre_matrix.toarray(), columns=vectorizer.get_feature_names_out()).sum(axis=0)
        
#         top_words = word_freq.nlargest(num_top_words)

#         plt.subplot(len(genres), 1, idx + 1)
#         plt.bar(top_words.index, top_words.values, color='skyblue')
#         plt.title(f'Top {num_top_words} Words for {genre} Genre in {key}')
#         plt.xlabel('Words')
#         plt.ylabel('Frequency')
#         plt.xticks(rotation=45)
#         #print(top_words)

#     plt.tight_layout()
#     plt.show()

#plot_top_words_by_genre('Plot')
#plot_top_words_by_genre('Clean_Plot')

# from the 50 most common words per genre, I want to get, for each genre, the ones that better differentiate it from the others

def get_top_words_by_genre(key, num_top_words=50):
    """Collect the most frequent words of every genre.

    For each distinct value of ``data['Genre']`` a fresh ``CountVectorizer``
    (using scikit-learn's built-in English stop-word list) is fitted on that
    genre's rows of ``data[key]``, and the ``num_top_words`` words with the
    largest total counts are kept.

    Args:
        key: Name of the text column of ``data`` to analyse.
        num_top_words: How many words to keep per genre.

    Returns:
        dict mapping each genre to a ``pd.Series`` of total counts indexed by
        word, largest first.
    """
    top_words_by_genre = {}

    for genre in data['Genre'].unique():
        genre_texts = data[data['Genre'] == genre][key]

        vectorizer = CountVectorizer(stop_words='english')
        genre_matrix = vectorizer.fit_transform(genre_texts)

        # Sum the sparse matrix column-wise instead of converting the whole
        # document-term matrix to a dense array just to add up its columns.
        totals = np.asarray(genre_matrix.sum(axis=0)).ravel()
        word_freq = pd.Series(totals, index=vectorizer.get_feature_names_out())

        top_words_by_genre[genre] = word_freq.nlargest(num_top_words)

    return top_words_by_genre

# I want to visualize them
def plot_top_words_by_genre(top_words_by_genre, num_top_words=20):
    """Draw one bar chart per genre with that genre's most frequent words.

    Args:
        top_words_by_genre: dict mapping genre to a ``pd.Series`` of counts
            indexed by word (as returned by ``get_top_words_by_genre``).
        num_top_words: Number of words to show per genre; also used in the
            subplot titles.
    """
    genres = data['Genre'].unique()
    plt.figure(figsize=(12, len(genres) * 5))

    for idx, genre in enumerate(genres):
        # The stored series may hold more words than requested; plot only as
        # many as the title announces.
        top_words = top_words_by_genre[genre].nlargest(num_top_words)

        plt.subplot(len(genres), 1, idx + 1)
        plt.bar(top_words.index, top_words.values, color='skyblue')
        plt.title(f'Top {num_top_words} Words for {genre} Genre')
        plt.xlabel('Words')
        plt.ylabel('Frequency')
        plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()

top_words_by_genre = get_top_words_by_genre('Clean_Plot')
plot_top_words_by_genre(top_words_by_genre)

# add unique words as a new feature for each genre
def add_unique_words(data, top_words_by_genre):
    """Add, for every genre, a column counting how many of that genre's top words a plot contains.

    For each distinct value of ``data['Genre']`` a column named
    ``'<genre>_unique_words'`` is written to ``data`` in place. Each value is
    the number of distinct tokens of ``data['Clean_Plot']`` that appear in the
    index of ``top_words_by_genre[genre]``.

    Args:
        data: DataFrame with ``'Genre'`` and ``'Clean_Plot'`` columns; modified in place.
        top_words_by_genre: dict mapping genre to a ``pd.Series`` indexed by word.

    Returns:
        None.
    """
    for genre in data['Genre'].unique():
        # Build the genre vocabulary once instead of once per row.
        genre_vocab = set(top_words_by_genre[genre].index)
        data[f'{genre}_unique_words'] = data['Clean_Plot'].apply(
            lambda plot, vocab=genre_vocab: len(vocab.intersection(plot.split()))
        )
    
# Add the per-genre top-word overlap columns to `data` (in place) and preview the result.
# how can I visualize this?
add_unique_words(data, top_words_by_genre)
print(data.head())

# Plot Length Distribution

In [None]:
def plot_length(key):
    """Show how long the texts in ``data[key]`` are for each genre.

    Stores the whitespace-token count of every row in ``data['Plot_Length']``
    (overwriting any previous values), draws one box per genre and prints the
    per-genre ``describe()`` table of those counts.
    """
    data['Plot_Length'] = [len(plot.split()) for plot in data[key]]

    fig, ax = plt.subplots(figsize=(12, 6))
    sns.boxplot(x='Genre', y='Plot_Length', data=data, ax=ax)
    ax.set_title(f'Plot Length Distribution by Genre in {key}')
    ax.set_xlabel('Genre')
    ax.set_ylabel('Number of Words in Plot')
    plt.xticks(rotation=45)
    plt.show()

    length_stats = data.groupby('Genre')['Plot_Length'].describe()
    print(length_stats)

plot_length('Clean_Plot')

# Frequency Analysis of Words

In [None]:

def freq_analysis(key):
    """Print the 100 most frequent tokens of ``data[key]`` and bar-plot the top 20.

    Tokens are obtained by joining all rows with spaces and splitting on
    whitespace.

    Args:
        key: Name of the text column of ``data`` to analyse.
    """
    # Count once; both the printed list and the plot read from the same Counter.
    word_freq = Counter(' '.join(data[key]).split())

    # print the top 100 most frequent words
    print(word_freq.most_common(100))

    # plot the top 20 most frequent words
    df_common_words = pd.DataFrame(word_freq.most_common(20), columns=['Word', 'Frequency'])

    plt.figure(figsize=(12,6))
    sns.barplot(x='Frequency', y='Word', data=df_common_words)
    plt.title(f'Top 20 Most Frequent Words in {key}')
    plt.xlabel('Frequency')
    plt.ylabel('Word')
    plt.show()

freq_analysis('Clean_Plot')


# Term Frequency Heatmap

In [None]:
def term_frequency_heatmap(key):
    """Heatmap of the mean per-document count, per genre, of the 20 most frequent terms in ``data[key]``.

    A ``CountVectorizer`` limited to 1000 features (and using the notebook's
    ``stop_words`` list) is fitted on the column; the 20 terms with the largest
    corpus-wide totals are then shown with their average count within each genre.
    """
    bow = CountVectorizer(stop_words=stop_words, max_features=1000)
    counts = bow.fit_transform(data[key])
    counts_df = pd.DataFrame(counts.toarray(), columns=bow.get_feature_names_out())

    # The 20 columns with the largest totals over the whole corpus.
    top_terms = counts_df.sum().nlargest(20).index
    # Average count of those terms among the documents of each genre.
    per_genre_means = counts_df.groupby(data['Genre']).mean()[top_terms]

    plt.figure(figsize=(12,8))
    sns.heatmap(per_genre_means, annot=True, fmt=".2f", cmap="YlGnBu")
    plt.title(f'Average Term Frequencies per Genre in {key}')
    plt.show()

term_frequency_heatmap('Clean_Plot')

# Sentiment Analysis

In [13]:
from textblob import TextBlob
from scipy.sparse import hstack
from sklearn.metrics import classification_report, confusion_matrix

def get_sentiment_polarity(text):
    """Return the polarity component of TextBlob's sentiment for ``text``."""
    return TextBlob(text).sentiment.polarity
def get_sentiment_subjectivity(text):
    """Return the subjectivity component of TextBlob's sentiment for ``text``."""
    return TextBlob(text).sentiment.subjectivity

# Analyse every plot once and read both scores from the same result, instead of
# building a TextBlob (and running the sentiment analysis) twice per row.
plot_sentiments = [TextBlob(plot).sentiment for plot in data['Plot']]
data['Sentiment_polarity'] = [sentiment.polarity for sentiment in plot_sentiments]
data['Sentiment_subjectivity'] = [sentiment.subjectivity for sentiment in plot_sentiments]

def plot_sentiment_by_genre(modifier):
    """Box-plot the ``data`` column named ``modifier`` for each genre.

    Args:
        modifier: Name of the numeric column to plot on the y axis; it is also
            used as the y-axis label.
    """
    fig, ax = plt.subplots(figsize=(10, 6))

    # One box per genre.
    sns.boxplot(x='Genre', y=modifier, data=data, ax=ax)

    ax.set_title('Sentiment Distribution by Genre')
    ax.set_xlabel('Genre')
    ax.set_ylabel(modifier)
    # Tilt the genre names so they stay readable.
    plt.xticks(rotation=45)

    plt.tight_layout()
    plt.show()

for sentiment_column in ('Sentiment_polarity', 'Sentiment_subjectivity'):
    plot_sentiment_by_genre(sentiment_column)

# Named Entity Counts (Experiment!)

In [14]:
#import spacy
#
#nlp = spacy.load('en_core_web_sm')
#
#def count_entities(text):
#    doc = nlp(text)
#    return len(doc.ents)
#
#data['Entity_Count'] = data['Plot'].apply(count_entities)

# POS Tagging (Experiment!)

In [15]:
#def pos_counts(text):
#    doc = nlp(text)
#    pos_counts = doc.count_by(spacy.attrs.POS)
#    return pos_counts
#
#data['POS_Counts'] = data['Plot'].apply(pos_counts)

# Actor Extraction (Experiment!)

In [16]:
#import spacy
#
#nlp = spacy.load("en_core_web_sm")
#
## Define a function to extract proper nouns (names) from the plot
#def extract_names(text):
#    doc = nlp(text)
#    return [ent.text for ent in doc.ents if ent.label_ in ['PERSON']]
#
## Apply the function and get unique names
#data['Extracted_Names'] = data['Plot'].apply(lambda x: extract_names(x) if pd.notnull(x) else [])
#unique_names = set([name for sublist in data['Extracted_Names'] for name in sublist])
#print(f"Number of unique names extracted from the plot: {len(unique_names)}")

# Check if these names are actors by cross-referencing with a known list of actors (if available)


# Add Genre-Specific Keywords (Experiment!)

In [17]:
#horror_keywords = []
#
#def contains_horror_keywords(text):
#    return int(any(word in text for word in horror_keywords))
#
#data['Horror_Keywords'] = data['Clean_Plot'].apply(contains_horror_keywords)
#
#print(data['Horror_Keywords'])#

# Encode Genre

In [18]:
from sklearn.preprocessing import LabelEncoder
# Map every genre name to an integer id; the fitted encoder is kept so later cells can read its classes.
encoder = LabelEncoder()
encoder.fit(data['Genre'])
data['Genre_Label'] = encoder.transform(data['Genre'])

# Quick experimental models (exploratory baselines, not tuned)

# Feature Extraction and Model Building (Multinomial Naive Bayes)

In [None]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_selection import chi2, SelectKBest
from scipy.sparse import hstack

# --- Features: TF-IDF over uni-, bi- and tri-grams of the cleaned plots ---
# NOTE(review): the vectorizer is fitted on the full dataset before the train/test split
# below, so test documents contribute to the vocabulary and IDF weights -- confirm this
# is acceptable for the reported scores.
tfidf = TfidfVectorizer(max_features=7000, min_df=20, stop_words=stop_words, ngram_range=(1, 3))
X = tfidf.fit_transform(data['Clean_Plot']).toarray()

# Integer-encode the genre names; label_encoder.classes_ is used for the report labels below.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['Genre'])
# NOTE(review): X_kbest is computed here but the split below uses X, so the chi2
# feature selection currently has no effect on the model.
chi2_selector = SelectKBest(chi2, k=5000)
X_kbest = chi2_selector.fit_transform(X, y)
# we need to apply some sampling technique because the class imbalance is affecting the results
# NOTE(review): the only use of `ros` is the commented-out fit_resample call below.
ros = RandomOverSampler(random_state=42)

# 80/20 split with a fixed seed (not stratified).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
#X_train, y_train = ros.fit_resample(X_train, y_train)

# Multinomial Naive Bayes with smoothing parameter alpha=0.1.
nb_classifier = MultinomialNB(alpha=0.1)
nb_classifier.fit(X_train, y_train)

# Evaluate on the held-out 20%.
y_pred = nb_classifier.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Confusion matrix: rows are actual genres, columns are predicted genres.
conf_mat = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8,6))
sns.heatmap(conf_mat, annot=True, fmt='d', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# BERT

In [20]:
import warnings
# Silence every Python warning from this point on.
warnings.filterwarnings("ignore")

import logging
# Root logging at INFO, but only WARNING and above for the "transformers" logger.
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [None]:
%%time

#!pip install torch
#!pip install tokenizers
# NOTE(review): the first cell of this notebook already installs transformers without a
# version pin; this line then installs 4.6.0 over it. Consider pinning in one place only.
!pip install transformers==4.6.0
#!pip install simpletransformers
#!pip install simpletransformers

In [None]:
import torch, transformers, tokenizers
torch.__version__, transformers.__version__, tokenizers.__version__

In [23]:
from sklearn.preprocessing import LabelEncoder

# Stratified 80/20 split of the raw plots for BERT.
# NOTE: this rebinds X_train / y_train, which held the TF-IDF split of the
# Naive Bayes section, to raw text and integer genre labels.
X_train, X_val, y_train, y_val = train_test_split(
    data['Plot'], data['Genre_Label'], 
    test_size=0.2, 
    random_state=42, 
    stratify=data['Genre_Label']
)
# data['Genre_Label'] is already integer-encoded by `encoder`, so no second
# LabelEncoder is fitted here. The previous re-encoding replaced label_encoder's
# genre names with integers, transformed the Naive Bayes y_test instead of y_val,
# and turned y_train into a NumPy array, which MovieGenreDataset (it calls
# labels.reset_index) cannot accept.


In [24]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Both splits are tokenized with the same settings.
tokenize_options = {'truncation': True, 'padding': True, 'max_length': 512}
train_encodings = tokenizer(X_train.tolist(), **tokenize_options)
val_encodings = tokenizer(X_val.tolist(), **tokenize_options)

In [25]:
import torch

class MovieGenreDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with one label per example.

    Args:
        encodings: Mapping from field name (e.g. ``'input_ids'``) to a sequence
            holding one entry per example.
        labels: One label per example. Any array-like is accepted (pandas
            Series with an arbitrary index, NumPy array, list); labels are
            stored positionally.
    """
    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Copy into a tensor so that the original index of a Series is
        # irrelevant and plain arrays/lists work as well.
        self.labels = torch.tensor(np.asarray(labels))
    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, including its ``'labels'`` entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        # clone() hands out an independent tensor rather than a view of self.labels.
        item['labels'] = self.labels[idx].clone()
        return item
    def __len__(self):
        """Number of examples (the number of labels)."""
        return len(self.labels)


In [None]:
from transformers import BertForSequenceClassification

# Size the classification head from the number of genres known to `encoder`.
num_genres = len(encoder.classes_)
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_genres)


In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration for the Hugging Face Trainer: evaluate and checkpoint once per
# epoch and reload the best checkpoint at the end.
# NOTE(review): metric_for_best_model='accuracy' presumably requires the Trainer to be
# given a compute_metrics function that reports an 'accuracy' value; no Trainer or
# compute_metrics is visible in this part of the notebook -- confirm.
training_args = TrainingArguments(
    output_dir='./results',          # Output directory
    num_train_epochs=3,              # Total number of training epochs
    per_device_train_batch_size=8,   # Batch size per device during training
    per_device_eval_batch_size=8,    # Batch size for evaluation
    warmup_steps=500,                # Number of warmup steps
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',            # Directory for storing logs
    evaluation_strategy='epoch',     # Evaluation strategy
    save_strategy='epoch',           # Save a checkpoint at the end of every epoch
    load_best_model_at_end=True,     # Load best model at end of training
    metric_for_best_model='accuracy',  # Metric used to pick the best checkpoint
    greater_is_better=True           # Higher values of that metric are better
)


# K-Fold Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix

# 5-fold stratified cross-validation of the Naive Bayes model on the TF-IDF features.
# NOTE(review): X was vectorized on the full dataset, so each fold's validation part
# has already influenced the vocabulary and IDF weights -- consider a Pipeline.
skf = StratifiedKFold(n_splits=5)
scores = cross_val_score(nb_classifier, X, y, cv=skf, scoring='accuracy')

print("Cross-validation scores:", scores)
print("Mean accuracy:", scores.mean())


# Take the genre names from `encoder`: `label_encoder` is re-fitted on integer labels in
# the BERT section, after which its classes_ are no longer usable as report labels.
genre_names = encoder.classes_

y_pred = nb_classifier.predict(X_test)
print("Classification Report:\n", classification_report(y_test, y_pred, target_names=genre_names))

# Confusion matrix: rows are actual genres, columns are predicted genres.
conf_mat = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8,6))
sns.heatmap(conf_mat, annot=True, fmt='d', xticklabels=genre_names, yticklabels=genre_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Check Misclassified Samples

In [None]:
# y_test / y_pred are ordered like the shuffled test split, not like `data`, so a
# position in them is NOT a row of `data`. Recover the row position of every test
# sample by repeating the Naive Bayes split (same number of rows, test_size and
# random_state) on the row positions themselves.
_, test_positions = train_test_split(np.arange(len(data)), test_size=0.2, random_state=42)
assert len(test_positions) == len(y_test), "test split does not match y_test"

misclassified_indices = np.where(y_test != y_pred)[0]
print(len(misclassified_indices))
for idx in misclassified_indices[:5]:
    print(f"Plot: {data.iloc[test_positions[idx]]['Plot']}")
    # `encoder` holds the genre names; `label_encoder` may have been re-fitted on
    # integer labels by the BERT section.
    print(f"Actual Genre: {encoder.inverse_transform([y_test[idx]])[0]}")
    print(f"Predicted Genre: {encoder.inverse_transform([y_pred[idx]])[0]}")
    print("-" * 80)


# Support Vector Classification

In [None]:
# The BERT section rebinds X_train / y_train to raw text and integer labels, so rebuild
# the TF-IDF split here (same parameters as the Naive Bayes split) instead of relying
# on whatever those names currently hold.
X_train_svc, X_test_svc, y_train_svc, y_test_svc = train_test_split(
    X, y, test_size=0.2, random_state=42
)

svc_classifier = SVC(kernel='linear')
svc_classifier.fit(X_train_svc, y_train_svc)

y_pred_svc = svc_classifier.predict(X_test_svc)
print("Accuracy:", accuracy_score(y_test_svc, y_pred_svc))
# Genre names come from `encoder`, which is never re-fitted after the "Encode Genre" cell.
print("Classification Report:\n", classification_report(y_test_svc, y_pred_svc, target_names=encoder.classes_))


# Info for GPT models (useful for io-preview)

In [None]:
# Textual summary of how many movies each genre has, most frequent genre first.
print("### Genre Distribution Analysis ###")
genre_counts = data['Genre'].value_counts()
print("The dataset contains the following number of movies per genre:\n")
for genre, count in zip(genre_counts.index, genre_counts.tolist()):
    print(f"Genre: {genre}, Count: {count}")
print("\nThis shows the overall distribution of genres in the dataset, highlighting the most and least represented genres.")

# Print info about the Most Frequent Words per Genre in Plot and Clean_Plot
def print_top_words_info(key, num_top_words=20):
    """Print the most frequent words of every genre in ``data[key]``.

    For each distinct genre a ``CountVectorizer`` (with scikit-learn's built-in
    English stop-word list) is fitted on that genre's rows and the
    ``num_top_words`` words with the largest total counts are printed.

    Args:
        key: Name of the text column of ``data`` to analyse.
        num_top_words: How many words to print per genre.
    """
    print(f"\n### Most Frequent Words Analysis for {key} ###")
    for genre in data['Genre'].unique():
        genre_texts = data[data['Genre'] == genre][key]
        vectorizer = CountVectorizer(stop_words='english')
        genre_matrix = vectorizer.fit_transform(genre_texts)
        # Sum the sparse matrix column-wise instead of converting the whole
        # document-term matrix to a dense array just to add up its columns.
        totals = np.asarray(genre_matrix.sum(axis=0)).ravel()
        word_freq = pd.Series(totals, index=vectorizer.get_feature_names_out())
        top_words = word_freq.nlargest(num_top_words)
        print(f"\nTop {num_top_words} words for {genre} genre in {key}:\n{top_words}\n")
    print(f"This analysis identifies the most common words used in each genre's movie plot descriptions.")

print_top_words_info('Clean_Plot')

# Print info about Plot Length Distribution per Genre
def print_plot_length(key):
    """Store the word count of ``data[key]`` in ``data['Plot_Length']`` and print its per-genre ``describe()`` table."""
    data['Plot_Length'] = [len(plot.split()) for plot in data[key]]
    length_stats = data.groupby('Genre')['Plot_Length'].describe()
    print(length_stats)

print_plot_length('Clean_Plot')

# Print info about the Top 100 Most Frequent Words
def print_top_100_words_info(key):
    """Print the 100 most frequent whitespace-separated tokens of ``data[key]`` with their counts."""
    print(f"\n### Top 100 Most Frequent Words Analysis for {key} ###")
    corpus_tokens = ' '.join(data[key]).split()
    common_words = Counter(corpus_tokens).most_common(100)
    print(f"The top 100 most frequent words in {key} are:\n{common_words}\n")
    print("This analysis highlights the most common words in the dataset, giving an idea of the typical vocabulary used in movie plot descriptions.")

print_top_100_words_info('Clean_Plot')

# Print the table behind the Term Frequency Heatmap per Genre
def print_term_frequency_heatmap_info(key):
    """Print, per genre, the mean per-document count of the 20 most frequent terms in ``data[key]``.

    Uses a ``CountVectorizer`` limited to 1000 features with the notebook's
    ``stop_words`` list, the same setup as ``term_frequency_heatmap``.
    """
    print(f"\n### Term Frequency Heatmap Analysis for {key} ###")
    bow = CountVectorizer(stop_words=stop_words, max_features=1000)
    counts = bow.fit_transform(data[key])
    counts_df = pd.DataFrame(counts.toarray(), columns=bow.get_feature_names_out())
    # The 20 columns with the largest totals over the whole corpus.
    top_terms = counts_df.sum().nlargest(20).index
    per_genre_means = counts_df.groupby(data['Genre']).mean()
    print(f"The average term frequencies for the top 20 words per genre in {key} are as follows:\n")
    print(per_genre_means[top_terms])
    print("The heatmap provides a visual representation of word frequency patterns across genres, which helps to understand which words are more prevalent in each genre.")

print_term_frequency_heatmap_info('Clean_Plot')