In [11]:
import os
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk import pos_tag, ne_chunk
from nltk.stem import WordNetLemmatizer
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
from transformers import BertTokenizer, BertModel, pipeline
import torch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [12]:
# Ensure you have the required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nataliaclark/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nataliaclark/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/nataliaclark/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nataliaclark/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/nataliaclark/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     /Users/nataliaclark/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [13]:
# Function to read scripts from files and extract titles
def load_scripts_from_directory(directory_path):
    scripts = []
    titles = []
    for filename in os.listdir(directory_path):
        if filename.endswith(".txt"):
            # Construct full file path
            file_path = os.path.join(directory_path, filename)
            # Read the content of the file
            with open(file_path, 'r', encoding='utf-8') as file:
                content = file.read()
                scripts.append(content)
                # Extract title from the filename (remove "Script_" and ".txt")
                title = filename.replace("Script_", "").replace(".txt", "").replace("_", " ")
                titles.append(title)
    return titles, scripts

In [14]:
# Function to preprocess scripts with advanced techniques
def preprocess_scripts_advanced(scripts):
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    preprocessed_scripts = []
    for script in scripts:
        # Tokenize script into sentences
        sentences = sent_tokenize(script.lower())
        filtered_script = []
        for sentence in sentences:
            # Tokenize each sentence into words
            word_tokens = word_tokenize(sentence)
            # Remove stopwords and non-alphanumeric tokens, and lemmatize
            filtered_sentence = []
            for word, pos in pos_tag(word_tokens):
                if word.isalnum() and word.lower() not in stop_words:
                    if pos.startswith('NN'):  # Noun
                        filtered_sentence.append(lemmatizer.lemmatize(word, pos='n'))
                    elif pos.startswith('VB'):  # Verb
                        filtered_sentence.append(lemmatizer.lemmatize(word, pos='v'))
                    elif pos.startswith('JJ'):  # Adjective
                        filtered_sentence.append(lemmatizer.lemmatize(word, pos='a'))
                    elif pos.startswith('RB'):  # Adverb
                        filtered_sentence.append(lemmatizer.lemmatize(word, pos='r'))
                    else:
                        filtered_sentence.append(word)
            filtered_script.append(' '.join(filtered_sentence))
        preprocessed_scripts.append(' '.join(filtered_script))
    return preprocessed_scripts

In [15]:
# Path to the directory containing the script files
directory_path = "dropbox-archive/movie_scripts/"

# Load scripts and titles
titles, scripts = load_scripts_from_directory(directory_path)

# Preprocess the scripts
preprocessed_scripts = preprocess_scripts_advanced(scripts)

KeyboardInterrupt: 

In [8]:
# add comparison of the original and preprocessed script for one example 
print("Original script:")
print(scripts[0])
print("\n\nPreprocessed script:")
print(preprocessed_scripts[0])

Original script:


Preprocessed script:


In [9]:
# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [None]:
# Function to extract BERT features
def extract_features(scripts, tokenizer, model):
    features = []
    for script in scripts:
        inputs = tokenizer(script, return_tensors='pt', max_length=512, truncation=True, padding='max_length')
        with torch.no_grad():
            outputs = model(**inputs)
        features.append(outputs.last_hidden_state.mean(dim=1).squeeze().numpy())
    return features

# Extract features from preprocessed scripts
features = extract_features(preprocessed_scripts, tokenizer, model)

In [None]:
# Perform KMeans clustering
num_clusters = 10
kmeans = KMeans(n_clusters=num_clusters, random_state=0)
kmeans.fit(features)
cluster_labels = kmeans.labels_

In [None]:
# Load pre-trained sentiment analysis pipeline
sentiment_pipeline = pipeline('sentiment-analysis')

In [None]:
# Function to perform sentiment analysis on scripts
def analyze_script_sentiment(scripts):
    sentiment_scores = []
    for script in scripts:
        sentences = sent_tokenize(script)
        sentiments = [sentiment_pipeline(sentence)[0] for sentence in sentences]
        positive_scores = [s['score'] for s in sentiments if s['label'] == 'POSITIVE']
        negative_scores = [s['score'] for s in sentiments if s['label'] == 'NEGATIVE']
        overall_sentiment = sum(positive_scores) - sum(negative_scores)
        sentiment_scores.append(overall_sentiment)
    return sentiment_scores

# Analyze sentiments of the preprocessed scripts
script_sentiments = analyze_script_sentiment(preprocessed_scripts)

In [None]:
# Extracting cosine scores for each movie
def recommend_movies_with_scores(user_input, tokenizer, model, features, titles, script_sentiments):
    user_input_features = extract_features([user_input], tokenizer, model)[0]
    user_input_sentiment = analyze_script_sentiment([user_input])[0]
    # Calculate cosine similarity between user input and movie features
    similarities = cosine_similarity([user_input_features], features)
    sentiment_differences = [abs(user_input_sentiment - sentiment) for sentiment in script_sentiments]
    combined_scores = similarities[0] - np.array(sentiment_differences)
    recommended_indices = combined_scores.argsort()[-5:][::-1]
    recommended_movies = [(titles[i], combined_scores[i]) for i in recommended_indices]
    return recommended_movies

In [None]:
def input_to_recs(input):
    recommended_movies = recommend_movies_with_scores(input, tokenizer, model, features, titles, script_sentiments)
    return recommended_movies

In [None]:
# Function to plot the movies with their similarity scores 
def plot_movies_with_scores(recommended_movies):
    # Extract movie titles and similarity scores
    movie_titles = [movie[0] for movie in recommended_movies]
    similarity_scores = [movie[1] for movie in recommended_movies]

    # Create a DataFrame for plotting
    df = pd.DataFrame({'Movie': movie_titles, 'Similarity Score': similarity_scores})

    # Plot the DataFrame
    df.plot(x='Movie', y='Similarity Score', kind='barh', color='skyblue', legend=False)
    plt.xlabel('Similarity Score')
    plt.title('Recommended Movies with Similarity Scores')
    plt.gca().invert_yaxis()
    plt.show()

In [None]:
# Example user input for movie description
user_input = "A movie about a group of friends who go on a road trip."
print("Query:", user_input, "\nRecommended Movies:", input_to_recs(user_input))