### plot_summaries.txt

In [5]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download required NLTK data
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')  # Required for newer versions of NLTK

# Initialize tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Load TXT file assuming tab-separated values
df = pd.read_csv('archive\MovieSummaries\plot_summaries.txt', sep='\t', header=None, names=['id', 'summary'])

# Preprocessing function with type and null check
def preprocess_text(text):
    if not isinstance(text, str):
        return ""
    
    # Lowercase
    text = text.lower()

    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-z\s]', '', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    return ' '.join(tokens)

# Apply preprocessing
df['clean_summary'] = df['summary'].apply(preprocess_text)

# Save or inspect the output
df.to_csv('preprocessed_plot_summaries.txt', sep='\t', index=False)
print(df.head())


  df = pd.read_csv('archive\MovieSummaries\plot_summaries.txt', sep='\t', header=None, names=['id', 'summary'])
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nadee\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\nadee\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nadee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nadee\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\nadee\AppData\Roaming\nltk_data...


                 id                                            summary  \
0  movie_ID summary                                                NaN   
1          23890098  Shlykov, a hard-working taxi driver and Lyosha...   
2          31186339  The nation of Panem consists of a wealthy Capi...   
3          20663735  Poovalli Induchoodan  is sentenced for six yea...   
4           2231378  The Lemon Drop Kid , a New York City swindler,...   

                                       clean_summary  
0                                                     
1  shlykov hardworking taxi driver lyosha saxopho...  
2  nation panem consists wealthy capitol twelve p...  
3  poovalli induchoodan sentenced six year prison...  
4  lemon drop kid new york city swindler illegall...  


### movie.metadata.tsv

In [None]:
import pandas as pd
import re
import ast
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Ensure NLTK resources are available
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Load the TSV file
df = pd.read_csv('archive\MovieSummaries\movie.metadata.tsv', sep='\t', header=None)
df.columns = ['id', 'fb_id', 'title', 'release_date', 'revenue', 'runtime', 'languages', 'countries', 'genres']

# Function to extract genre names from dictionary-like strings
def extract_dict_values(cell):
    try:
        parsed = ast.literal_eval(cell)
        return list(parsed.values())
    except (ValueError, SyntaxError):
        return []

# Apply genre extraction
df['genre_list'] = df['genres'].apply(extract_dict_values)

# Full preprocessing for the movie title
def preprocess_title(text):
    if not isinstance(text, str):
        return ''
    
    # Lowercase
    text = text.lower()

    # Remove special characters, numbers, punctuation
    text = re.sub(r'[^a-z\s]', ' ', text)

    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Tokenize
    tokens = word_tokenize(text)

    # Remove stopwords
    tokens = [t for t in tokens if t not in stop_words]

    # Lemmatize
    tokens = [lemmatizer.lemmatize(t) for t in tokens]

    return ' '.join(tokens)

# Apply to title
df['clean_title'] = df['title'].apply(preprocess_title)

# Save only useful columns
df[['id', 'clean_title', 'genre_list']].to_csv('preprocessed_movie_metadata.tsv', sep='\t', index=False)

# Preview
print(df[['id', 'clean_title', 'genre_list']].head())


  df = pd.read_csv('archive\MovieSummaries\movie.metadata.tsv', sep='\t', header=None)
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nadee\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nadee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nadee\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


         id                                clean_title  \
0    975900                                  ghost mar   
1   3196793  getting away murder jonben ramsey mystery   
2  28463795                                brun bitter   
3   9363483                                  white eye   
4    261236                                woman flame   

                                          genre_list  
0  [Thriller, Science Fiction, Horror, Adventure,...  
1   [Mystery, Biographical film, Drama, Crime Drama]  
2                             [Crime Fiction, Drama]  
3  [Thriller, Erotic thriller, Psychological thri...  
4                                            [Drama]  


### character.metadata.tsv

In [9]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Ensure NLTK is ready
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Init tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Preprocessing function
def preprocess_text(text):
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in stop_words]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)

# Load TSV file
df = pd.read_csv('archive\MovieSummaries\character.metadata.tsv', sep='\t', header=None)
df.columns = [
    'movie_id', 'fb_movie_id', 'release_date', 'character_name', 'actor_dob',
    'actor_gender', 'actor_height', 'actor_ethnicity', 'actor_name',
    'actor_age_at_release', 'fb_char_actor_map_id', 'fb_character_id', 'fb_actor_id'
]

# Preprocess relevant columns
df['clean_character_name'] = df['character_name'].apply(preprocess_text)
df['clean_actor_name'] = df['actor_name'].apply(preprocess_text)

# Save relevant columns
df[['movie_id', 'clean_character_name', 'clean_actor_name']].to_csv('preprocessed_character_metadata.tsv', sep='\t', index=False)

# Preview
print(df[['movie_id', 'clean_character_name', 'clean_actor_name']].head())


  df = pd.read_csv('archive\MovieSummaries\character.metadata.tsv', sep='\t', header=None)
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nadee\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nadee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nadee\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


   movie_id        clean_character_name    clean_actor_name
0    975900                    akooshay      wanda de jesus
1    975900  lieutenant melanie ballard  natasha henstridge
2    975900         desolation williams            ice cube
3    975900          sgt jericho butler       jason statham
4    975900             bashira kincaid         clea duvall


### tvtropes.cluster.txt

In [14]:
import pandas as pd
import json
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Ensure NLTK is ready
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Init tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Preprocessing function
def preprocess_text(text):
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in stop_words]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)

# Load and process the file line-by-line
records = []
with open('archive/MovieSummaries/tvtropes.clusters.txt', 'r', encoding='utf-8') as file:
    for line in file:
        if '\t' not in line:
            continue
        trope, data_str = line.strip().split('\t', 1)
        try:
            data = json.loads(data_str.replace("'", '"'))  # Ensure valid JSON
            record = {
                'original_trope': trope,
                'original_char': data.get('char', ''),
                'original_movie': data.get('movie', ''),
                'original_actor': data.get('actor', ''),
                'id': data.get('id', '')
            }
            # Preprocess fields
            record['clean_trope'] = preprocess_text(trope)
            record['clean_char'] = preprocess_text(data.get('char', ''))
            record['clean_movie'] = preprocess_text(data.get('movie', ''))
            record['clean_actor'] = preprocess_text(data.get('actor', ''))
            records.append(record)
        except json.JSONDecodeError:
            continue  # Skip malformed rows

# Convert to DataFrame
df = pd.DataFrame(records)

# Save to TSV
df[['id', 'clean_trope', 'clean_char', 'clean_movie', 'clean_actor']].to_csv('preprocessed_tvtropes_clusters.txt', sep='\t', index=False)

# Preview
print(df[['id', 'clean_trope', 'clean_char', 'clean_movie', 'clean_actor']].head())


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nadee\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nadee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nadee\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


           id              clean_trope                 clean_char  \
0   /m/0jy9q0  absent minded professor  professor philip brainard   
1  /m/02vchl3  absent minded professor         professor keenbean   
2   /m/0k6fkc  absent minded professor          dr reinhardt lane   
3   /m/0k6_br  absent minded professor          dr harold medford   
4   /m/0k3rhh  absent minded professor             daniel jackson   

   clean_movie      clean_actor  
0      flubber   robin williams  
1  richie rich  michael mcshane  
2       shadow     ian mckellen  
3                  edmund gwenn  
4     stargate     james spader  


### name.cluster.txt

In [13]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Ensure NLTK resources are downloaded
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize preprocessing tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Preprocessing function
def preprocess_text(text):
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in stop_words]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)

# Read and preprocess the file
processed_data = []
with open('archive/MovieSummaries/name.clusters.txt', 'r', encoding='utf-8') as f:
    for line in f:
        if '\t' not in line:
            continue
        name, id_ = line.strip().split('\t', 1)
        processed_data.append({
            'original_name': name,
            'id': id_,
            'clean_name': preprocess_text(name)
        })

# Create DataFrame
df = pd.DataFrame(processed_data)

# Save to TSV
df[['id', 'clean_name']].to_csv('preprocessed_name_clusters.txt', sep='\t', index=False)

# Preview
print(df.head())


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nadee\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nadee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nadee\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


   original_name         id     clean_name
0  Stuart Little  /m/0k3w9c  stuart little
1  Stuart Little  /m/0k3wcx  stuart little
2  Stuart Little  /m/0k3wbn  stuart little
3       John Doe  /m/0jyg35       john doe
4       John Doe  /m/0k2_zn       john doe


### metadata extraction / genre classification

In [1]:
import pandas as pd
import ast
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK components
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Setup for text cleaning
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Text preprocessing function
def preprocess_text(text):
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'[^a-z\s]', ' ', text)  # Remove special chars/numbers
    text = re.sub(r'\s+', ' ', text).strip()  # Normalize spaces
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in stop_words]
    tokens = [lemmatizer.lemmatize(t) for t in tokens]
    return ' '.join(tokens)

# Load and preprocess plot summaries
plot_df = pd.read_csv('archive/MovieSummaries/plot_summaries.txt', sep='\t', names=['wiki_id', 'summary'])
plot_df['summary'] = plot_df['summary'].apply(preprocess_text)

# Load and extract genres from movie metadata
metadata_df = pd.read_csv('archive/MovieSummaries/movie.metadata.tsv', sep='\t', header=None)
metadata_df = metadata_df[[0, 8]]  # Keep only Wikipedia ID and genres
metadata_df.columns = ['wiki_id', 'genres']

# Parse genre stringified dicts
def extract_genres(genre_dict_str):
    try:
        genre_dict = ast.literal_eval(genre_dict_str)
        if isinstance(genre_dict, dict):
            return list(genre_dict.values())  # Extract only genre names
        return []
    except:
        return []

metadata_df['genres'] = metadata_df['genres'].apply(extract_genres)

# Merge on Wikipedia Movie ID
merged_df = pd.merge(plot_df, metadata_df, on='wiki_id', how='inner')

# Remove entries with empty summaries or genres
merged_df = merged_df[(merged_df['summary'].str.strip() != '') & (merged_df['genres'].map(len) > 0)]

# Save to final TSV
merged_df.to_csv('cleaned_movie_data_for_genre_classification.tsv', sep='\t', index=False)

# Preview
print(merged_df.head())


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\nadee\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nadee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nadee\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


    wiki_id                                            summary  \
0  23890098  shlykov hard working taxi driver lyosha saxoph...   
1  31186339  nation panem consists wealthy capitol twelve p...   
2  20663735  poovalli induchoodan sentenced six year prison...   
3   2231378  lemon drop kid new york city swindler illegall...   
4    595909  seventh day adventist church pastor michael ch...   

                                              genres  
0                              [Drama, World cinema]  
1  [Action/Adventure, Science Fiction, Action, Dr...  
2                [Musical, Action, Drama, Bollywood]  
3                         [Screwball comedy, Comedy]  
4  [Crime Fiction, Drama, Docudrama, World cinema...  


### train - test split

In [1]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

# Load cleaned dataset
df = pd.read_csv('cleaned_movie_data_for_genre_classification.tsv', sep='\t')

# Convert string genre list to actual list
import ast
df['genres'] = df['genres'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])

# Binarize multi-label genres
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(df['genres'])
genre_df = pd.DataFrame(genre_matrix, columns=mlb.classes_)

# Combine with original data
df_combined = pd.concat([df[['wiki_id', 'summary']], genre_df], axis=1)

# Train-test split (stratification not directly supported for multi-label, but we shuffle)
train_df, test_df = train_test_split(df_combined, test_size=0.2, random_state=42, shuffle=True)

# Save splits
train_df.to_csv('train_genre_dataset.tsv', sep='\t', index=False)
test_df.to_csv('test_genre_dataset.tsv', sep='\t', index=False)

# Optional: Save label encoder for later inverse-transform
import pickle
with open('genre_mlb.pkl', 'wb') as f:
    pickle.dump(mlb, f)

# Preview
print("Training set size:", train_df.shape)
print("Test set size:", test_df.shape)


Training set size: (33434, 365)
Test set size: (8359, 365)
