# Importing Dependences

In [21]:
!pip install gdown



In [22]:
import pandas as pd
import numpy as np 
import os
import gdown
import zipfile
import shutil
import re

In [23]:
gdrive_url = 'https://drive.google.com/uc?id=1jmSta-CD03w341lOzNjT_1qAeNDt2zL2'
output_path = 'archive.zip'
output_folder_path = "archive"
extract_to_folder = "c:\\temp_extract"

# Loading Dataset

In [24]:
def extract_zip(zip_file_path, extract_to):
    # Extract the zip file if it exists and is valid
    try:
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to)
            print(f"Extracted all files to {extract_to}")
    except FileNotFoundError:
        print(f"Error: File {zip_file_path} does not exist.")
    except zipfile.BadZipFile:
        print("Error: The file is not a valid zip file.")
    except Exception as e:
        print(f"An error occurred: {e}")

def download_and_extract(gdrive_url, output_path, extract_to_folder, output_folder_path):
    # Download the file
    try:
        gdown.download(gdrive_url, output_path, quiet=False)
        print(f"File downloaded and saved as {output_path}")
        
        # Extract the downloaded zip file
        extract_zip(output_path, extract_to_folder)

        # Move the extracted folder and clean up
        shutil.move(extract_to_folder, output_folder_path)
        os.remove(output_path)
        print(f"Moved extracted files to {output_folder_path} and removed {output_path}")
    except Exception as e:
        print(f"An error occurred during the download or extraction process: {e}")

download_and_extract(gdrive_url, output_path, extract_to_folder, output_folder_path)

Downloading...
From (original): https://drive.google.com/uc?id=1jmSta-CD03w341lOzNjT_1qAeNDt2zL2
From (redirected): https://drive.google.com/uc?id=1jmSta-CD03w341lOzNjT_1qAeNDt2zL2&confirm=t&uuid=f2b37503-8ecc-48b4-99f5-b6eaf55a34f5
To: c:\Users\syahr\OneDrive\Desktop\Portfolio\36118_NLP_Spring\archive.zip
100%|██████████| 689M/689M [01:48<00:00, 6.38MB/s] 


File downloaded and saved as archive.zip
Extracted all files to c:\temp_extract
Moved extracted files to archive and removed archive.zip


# Data Exploration

In [25]:
# Define the correct path for the screenplay data
texts_path = os.path.join(output_folder_path, "screenplay_data", "data", "raw_texts", "raw_texts")

# Initialize dictionary to store file names and contents
screenplays = {}

# List and iterate over all files in the folder
for file_name in os.listdir(texts_path):
    file_path = os.path.join(texts_path, file_name)
    
    # Ensure the path is an actual file before reading
    if os.path.isfile(file_path):
        
        try:
            # Read and store file content
            with open(file_path, 'r', encoding='latin-1') as f:
                screenplays[file_name] = f.read()
        except Exception as e:
            print(f"Error reading {file_name}: {e}")

# Print a sample of the first ten files to check
for i, (file_name, content) in enumerate(screenplays.items()):
    if i == 10:
        break
    print(f"Example of {file_name}:\n")
    print(content[:100])  # Print the first 100 characters as a sample
    print("-" * 50)

Example of 10 Cloverfield Lane_1179933.txt:

The Cellar

by
Josh Campbell & Matt Stuecken
DARKNESS

And then --

A GUNNED ENGINE --

BLURRED HEA
--------------------------------------------------
Example of 10 Things I Hate About You_0147800.txt:


                               TEN THINGS I HATE ABOUT YOU
          
                written by Ka
--------------------------------------------------
Example of 101 Days of 101 Dalmatians_0249328.txt:

107
#2
40] _DALMATIANS MARCH 17, 1995

EXT. SKY. FULL MOON

A huge, yellow moon. CAMERA TILTS DOWN 
--------------------------------------------------
Example of 12 Angry Men_0118528.txt:

PLEASE COPY AND RETURN |

âââ_ââââ_

 

TWELVE ANGRY MEN

by Reginald Rose

THE WRITIN
--------------------------------------------------
Example of 12 Monkeys_0114746.txt:


				TWELVE MONKEYS
	    
		          An original screenplay by

				David Peoples
  				     &amp;

--------------------------------------------------
Example of 12 Yea

In [26]:
# Set display option for better visibility
pd.set_option('display.max_columns', 25)

# Define the path to the metadata CSV file
csv_path = os.path.join(output_folder_path, 'movie_metadata', 'movie_meta_data.csv')

# Ensure the file exists before trying to read
if os.path.exists(csv_path):
    # Read the CSV file
    meta_df = pd.read_csv(csv_path)
    
    # Print the column names
    print(meta_df.columns)
    
    # Display the first few rows of the dataframe
    print(meta_df.head())
else:
    print(f"File {csv_path} does not exist.")

Index(['imdbid', 'title', 'akas', 'year', 'metascore', 'imdb user rating',
       'number of imdb user votes', 'awards', 'opening weekend', 'producers',
       'budget', 'script department', 'production companies', 'writers',
       'directors', 'casting directors', 'cast', 'countries', 'age restrict',
       'plot', 'plot outline', 'keywords', 'genres', 'taglines', 'synopsis'],
      dtype='object')
   imdbid                   title  \
0  120770  A Night at the Roxbury   
1  132512          At First Sight   
2  118661            The Avengers   
3  215545              Bamboozled   
4  118715        The Big Lebowski   

                                                akas  year  metascore  \
0  Une nuit au Roxbury (France), Movida en el Rox...  1998         26   
1  Sight Unseen (United States), Premier regard (...  1999         40   
2  Chapeau melon et bottes de cuir (France), Mit ...  1998         12   
3  The Very Black Show (France), It's Showtime (G...  2000         54   
4  El gr

The columns of interest include:

- Title
- Age restriction
- Year: This could be useful for analyzing shifts in cultural norms over time. For instance, certain curse words may have led to an MA rating in the 1960s but not in the 2020s.
- Budget and Opening weekend: These may be important for studying the effect of movie classification on its financial performance.
- IMDb ID: This might be relevant for linking additional data from the IMDb database.

In [27]:
# Extract movie titles and IDs in one step using list comprehension
movie_titles, ids = zip(*[(f.split("_", 1)[0], f.split("_", 1)[1].split(".", 1)[0]) for f in screenplays.keys()])

# Print the first 10 titles and IDs
for i, (title, id) in enumerate(zip(movie_titles, ids)):
    if i == 10:
        break
    print(f"Title: {title}  ID: {id}")

Title: 10 Cloverfield Lane  ID: 1179933
Title: 10 Things I Hate About You  ID: 0147800
Title: 101 Days of 101 Dalmatians  ID: 0249328
Title: 12 Angry Men  ID: 0118528
Title: 12 Monkeys  ID: 0114746
Title: 12 Years a Slave  ID: 2024544
Title: 127 Hours  ID: 1542344
Title: 13 13 13  ID: 2991516
Title: 1408  ID: 0450385
Title: 1492 Conquest of Paradise  ID: 0103594


# Dataset Preprocessing

In [28]:
# Create a DataFrame directly from screenplays, extracting ids from the keys
screenplays_df = pd.DataFrame({
    'imdbid': [os.path.splitext(f.split("_", 1)[1])[0].replace(".txt", "") for f in screenplays.keys()],
    'screenplay': screenplays.values()
})

screenplays_df.head()

Unnamed: 0,imdbid,screenplay
0,1179933,The Cellar\n\nby\nJosh Campbell & Matt Stuecke...
1,147800,\n TEN THINGS I ...
2,249328,"107\n#2\n40] _DALMATIANS MARCH 17, 1995\n\nEX..."
3,118528,PLEASE COPY AND RETURN |\n\nâââ_âââ...
4,114746,\n\t\t\t\tTWELVE MONKEYS\n\t \n\t\t ...


In [29]:
print(meta_df.info())
print(screenplays_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2858 entries, 0 to 2857
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   imdbid                     2858 non-null   int64 
 1   title                      2858 non-null   object
 2   akas                       2652 non-null   object
 3   year                       2858 non-null   int64 
 4   metascore                  2858 non-null   int64 
 5   imdb user rating           2858 non-null   int64 
 6   number of imdb user votes  2858 non-null   int64 
 7   awards                     2243 non-null   object
 8   opening weekend            1739 non-null   object
 9   producers                  2640 non-null   object
 10  budget                     1624 non-null   object
 11  script department          2220 non-null   object
 12  production companies       2682 non-null   object
 13  writers                    2696 non-null   object
 14  director

In [30]:
# Ensure 'imdbid' is of the correct type in both DataFrames before merging, if needed
if screenplays_df['imdbid'].dtype != 'int':
    screenplays_df['imdbid'] = screenplays_df['imdbid'].astype(int)

# Merge with metadata on 'imdbid'
df = meta_df.merge(screenplays_df, on='imdbid')

df.head()

Unnamed: 0,imdbid,title,akas,year,metascore,imdb user rating,number of imdb user votes,awards,opening weekend,producers,budget,script department,...,directors,casting directors,cast,countries,age restrict,plot,plot outline,keywords,genres,taglines,synopsis,screenplay
0,120770,A Night at the Roxbury,"Une nuit au Roxbury (France), Movida en el Rox...",1998,26,6,56537,,United States:,"Marie Cantin, Erin Fraser, Amy Heckerling, Ste...","$17,000,000 (estimated)",,...,John Fortenberry,Jeff Greenberg,"Will Ferrell, Chris Kattan, Raquel Gardner, Vi...",United States,"Argentina:13, Australia:M, Brazil:14, Canada:P...",Two dim-witted brothers dream of owning their ...,"The Roxbury Guys, Steve and Doug Butabi, want ...","woman-on-top, nightclub, car-accident, 1990s, ...","Comedy, Music, Romance",Score!,,\n\n\t\t\t A NIGHT AT THE ROXBURY \n\n\n\t\...
1,132512,At First Sight,"Sight Unseen (United States), Premier regard (...",1999,40,6,12922,,United States:,"Rob Cowan, Roger Paradiso, Irwin Winkler","$60,000,000 (estimated)",,...,Irwin Winkler,"Kerry Barden, Billy Hopkins, Suzanne Smith","Val Kilmer, Mira Sorvino, Kelly McGillis, Stev...",United States,"Argentina:13, Australia:M, Canada:PG::(Alberta...",A blind man has an operation to regain his sig...,First Sight is true to the title from start to...,"visual-agnosia, brother-sister-relationship, r...","Drama, Romance","Only Love Can Bring You To Your Senses., Scien...",,AT FIRST SIGHT\n\nEXT. VALLEY - DUSK \nGold li...
2,118661,The Avengers,"Chapeau melon et bottes de cuir (France), Mit ...",1998,12,3,40784,"FMCJ Award 1998, Golden Reel Award 1999, Razzi...","United States: $10,305,957, 16 Aug 1998","Susan Ekins, Jerry Weintraub","$60,000,000 (estimated)","Sharon Mansfield, Anna Worley",...,Jeremiah S. Chechik,Susie Figgis,"Ralph Fiennes, Uma Thurman, Sean Connery, Patr...",United States,"Argentina:13, Australia:PG, Brazil:10, Canada:...",Two British Agents team up to stop Sir August ...,"British Ministry Agent John Steed, under direc...","good-versus-evil, heroine, evil-man, villain, ...","Action, Adventure, Sci-Fi, Thriller","Mrs. Peel, we're needed., Extraordinary crimes...",,\n\n\t\t\t\t\tTHE AVENGERS\n\n\t\t\t\tScreenpl...
3,215545,Bamboozled,"The Very Black Show (France), It's Showtime (G...",2000,54,6,10373,"Golden Berlin Bear 2001, Black Reel 2001, Imag...",United States:,"Jon Kilik, Spike Lee, Kisha Imani Cameron","$10,000,000 (estimated)","Shari L. Carpenter, Carolyn De Sousa",...,Spike Lee,Aisha Coley,"Damon Wayans, Savion Glover, Jada Pinkett Smit...",United States,"Australia:MA, Finland:K-15, France:Tous public...",A frustrated African-American TV writer propos...,"Dark, biting satire of the television industry...","television-industry, african-american, referen...","Comedy, Drama, Music",Starring the great negroe actors,"In a New York City residence, Pierre Delacroix...",\t\t\t\tBamboozled\n\n\t\t\t\tby\n\n\t\t\t\tSp...
4,118715,The Big Lebowski,"El gran Lebowski (Spain), O Grande Lebowski (P...",1998,71,8,724388,"Honorable Mention 1998, ACCA 1998, Golden Berl...","United States: $5,533,844, 08 Mar 1998","Tim Bevan, John Cameron, Ethan Coen, Eric Fell...","$15,000,000 (estimated)",T. Kukovinski,...,"Joel Coen, Ethan Coen",John S. Lyons,"Jeff Bridges, John Goodman, Julianne Moore, St...","United States, United Kingdom","Argentina:16, Argentina:18::(cable rating), Au...","Jeff ""The Dude"" Lebowski, mistaken for a milli...","When ""the dude"" Lebowski is mistaken for a mil...","rug, nihilism, pornographer, bowling-alley, de...","Comedy, Crime, Sport",Hay quienes tratan de ganarse la vida sin move...,A tumbleweed rolls up a hillside just outside ...,\n\n\t\t\tTHE BIG LEBOWSKI\n\nWe are floating ...


In [31]:
# Create a lean version of the dataframe with only relevant columns for predicting age restrict classification
df_lean = df[['imdbid', 'title', 'year', 'opening weekend', 'budget', 'age restrict', 'genres', 'screenplay']]
df_lean.head()

Unnamed: 0,imdbid,title,year,opening weekend,budget,age restrict,genres,screenplay
0,120770,A Night at the Roxbury,1998,United States:,"$17,000,000 (estimated)","Argentina:13, Australia:M, Brazil:14, Canada:P...","Comedy, Music, Romance",\n\n\t\t\t A NIGHT AT THE ROXBURY \n\n\n\t\...
1,132512,At First Sight,1999,United States:,"$60,000,000 (estimated)","Argentina:13, Australia:M, Canada:PG::(Alberta...","Drama, Romance",AT FIRST SIGHT\n\nEXT. VALLEY - DUSK \nGold li...
2,118661,The Avengers,1998,"United States: $10,305,957, 16 Aug 1998","$60,000,000 (estimated)","Argentina:13, Australia:PG, Brazil:10, Canada:...","Action, Adventure, Sci-Fi, Thriller",\n\n\t\t\t\t\tTHE AVENGERS\n\n\t\t\t\tScreenpl...
3,215545,Bamboozled,2000,United States:,"$10,000,000 (estimated)","Australia:MA, Finland:K-15, France:Tous public...","Comedy, Drama, Music",\t\t\t\tBamboozled\n\n\t\t\t\tby\n\n\t\t\t\tSp...
4,118715,The Big Lebowski,1998,"United States: $5,533,844, 08 Mar 1998","$15,000,000 (estimated)","Argentina:16, Argentina:18::(cable rating), Au...","Comedy, Crime, Sport",\n\n\t\t\tTHE BIG LEBOWSKI\n\nWe are floating ...


In [32]:
df_lean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2853 entries, 0 to 2852
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   imdbid           2853 non-null   int64 
 1   title            2853 non-null   object
 2   year             2853 non-null   int64 
 3   opening weekend  1736 non-null   object
 4   budget           1623 non-null   object
 5   age restrict     2524 non-null   object
 6   genres           2841 non-null   object
 7   screenplay       2853 non-null   object
dtypes: int64(2), object(6)
memory usage: 178.4+ KB


In [33]:
# Assess missing values in 'age restrict' and print the shape in a single step
missing_values = df_lean['age restrict'].isnull().sum()
df_lean.shape, missing_values

((2853, 8), 329)

In [34]:
# Drop rows with missing values in 'age restrict' and check the shape in a single step
df_clean = df_lean.dropna(subset=['age restrict'])
df_clean.shape, df_clean['age restrict'].isnull().sum()

((2524, 8), 0)

In [35]:
import re
import pandas as pd

# Define the regex pattern for Australia
pattern = re.compile(r"Australia:([A-Za-z0-9-]+)")

# Function to find the Australia classification in a string
def find_aus_classification(string):
    if isinstance(string, str):  # Ensure the input is a string
        match = pattern.search(string)
        return match.group(1) if match else pd.NA  # Return the matched value or NA
    return pd.NA  # Return NA if the input is not a string

# Assuming df_clean is your DataFrame and 'age restrict' is the column of interest
aus_classifications = df_clean['age restrict'].apply(find_aus_classification)

# Output the classifications for Australia
aus_classifications

0          M
1          M
2         PG
3         MA
4         MA
        ... 
2848    MA15
2849      PG
2850    MA15
2851      PG
2852    MA15
Name: age restrict, Length: 2524, dtype: object

In [36]:
# Create a dataset with Australian classifications
df_clean['age restrict aus'] = aus_classifications
df_clean.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['age restrict aus'] = aus_classifications


Unnamed: 0,imdbid,title,year,opening weekend,budget,age restrict,genres,screenplay,age restrict aus
0,120770,A Night at the Roxbury,1998,United States:,"$17,000,000 (estimated)","Argentina:13, Australia:M, Brazil:14, Canada:P...","Comedy, Music, Romance",\n\n\t\t\t A NIGHT AT THE ROXBURY \n\n\n\t\...,M
1,132512,At First Sight,1999,United States:,"$60,000,000 (estimated)","Argentina:13, Australia:M, Canada:PG::(Alberta...","Drama, Romance",AT FIRST SIGHT\n\nEXT. VALLEY - DUSK \nGold li...,M
2,118661,The Avengers,1998,"United States: $10,305,957, 16 Aug 1998","$60,000,000 (estimated)","Argentina:13, Australia:PG, Brazil:10, Canada:...","Action, Adventure, Sci-Fi, Thriller",\n\n\t\t\t\t\tTHE AVENGERS\n\n\t\t\t\tScreenpl...,PG
3,215545,Bamboozled,2000,United States:,"$10,000,000 (estimated)","Australia:MA, Finland:K-15, France:Tous public...","Comedy, Drama, Music",\t\t\t\tBamboozled\n\n\t\t\t\tby\n\n\t\t\t\tSp...,MA
4,118715,The Big Lebowski,1998,"United States: $5,533,844, 08 Mar 1998","$15,000,000 (estimated)","Argentina:16, Argentina:18::(cable rating), Au...","Comedy, Crime, Sport",\n\n\t\t\tTHE BIG LEBOWSKI\n\nWe are floating ...,MA


In [37]:
# Drop rows with missing values in 'age restrict aus' and output the shape
df_aus = df_clean.dropna(subset=['age restrict aus'])
df_aus.shape

(2302, 9)

M (Mature) - Assigned a value of 0, this rating indicates content that is recommended for audiences aged 15 and above. It may contain moderate themes, violence, or coarse language. Viewer discretion is advised for younger audiences.

PG (Parental Guidance) - With a value of 1, this rating suggests that some content may not be suitable for children without the guidance of a parent or guardian. Themes, language, and scenes are generally mild, but certain elements may require explanation or oversight.

MA (Mature Audience) - Rated 2, this classification is for mature viewers aged 15 years and above, and may include strong themes, violence, or explicit language. Parental guidance is strongly recommended for younger viewers.

G (General) - Rated 3, this classification means the content is suitable for all audiences. There are no themes, language, or visuals that would be inappropriate for children, making it family-friendly.

R (Restricted) - Assigned a value of 4, this rating is for adults only (18+). It includes content that may feature explicit language, violence, sexual content, or other strong themes that are not suitable for younger audiences.

MA15+ (Mature Audience 15+) - With the highest rating value of 5, this classification is specifically for people aged 15 and above. Content may include strong themes, violence, and other mature subject matter. Viewer discretion is required, and it is unsuitable for younger audiences.

# Modelling

In [38]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import RandomOverSampler

# Precompile regular expressions for efficiency
newline_pattern = re.compile(r'\n+')
tab_pattern = re.compile(r'\t+')
space_pattern = re.compile(r' +')
screenplay_formatting_pattern = re.compile(r'(FADE IN:|CUT TO:|EXT\.|INT\.|SUPERIMPOSE:)', re.IGNORECASE)
meta_data_pattern = re.compile(r'(written by|&|June \d{1,2}, \d{4})', re.IGNORECASE)
all_caps_pattern = re.compile(r'[A-Z]{2,}(?:\s+[A-Z]{2,})*')

# Function to clean the text
def clean_screenplay_text(text):
    # Remove form feed characters (\x0c)
    text = text.replace('\x0c', '')
    # Replace multiple newlines with a single newline
    text = newline_pattern.sub(' ', text)
    # Replace tabs with a single space
    text = tab_pattern.sub(' ', text)
    # Replace multiple spaces with a single space
    text = space_pattern.sub(' ', text)
    # Remove screenplay formatting (e.g., FADE IN:, CUT TO:, etc.)
    text = screenplay_formatting_pattern.sub('', text)
    # Remove metadata like author names or dates
    text = meta_data_pattern.sub('', text)
    # Remove all caps, typically used for scene descriptions
    text = all_caps_pattern.sub('', text)
    # Remove special characters that are not part of standard sentences
    text = re.sub(r'[^A-Za-z0-9\'\.\?\!,\s]', '', text)
    # Standardize single quotes
    text = re.sub(r'[‘’]', "'", text)
    # Standardize double quotes
    text = re.sub(r'[“”]', '"', text)
    # Remove leading/trailing spaces
    text = text.strip()
    return text

# Assuming df_aus is a DataFrame with "screenplay" and "age restrict aus" columns
print("Dropping rows with NaN in 'screenplay' or 'age restrict aus' columns...")
df_aus = df_aus.dropna(subset=['screenplay', 'age restrict aus'])  # Drop rows with NaNs

print(f"Cleaning {len(df_aus)} screenplays...")
df_aus['cleaned_screenplay'] = df_aus['screenplay'].apply(clean_screenplay_text)

# Preprocessing: Vectorize the screenplays
print("Vectorizing the cleaned screenplays...")
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X = tfidf_vectorizer.fit_transform(df_aus['cleaned_screenplay'])
print(f"Vectorization complete. Feature matrix shape: {X.shape}")

# Map the ratings to numerical values
rating_mapping = {'G': 0, 'PG': 1, 'M': 2, 'MA': 3, 'MA15': 4, 'R': 5}
print(f"Mapping age restriction labels to numerical values...")
df_aus['mapped_rating'] = df_aus['age restrict aus'].map(rating_mapping)

# Drop any remaining NaN values in 'mapped_rating'
print("Ensuring data consistency...")
df_aus = df_aus.dropna(subset=['mapped_rating'])

# Ensure X is aligned with df_aus by slicing X with the same length as df_aus
X = X[:len(df_aus)]
y = df_aus['mapped_rating']
print(f"Data cleaned. Total records after processing: {len(df_aus)}")

# Split the dataset before oversampling
print("Splitting data into training and testing sets (80/20 split)...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Training data: {X_train.shape}, Test data: {X_test.shape}")

# Oversample the training data
print("Applying oversampling to balance the classes in the training data...")
oversampler = RandomOverSampler(random_state=42)
X_train, y_train = oversampler.fit_resample(X_train, y_train)
print(f"After oversampling, training data shape: {X_train.shape}")

# Train the classifier
print("Training Logistic Regression model...")
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
print("Model training complete.")

# Predict on the test set
print("Making predictions on the test set...")
y_pred = model.predict(X_test)

# Evaluation
print("Evaluating model performance...")
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.4f}")
print("\nClassification Report:\n", classification_report(
    y_test, y_pred, target_names=list(rating_mapping.keys()))
)

# Function to predict the rating of new screenplays
def predict_movie_rating(screenplay):
    print(f"Predicting rating for a new screenplay: '{screenplay[:30]}...'")
    screenplay_cleaned = clean_screenplay_text(screenplay)  # Clean new screenplay
    screenplay_vector = tfidf_vectorizer.transform([screenplay_cleaned])
    rating_index = model.predict(screenplay_vector)[0]
    # Retrieve the rating label from the mapping
    return {v: k for k, v in rating_mapping.items()}[rating_index]

# Example usage: Provide a new screenplay text (instead of passing a DataFrame column)
new_screenplay_text, actual_rating = df_aus[["screenplay", "age restrict aus"]].iloc[0]
predicted_rating = predict_movie_rating(new_screenplay_text)
print(f"Predicted Rating: {predicted_rating}")
print(f"Actual Rating: {actual_rating}")

Dropping rows with NaN in 'screenplay' or 'age restrict aus' columns...
Cleaning 2302 screenplays...
Vectorizing the cleaned screenplays...
Vectorization complete. Feature matrix shape: (2302, 5000)
Mapping age restriction labels to numerical values...
Ensuring data consistency...
Data cleaned. Total records after processing: 2199
Splitting data into training and testing sets (80/20 split)...
Training data: (1759, 5000), Test data: (440, 5000)
Applying oversampling to balance the classes in the training data...
After oversampling, training data shape: (4800, 5000)
Training Logistic Regression model...
Model training complete.
Making predictions on the test set...
Evaluating model performance...
Accuracy: 0.2341

Classification Report:
               precision    recall  f1-score   support

           G       0.05      0.09      0.06        22
          PG       0.16      0.18      0.17        71
           M       0.45      0.30      0.36       200
          MA       0.10      0.13    

In [39]:
# Custom accuracy function that considers predictions correct if the difference is at most 1
def custom_accuracy(y_true, y_pred):
    correct = sum(abs(y_t - y_p) <= 1 for y_t, y_p in zip(y_true, y_pred))
    return correct / len(y_true)

# Custom precision, recall, and F1-score functions for relaxed criteria
def custom_f1_score(y_true, y_pred, label):
    # For each label, compute custom precision, recall, and F1 score

    # True positives: predictions within 1 of the actual value
    tp = sum(abs(y_t - y_p) <= 1 and y_t == label for y_t, y_p in zip(y_true, y_pred))
    
    # False positives: predicted as this label but the actual value is more than 1 away
    fp = sum(abs(y_t - y_p) > 1 and y_p == label for y_t, y_p in zip(y_true, y_pred))
    
    # False negatives: actual value is this label, but the predicted value is more than 1 away
    fn = sum(abs(y_t - y_p) > 1 and y_t == label for y_t, y_p in zip(y_true, y_pred))
    
    # Compute precision, recall, and F1 score
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return precision, recall, f1

# Evaluating model performance
print("Evaluating model performance...")

# Use custom accuracy instead of default accuracy_score
accuracy = custom_accuracy(y_test, y_pred)
print(f"Custom Accuracy: {accuracy:.4f}")

# Get unique labels
unique_labels = np.unique(y_test)

# Evaluating model performance
print("Evaluating model performance...")

# Calculate and print custom F1 score for each label
for label in unique_labels:
    precision, recall, f1 = custom_f1_score(y_test, y_pred, label)
    print(f"Label: {label}")
    print(f"  Custom Precision: {precision:.4f}")
    print(f"  Custom Recall: {recall:.4f}")
    print(f"  Custom F1-Score: {f1:.4f}\n")

# Classification report for comparison (using the original metric)
print("\nClassification Report:\n", classification_report(
    y_test, y_pred, target_names=list(rating_mapping.keys()))
)

Evaluating model performance...
Custom Accuracy: 0.5068
Evaluating model performance...
Label: 0.0
  Custom Precision: 0.1190
  Custom Recall: 0.2273
  Custom F1-Score: 0.1562

Label: 1.0
  Custom Precision: 0.5692
  Custom Recall: 0.5211
  Custom F1-Score: 0.5441

Label: 2.0
  Custom Precision: 0.7333
  Custom Recall: 0.6050
  Custom F1-Score: 0.6630

Label: 3.0
  Custom Precision: 0.6250
  Custom Recall: 0.6410
  Custom F1-Score: 0.6329

Label: 4.0
  Custom Precision: 0.3043
  Custom Recall: 0.3415
  Custom F1-Score: 0.3218

Label: 5.0
  Custom Precision: 0.1944
  Custom Recall: 0.2692
  Custom F1-Score: 0.2258


Classification Report:
               precision    recall  f1-score   support

           G       0.05      0.09      0.06        22
          PG       0.16      0.18      0.17        71
           M       0.45      0.30      0.36       200
          MA       0.10      0.13      0.11        39
        MA15       0.18      0.20      0.19        82
           R       0.15     

In [40]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import RandomOverSampler
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from sklearn.metrics import classification_report
import numpy as np

# Precompile regular expressions for efficiency
newline_pattern = re.compile(r'\n+')
tab_pattern = re.compile(r'\t+')
space_pattern = re.compile(r' +')
screenplay_formatting_pattern = re.compile(r'(FADE IN:|CUT TO:|EXT\.|INT\.|SUPERIMPOSE:)', re.IGNORECASE)
meta_data_pattern = re.compile(r'(written by|&|June \d{1,2}, \d{4})', re.IGNORECASE)
all_caps_pattern = re.compile(r'[A-Z]{2,}(?:\s+[A-Z]{2,})*')

# Function to clean the text
def clean_screenplay_text(text):
    # Remove form feed characters (\x0c)
    text = text.replace('\x0c', '')
    # Replace multiple newlines with a single newline
    text = newline_pattern.sub(' ', text)
    # Replace tabs with a single space
    text = tab_pattern.sub(' ', text)
    # Replace multiple spaces with a single space
    text = space_pattern.sub(' ', text)
    # Remove screenplay formatting (e.g., FADE IN:, CUT TO:, etc.)
    text = screenplay_formatting_pattern.sub('', text)
    # Remove metadata like author names or dates
    text = meta_data_pattern.sub('', text)
    # Remove all caps, typically used for scene descriptions
    text = all_caps_pattern.sub('', text)
    # Remove special characters that are not part of standard sentences
    text = re.sub(r'[^A-Za-z0-9\'\.\?\!,\s]', '', text)
    # Standardize single quotes
    text = re.sub(r'[‘’]', "'", text)
    # Standardize double quotes
    text = re.sub(r'[“”]', '"', text)
    # Remove leading/trailing spaces
    text = text.strip()
    return text

# Preprocess screenplays and ratings
print("Cleaning screenplay texts...")
df_aus['cleaned_screenplay'] = df_aus['screenplay'].apply(clean_screenplay_text)
print(f"Sample cleaned screenplay: {df_aus['cleaned_screenplay'].iloc[0]}")

# Map ratings to numerical values
print("Mapping age restrictions to numerical values...")
rating_mapping = {'G': 0, 'PG': 1, 'M': 2, 'MA': 3, 'MA15': 4, 'R': 5}
df_aus['mapped_rating'] = df_aus['age restrict aus'].map(rating_mapping)
print(f"Sample mapped rating: {df_aus['mapped_rating'].iloc[0]}")

# Drop rows with NaNs
print("Dropping rows with missing values...")
df_aus.dropna(subset=['cleaned_screenplay', 'mapped_rating'], inplace=True)
print(f"Data after dropping NaNs: {len(df_aus)} rows")

# Split data into features (X) and labels (y)
X = df_aus['cleaned_screenplay']
y = df_aus['mapped_rating']

# Split the data into training and test sets
print("Splitting data into training and test sets...")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"Training set size: {len(X_train)}, Test set size: {len(X_test)}")

# Apply RandomOverSampler to the training set
print("Applying RandomOverSampler to balance the training set...")
oversampler = RandomOverSampler(random_state=42)
X_train_resampled, y_train_resampled = oversampler.fit_resample(X_train.values.reshape(-1, 1), y_train)

# Flatten the resampled data back to a Series
X_train_resampled = pd.Series(X_train_resampled.flatten())

# Tokenizer and model initialization
print("Loading BERT tokenizer and model...")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(rating_mapping))

# Tokenize data
print("Tokenizing data...")
def tokenize_function(texts):
    return tokenizer(texts, padding="max_length", truncation=True, max_length=128)

# Convert data to Hugging Face Dataset format
print("Converting data to Dataset format...")
train_dataset = Dataset.from_pandas(pd.DataFrame({'text': X_train_resampled, 'label': y_train_resampled}))
test_dataset = Dataset.from_pandas(pd.DataFrame({'text': X_test, 'label': y_test}))

print("Tokenizing training and test datasets...")
train_dataset = train_dataset.map(lambda e: tokenize_function(e['text']), batched=True)
test_dataset = test_dataset.map(lambda e: tokenize_function(e['text']), batched=True)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
print("Tokenization complete.")

# Training arguments
print("Setting up training arguments...")
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)
print("Training arguments set.")

# Define Trainer
print("Initializing Trainer...")
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=lambda p: {'accuracy': accuracy_score(p.label_ids, p.predictions.argmax(-1))},
)
print("Trainer initialized.")

# Train and evaluate the model
print("Training the model...")
trainer.train()
print("Training complete.")

# Evaluate the model
print("Evaluating the model...")
metrics = trainer.evaluate()
print(f"Evaluation complete. Accuracy: {metrics['eval_accuracy']:.4f}")

# Get predictions from the test dataset
predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)

# Print classification report
report = classification_report(predictions.label_ids, pred_labels, target_names=[k for k, v in rating_mapping.items()])
print("Evaluation complete. Classification Report:")
print(report)

# Function to predict rating
def predict_movie_rating(screenplay):
    screenplay_cleaned = clean_screenplay_text(screenplay)
    inputs = tokenizer(screenplay_cleaned, return_tensors="pt", truncation=True, padding=True, max_length=128)
    outputs = model(**inputs)
    predicted_class_id = outputs.logits.argmax().item()
    return {v: k for k, v in rating_mapping.items()}[predicted_class_id]

# Example usage
print("Predicting rating for a new screenplay...")
new_screenplay_text, actual_rating = df_aus[["screenplay", "age restrict aus"]].iloc[0]
predicted_rating = predict_movie_rating(new_screenplay_text)
print(f"Predicted Rating: {predicted_rating}")
print(f"Actual Rating: {actual_rating}")

Cleaning screenplay texts...
Sample cleaned screenplay: A   Steve Koren Will Ferrell  Chris Kattan       As we hear What is Love by   night falls and partytime begins.  ., 1103    Coconut Teaser, The Palace, The Roxbury, Tatou, etc.       Of random dancers  gyrating, flirting, making out, drinking.      The  a crowded dance floor  and  the rhythmically swaying backs of...  Our heroes. In their minds, Steve is tall, dark and handsome and  is a little genius. Neither is correct  except for the tall and little part. They simultaneously turn and scope the room. In unison, their heads bop to the . Doug steps out from the bar.  to O.S. female Hey! You want to dance? No? Yes? Alright, don't worry about it. Doug, rejected, steps back as Steve steps out.  to O.S. female Do you want to dance? You do? You don't? Not a problem. They are no strangers to rejection, so neither is fazed. Doug enthusiastically steps towards two attractive girls.  Hey, you wanna...? Two attractive girls turn their backs

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizing data...
Converting data to Dataset format...
Tokenizing training and test datasets...


Map:   0%|          | 0/4800 [00:00<?, ? examples/s]

Map:   0%|          | 0/440 [00:00<?, ? examples/s]

Tokenization complete.
Setting up training arguments...
Training arguments set.
Initializing Trainer...




Trainer initialized.
Training the model...


  0%|          | 0/1800 [00:00<?, ?it/s]

{'loss': 1.8159, 'grad_norm': 7.086045265197754, 'learning_rate': 4.972222222222223e-05, 'epoch': 0.02}
{'loss': 1.8148, 'grad_norm': 5.063703536987305, 'learning_rate': 4.9444444444444446e-05, 'epoch': 0.03}
{'loss': 1.7946, 'grad_norm': 4.466113090515137, 'learning_rate': 4.9166666666666665e-05, 'epoch': 0.05}
{'loss': 1.745, 'grad_norm': 7.382280349731445, 'learning_rate': 4.888888888888889e-05, 'epoch': 0.07}
{'loss': 1.7467, 'grad_norm': 9.043924331665039, 'learning_rate': 4.8611111111111115e-05, 'epoch': 0.08}
{'loss': 1.8292, 'grad_norm': 6.043708324432373, 'learning_rate': 4.8333333333333334e-05, 'epoch': 0.1}
{'loss': 1.8196, 'grad_norm': 5.254968166351318, 'learning_rate': 4.805555555555556e-05, 'epoch': 0.12}
{'loss': 1.8047, 'grad_norm': 5.91767692565918, 'learning_rate': 4.7777777777777784e-05, 'epoch': 0.13}
{'loss': 1.7547, 'grad_norm': 6.762462139129639, 'learning_rate': 4.75e-05, 'epoch': 0.15}
{'loss': 1.7604, 'grad_norm': 4.858633518218994, 'learning_rate': 4.7222222

  0%|          | 0/55 [00:00<?, ?it/s]

{'eval_loss': 1.779884934425354, 'eval_accuracy': 0.32954545454545453, 'eval_runtime': 57.9143, 'eval_samples_per_second': 7.597, 'eval_steps_per_second': 0.95, 'epoch': 1.0}
{'loss': 0.5529, 'grad_norm': 35.10824966430664, 'learning_rate': 3.3055555555555553e-05, 'epoch': 1.02}
{'loss': 0.4018, 'grad_norm': 23.738290786743164, 'learning_rate': 3.277777777777778e-05, 'epoch': 1.03}
{'loss': 0.5752, 'grad_norm': 7.043961524963379, 'learning_rate': 3.2500000000000004e-05, 'epoch': 1.05}
{'loss': 0.4186, 'grad_norm': 8.90108585357666, 'learning_rate': 3.222222222222223e-05, 'epoch': 1.07}
{'loss': 0.5393, 'grad_norm': 15.748461723327637, 'learning_rate': 3.194444444444444e-05, 'epoch': 1.08}
{'loss': 0.3127, 'grad_norm': 16.179500579833984, 'learning_rate': 3.1666666666666666e-05, 'epoch': 1.1}
{'loss': 0.5643, 'grad_norm': 5.622227191925049, 'learning_rate': 3.138888888888889e-05, 'epoch': 1.12}
{'loss': 0.6156, 'grad_norm': 10.037485122680664, 'learning_rate': 3.111111111111111e-05, 'ep

  0%|          | 0/55 [00:00<?, ?it/s]

{'eval_loss': 2.145724296569824, 'eval_accuracy': 0.46136363636363636, 'eval_runtime': 56.6162, 'eval_samples_per_second': 7.772, 'eval_steps_per_second': 0.971, 'epoch': 2.0}
{'loss': 0.1027, 'grad_norm': 0.7442823052406311, 'learning_rate': 1.638888888888889e-05, 'epoch': 2.02}
{'loss': 0.2118, 'grad_norm': 37.619224548339844, 'learning_rate': 1.6111111111111115e-05, 'epoch': 2.03}
{'loss': 0.132, 'grad_norm': 2.0709500312805176, 'learning_rate': 1.5833333333333333e-05, 'epoch': 2.05}
{'loss': 0.1408, 'grad_norm': 4.396870136260986, 'learning_rate': 1.5555555555555555e-05, 'epoch': 2.07}
{'loss': 0.2425, 'grad_norm': 12.082918167114258, 'learning_rate': 1.527777777777778e-05, 'epoch': 2.08}
{'loss': 0.104, 'grad_norm': 0.42061176896095276, 'learning_rate': 1.5e-05, 'epoch': 2.1}
{'loss': 0.1666, 'grad_norm': 0.42080405354499817, 'learning_rate': 1.4722222222222224e-05, 'epoch': 2.12}
{'loss': 0.1509, 'grad_norm': 21.34931755065918, 'learning_rate': 1.4444444444444444e-05, 'epoch': 2.

  0%|          | 0/55 [00:00<?, ?it/s]

{'eval_loss': 3.0255963802337646, 'eval_accuracy': 0.46136363636363636, 'eval_runtime': 57.104, 'eval_samples_per_second': 7.705, 'eval_steps_per_second': 0.963, 'epoch': 3.0}
{'train_runtime': 6780.3118, 'train_samples_per_second': 2.124, 'train_steps_per_second': 0.265, 'train_loss': 0.567479533818033, 'epoch': 3.0}
Training complete.
Evaluating the model...


  0%|          | 0/55 [00:00<?, ?it/s]

Evaluation complete. Accuracy: 0.4614


  0%|          | 0/55 [00:00<?, ?it/s]

Evaluation complete. Classification Report:
              precision    recall  f1-score   support

           G       0.43      0.14      0.21        22
          PG       0.44      0.28      0.34        71
           M       0.49      0.81      0.61       200
          MA       0.20      0.05      0.08        39
        MA15       0.41      0.17      0.24        82
           R       0.13      0.08      0.10        26

    accuracy                           0.46       440
   macro avg       0.35      0.25      0.26       440
weighted avg       0.42      0.46      0.40       440

Predicting rating for a new screenplay...
Predicted Rating: M
Actual Rating: M


In [41]:
# Custom accuracy function that considers predictions correct if the difference is at most 1
def custom_accuracy(y_true, y_pred):
    correct = sum(abs(y_t - y_p) <= 1 for y_t, y_p in zip(y_true, y_pred))
    return correct / len(y_true)

# Custom precision, recall, and F1-score functions for relaxed criteria
def custom_f1_score(y_true, y_pred, label):
    # True positives: predictions within 1 of the actual value
    tp = sum(abs(y_t - y_p) <= 1 and y_t == label for y_t, y_p in zip(y_true, y_pred))
    
    # False positives: predicted as this label but the actual value is more than 1 away
    fp = sum(abs(y_t - y_p) > 1 and y_p == label for y_t, y_p in zip(y_true, y_pred))
    
    # False negatives: actual value is this label, but the predicted value is more than 1 away
    fn = sum(abs(y_t - y_p) > 1 and y_t == label for y_t, y_p in zip(y_true, y_pred))
    
    # Compute precision, recall, and F1 score
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
    
    return precision, recall, f1

# Get predictions from the test dataset
predictions = trainer.predict(test_dataset)
y_pred = np.argmax(predictions.predictions, axis=1)
y_test = predictions.label_ids

# Evaluating model performance
print("Evaluating model performance...")

# Use custom accuracy instead of default accuracy_score
accuracy = custom_accuracy(y_test, y_pred)
print(f"Custom Accuracy: {accuracy:.4f}")

# Get unique labels
unique_labels = np.unique(y_test)

# Calculate and print custom F1 score for each label
for label in unique_labels:
    precision, recall, f1 = custom_f1_score(y_test, y_pred, label)
    print(f"Label: {label}")
    print(f"  Custom Precision: {precision:.4f}")
    print(f"  Custom Recall: {recall:.4f}")
    print(f"  Custom F1-Score: {f1:.4f}\n")

# Classification report for comparison (using the original metric)
print("\nClassification Report:\n", classification_report(
    y_test, y_pred, target_names=list(rating_mapping.keys()))
)

# Function to predict rating with the updated model
def predict_movie_rating(screenplay):
    screenplay_cleaned = clean_screenplay_text(screenplay)
    inputs = tokenizer(screenplay_cleaned, return_tensors="pt", truncation=True, padding=True, max_length=128)
    outputs = model(**inputs)
    predicted_class_id = outputs.logits.argmax().item()
    return {v: k for k, v in rating_mapping.items()}[predicted_class_id]

# Example usage
print("Predicting rating for a new screenplay...")
new_screenplay_text, actual_rating = df_aus[["screenplay", "age restrict aus"]].iloc[0]
predicted_rating = predict_movie_rating(new_screenplay_text)
print(f"Predicted Rating: {predicted_rating}")
print(f"Actual Rating: {actual_rating}")

  0%|          | 0/55 [00:00<?, ?it/s]

Evaluating model performance...
Custom Accuracy: 0.7045
Label: 0
  Custom Precision: 0.8182
  Custom Recall: 0.4091
  Custom F1-Score: 0.5455

Label: 1
  Custom Precision: 0.9067
  Custom Recall: 0.9577
  Custom F1-Score: 0.9315

Label: 2
  Custom Precision: 0.6582
  Custom Recall: 0.9050
  Custom F1-Score: 0.7621

Label: 3
  Custom Precision: 0.9714
  Custom Recall: 0.8718
  Custom F1-Score: 0.9189

Label: 4
  Custom Precision: 0.5172
  Custom Recall: 0.1829
  Custom F1-Score: 0.2703

Label: 5
  Custom Precision: 0.2000
  Custom Recall: 0.1154
  Custom F1-Score: 0.1463


Classification Report:
               precision    recall  f1-score   support

           G       0.43      0.14      0.21        22
          PG       0.44      0.28      0.34        71
           M       0.49      0.81      0.61       200
          MA       0.20      0.05      0.08        39
        MA15       0.41      0.17      0.24        82
           R       0.13      0.08      0.10        26

    accuracy     

In [42]:
# Save the model
output_dir = './saved_model'
print(f"Saving model to {output_dir}...")

# Save model and tokenizer
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

print("Model and tokenizer saved successfully.")

Saving model to ./saved_model...
Model and tokenizer saved successfully.


In [43]:
# Function to predict rating
def predict_movie_rating(screenplay):
    screenplay_cleaned = clean_screenplay_text(screenplay)
    inputs = tokenizer(screenplay_cleaned, return_tensors="pt", truncation=True, padding=True, max_length=128)
    outputs = model(**inputs)
    predicted_class_id = outputs.logits.argmax().item()
    return {v: k for k, v in rating_mapping.items()}[predicted_class_id]

# Example usage
print("Predicting rating for a new screenplay...")
new_screenplay_text, actual_rating = df_aus[["screenplay", "age restrict aus"]].iloc[0]
predicted_rating = predict_movie_rating(new_screenplay_text)
print(f"Predicted Rating: {predicted_rating}")
print(f"Actual Rating: {actual_rating}")

Predicting rating for a new screenplay...
Predicted Rating: M
Actual Rating: M
