# Importing Dependences

In [1]:
!pip install gdown



In [2]:
import pandas as pd
import numpy as np 
import os
import gdown
import zipfile
import shutil

In [3]:
# replace paths here
root_path = os.getcwd()
texts_path = "\archive\screenplay_data\data\raw_texts\raw_texts"

# Google Drive shared file URL
gdrive_url = 'https://drive.google.com/uc?id=1jmSta-CD03w341lOzNjT_1qAeNDt2zL2'

  texts_path = "\archive\screenplay_data\data\raw_texts\raw_texts"


# Loading Dataset

In [4]:
def extract_zip(zip_file_path, extract_to):
    # Check if the zip file exists
    if not os.path.exists(zip_file_path):
        print(f"File {zip_file_path} does not exist.")
        return

    # Extract the zip file
    try:
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to)
            print(f"Extracted all files to {extract_to}")
    except zipfile.BadZipFile:
        print("Error: The file is not a valid zip file.")

output_path = 'archive.zip'
extract_to_folder = "c:\\archive"
folder_path = "archive"

# Download the file
gdown.download(gdrive_url, output_path, quiet=False)
print(f"File downloaded and saved as {output_path}")

extract_zip(output_path, extract_to_folder)
shutil.move(extract_to_folder, folder_path)
shutil.rmtree(extract_to_folder)
os.remove(output_path)

Downloading...
From (original): https://drive.google.com/uc?id=1jmSta-CD03w341lOzNjT_1qAeNDt2zL2
From (redirected): https://drive.google.com/uc?id=1jmSta-CD03w341lOzNjT_1qAeNDt2zL2&confirm=t&uuid=9ad155a0-ef2e-4b10-85bf-7c9047cf4649
To: c:\Users\syahr\OneDrive\Desktop\Portfolio\36118_NLP_Spring\archive.zip
100%|██████████| 689M/689M [00:44<00:00, 15.6MB/s] 


File downloaded and saved as archive.zip


FileNotFoundError: [Errno 2] No such file or directory: 'c:archive\\movie_characters\\data\\movie_character_texts\\movie_character_texts\\Beavis and Butt Head Do America_0115641\\Ranger at Old Faithful White House Press Secretary Lieutenant at Strategic Air Command_text.txt'

In [None]:
# read whole folder into a dictionary
folder_path = f'{root_path}{texts_path}'
screenplays = {}
# list all files in folder and iterate over them 
for file_name in os.listdir(folder_path):
    # get file_path by joining folder path with file_name
    file_path = os.path.join(folder_path, file_name)
    # ensure path points to an actual file
    if os.path.isfile(file_path):
        with open(file_path, 'r', encoding='latin-1') as f:
            # try:
            content = f.read()
            screenplays[file_name] = content
            # except:
            #     print(f"{file_name} could not be read.")
# ensure files were imported correctly by printing a sample of the first ten files 
i = 0
for file_name, content in screenplays.items():
    if i == 10:
        break
    else:
        print(f"Example of {file_name}:\n")
        print(content[:100])
        print("-"*50)
        i += 1

In [None]:
# import metadata csv
pd.set_option('display.max_columns', 25)
meta_df = pd.read_csv(f'{root_path}\\movie_metadata\\movie_meta_data.csv')
meta_df.head()

In [None]:
print(meta_df.columns)

Columns relevant to us are:
- title
- age restrict
- year may be of relevance in examining changes in cultural norms over time. E.g. a certain curse word might get a movie an MA rating in the 1960s but not in the 2020s. 
- budget and opening weekend may be of relevance in examining impact of classification on the movie's net.
- imdbid may be of relevance for joining other data through the imdb database. 

In [None]:
print(list(screenplays.keys())[:10])

In [None]:
# filenames are formatted as movietitle_IMDBid 
import re

filenames = list(screenplays.keys())
movie_titles = []
ids = []
for f in filenames:
    # split at _ to separate title from rest of filename
    split1 = f.split(sep="_", maxsplit=1)
    movie_title = split1[0]
    movie_titles.append(movie_title)
    # split at "." to truncate file extension
    split2 = split1[1].split(sep=".", maxsplit=1)
    id = split2[0]
    ids.append(id)
i = 0
for title, id in zip(movie_titles, ids):
    if i == 10:
        break
    else:
        print("Title:", title, " ID:", id)
        i += 1

In [None]:
# create a DataFrame from ids and text data
screenplays_df = pd.DataFrame({
    'imdbid': ids,
    'screenplay': screenplays.values()
})
screenplays_df.head()

In [None]:
print(meta_df.info())
print(screenplays_df.info())

In [None]:
# merge with metadata on imdbid
screenplays_df['imdbid'] = screenplays_df['imdbid'].astype(int)
df = meta_df.merge(screenplays_df, on='imdbid')
df.head()

In [None]:
df.columns

In [None]:
# create a lean version of the dataframe containing only columns clearly relevant to predicting age restrict classification
relevant_cols = ['imdbid', 'title', 'year', 'opening weekend', 'budget', 'age restrict', 'genres', 'screenplay']
df_lean = df[relevant_cols]
df_lean.head()

In [None]:
df_lean.info()

In [None]:
df_lean['age restrict'][:50]

In [134]:
# filter out the Australian age restrict classification 
def find_aus_classification(string):
    pattern = re.compile(r', Australia:(G|PG|M|MA|MA15\+|R), ')
    match = re.search(pattern, string)
    if match:
        return match.group(1)
    else:
        return pd.NA

In [None]:
# assess missing values in age restrict 
df_lean['age restrict'].isnull().sum()
print(df_lean.shape)

In [None]:
# drop rows with missing values for age restrict 
df_clean = df_lean.dropna(how='any', subset='age restrict')
print(df_clean.shape)
print(df_clean['age restrict'].isnull().sum())

In [None]:
aus_classifications = df_clean['age restrict'].apply(find_aus_classification)
aus_classifications

In [None]:
# create a dataset with aus classifications
df_clean.loc[:,'age restrict aus'] = aus_classifications
df_clean.head()

In [None]:
df_clean.columns

In [None]:
df_aus = df_clean.dropna(how='any', subset='age restrict aus')
df_aus.shape

In [151]:
## save as CSVs
# df_clean.to_csv('df_clean.csv')
# df_aus.to_csv('df_aus.csv')