# Importing Dependencies

In [1]:
!pip install gdown



In [2]:
import os
import re
import shutil
import tempfile
import zipfile

import gdown
import numpy as np
import pandas as pd

# Loading Dataset

In [None]:
# Presumably the folder holding the raw screenplay text files inside the extracted archive.
# NOTE(review): the leading "/" makes this an absolute path, and no visible cell
# reads `texts_path` — confirm whether the loading loop was meant to use it.
texts_path = "/archive/screenplay_data/data/raw_texts/raw_texts"

In [5]:
def extract_zip(zip_file_path, extract_to):
    """Extract every member of a zip archive into a target directory.

    Failures are reported with a printed message instead of being raised, so a
    notebook run is not interrupted. The return value lets the caller find out
    whether the extraction actually happened.

    Args:
        zip_file_path: Path of the zip archive to read.
        extract_to: Directory that receives the extracted files.

    Returns:
        bool: True if the archive was extracted, False if any error occurred.
    """
    try:
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to)
    except FileNotFoundError:
        print(f"Error: File {zip_file_path} does not exist.")
        return False
    except zipfile.BadZipFile:
        print("Error: The file is not a valid zip file.")
        return False
    except Exception as e:
        print(f"An error occurred: {e}")
        return False

    print(f"Extracted all files to {extract_to}")
    return True

def download_and_extract(gdrive_url, output_path, extract_to_folder, output_folder_path):
    """Download a zip archive from Google Drive and unpack it into a folder.

    The archive is downloaded to ``output_path``, extracted into
    ``extract_to_folder``, and that folder is then moved to
    ``output_folder_path``; the downloaded zip is deleted afterwards. Errors
    are reported with a printed message rather than raised.

    Args:
        gdrive_url: Google Drive download URL passed to ``gdown.download``.
        output_path: Local path where the downloaded zip is saved.
        extract_to_folder: Scratch directory the zip is extracted into.
        output_folder_path: Final location of the extracted data.

    Returns:
        None
    """
    # shutil.move() puts the source *inside* a destination that is already a
    # directory, so a re-run would nest the data one level deeper. Skipping
    # here also avoids downloading the archive again.
    if os.path.exists(output_folder_path):
        print(f"{output_folder_path} already exists; skipping download and extraction.")
        return

    try:
        gdown.download(gdrive_url, output_path, quiet=False)
        if not os.path.isfile(output_path):
            print(f"Error: the download did not produce {output_path}.")
            return
        print(f"File downloaded and saved as {output_path}")

        # Extract the downloaded zip file
        extract_zip(output_path, extract_to_folder)

        # extract_zip only prints on failure, so confirm something was
        # extracted before moving the folder and deleting the archive.
        if not os.path.isdir(extract_to_folder) or not os.listdir(extract_to_folder):
            print(f"Error: nothing was extracted to {extract_to_folder}; keeping {output_path}.")
            return

        # Move the extracted folder and clean up
        shutil.move(extract_to_folder, output_folder_path)
        os.remove(output_path)
        print(f"Moved extracted files to {output_folder_path} and removed {output_path}")
    except Exception as e:
        print(f"An error occurred during the download or extraction process: {e}")

# Source of the zipped dataset and the local paths used while fetching it
gdrive_url = 'https://drive.google.com/uc?id=1jmSta-CD03w341lOzNjT_1qAeNDt2zL2'
output_path = 'archive.zip'
output_folder_path = "archive"
# Scratch folder under the system temp directory instead of a hard-coded
# "c:\\..." path, so the cell also runs on other machines and operating systems
extract_to_folder = os.path.join(tempfile.gettempdir(), "temp_extract")

download_and_extract(gdrive_url, output_path, extract_to_folder, output_folder_path)

Downloading...
From (original): https://drive.google.com/uc?id=1jmSta-CD03w341lOzNjT_1qAeNDt2zL2
From (redirected): https://drive.google.com/uc?id=1jmSta-CD03w341lOzNjT_1qAeNDt2zL2&confirm=t&uuid=844d5c90-bd8d-466b-8742-334fd79bfad8
To: c:\Users\syahr\OneDrive\Desktop\Portfolio\36118_NLP_Spring\archive.zip
100%|██████████| 689M/689M [00:46<00:00, 14.7MB/s] 


File downloaded and saved as archive.zip


# Data Exploration

In [None]:
import os

# Maps each file name in the dataset folder to the full text of that file
screenplays = {}

# NOTE(review): only the direct children of `output_folder_path` are read;
# files inside sub-folders are not visited — confirm the screenplays live here.
for entry in os.listdir(output_folder_path):
    candidate = os.path.join(output_folder_path, entry)

    # Directories and other non-file entries are skipped
    if not os.path.isfile(candidate):
        continue

    try:
        with open(candidate, 'r', encoding='latin-1') as handle:
            screenplays[entry] = handle.read()
    except Exception as e:
        print(f"Error reading {entry}: {e}")

# Preview the opening 100 characters of up to ten screenplays
for file_name, content in list(screenplays.items())[:10]:
    print(f"Example of {file_name}:\n")
    print(content[:100])
    print("-" * 50)

In [None]:
# Show up to 25 columns when a DataFrame is printed
pd.set_option('display.max_columns', 25)

# The metadata CSV sits in the movie_metadata sub-folder of the dataset folder
csv_path = os.path.join(output_folder_path, 'movie_metadata', 'movie_meta_data.csv')

if not os.path.exists(csv_path):
    # Nothing to load; report the missing path instead of raising
    print(f"File {csv_path} does not exist.")
else:
    meta_df = pd.read_csv(csv_path)

    # Column names first, then a sample of rows
    print(meta_df.columns)
    print(meta_df.head())

The columns relevant to us are:
- title
- age restrict (the classification we want to predict)
- year, which may be relevant for examining changes in cultural norms over time. E.g. a certain curse word might earn a movie an MA rating in the 1960s but not in the 2020s.
- budget and opening weekend, which may be relevant for examining the impact of classification on the movie's net earnings.
- imdbid, which may be relevant for joining other data through the IMDb database.

In [None]:
# Peek at the first ten file names (iterating a dict yields its keys)
print(list(screenplays)[:10])

In [None]:
# filenames are formatted as movietitle_IMDBid 
import re

filenames = list(screenplays.keys())
movie_titles = []
ids = []
for file_name in filenames:
    # The IMDb id is the piece after the LAST "_", so split from the right;
    # splitting at the first "_" would push part of a title that itself
    # contains "_" into the id.
    title, separator, tail = file_name.rpartition("_")
    if not separator:
        raise ValueError(f"Unexpected file name (no '_' before the IMDb id): {file_name}")
    movie_titles.append(title)
    # split at "." to truncate the file extension that follows the id
    ids.append(tail.split(".", 1)[0])

# Preview the first ten (title, id) pairs; `imdb_id` avoids shadowing the builtin id()
for title, imdb_id in list(zip(movie_titles, ids))[:10]:
    print("Title:", title, " ID:", imdb_id)

In [None]:
import re

# Extract (title, id) pairs, splitting each file name once from the right so
# that a title containing "_" does not leak into the id
title_id_pairs = []
for file_name in screenplays:
    title, separator, tail = file_name.rpartition("_")
    if not separator:
        raise ValueError(f"Unexpected file name (no '_' before the IMDb id): {file_name}")
    # drop the file extension that follows the id
    title_id_pairs.append((title, tail.split(".", 1)[0]))

# zip(*...) yields nothing to unpack when there are no files, so fall back to
# two empty tuples in that case
movie_titles, ids = zip(*title_id_pairs) if title_id_pairs else ((), ())

# Print the first 10 titles and IDs
for title, imdb_id in list(zip(movie_titles, ids))[:10]:
    print(f"Title: {title}  ID: {imdb_id}")

In [None]:
# One row per screenplay: the id parsed from the file name next to the file's text
screenplays_df = pd.DataFrame(
    {'imdbid': ids, 'screenplay': list(screenplays.values())}
)
screenplays_df.head()

In [None]:
# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() would add a stray "None" line after each summary.
meta_df.info()
screenplays_df.info()

In [None]:
# The ids parsed from file names are strings; cast them to int, then join the
# metadata and the screenplays on the shared 'imdbid' column
screenplays_df['imdbid'] = screenplays_df['imdbid'].astype(int)
df = pd.merge(meta_df, screenplays_df, on='imdbid')
df.head()

In [None]:
# List the columns of the merged frame
df.columns

In [None]:
# Lean view of the merged data: only the columns that look relevant to
# predicting the age-restriction classification
relevant_cols = [
    'imdbid', 'title', 'year', 'opening weekend',
    'budget', 'age restrict', 'genres', 'screenplay',
]
df_lean = df[relevant_cols]
df_lean.head()

In [None]:
# Print dtypes and non-null counts for the lean frame
df_lean.info()

In [None]:
# Look at the first 50 raw 'age restrict' values
df_lean['age restrict'].iloc[:50]

In [134]:
# Matches an "Australia:<rating>" entry in a ", "-separated list. The entry may
# be the first or the last one in the string, so the boundaries are
# "start or ', '" before it and "', ' or end" after it.
_AUS_CLASSIFICATION_RE = re.compile(r'(?:^|, )Australia:(MA15\+|MA|PG|G|M|R)(?=, |$)')


def find_aus_classification(string):
    """Pull the Australian classification out of an 'age restrict' string.

    Args:
        string: Text containing ", "-separated "Country:Rating" entries.
            Anything that is not a str (for example a NaN) is treated as
            having no classification.

    Returns:
        The rating of the first Australian entry (one of G, PG, M, MA,
        MA15+, R), or ``pd.NA`` when there is none.
    """
    if not isinstance(string, str):
        return pd.NA
    match = _AUS_CLASSIFICATION_RE.search(string)
    if match:
        return match.group(1)
    return pd.NA

In [None]:
# assess missing values in age restrict
# (printed explicitly: only the last expression of a cell is displayed
# automatically, so the bare count would otherwise be discarded)
print(df_lean['age restrict'].isnull().sum())
print(df_lean.shape)

In [None]:
# keep only the rows that have an 'age restrict' value, then confirm that
# none of the remaining rows is missing it
df_clean = df_lean.dropna(subset=['age restrict'])
print(df_clean.shape)
print(df_clean['age restrict'].isnull().sum())

In [None]:
# Run the Australian-classification extractor over every 'age restrict' value
aus_classifications = df_clean['age restrict'].map(find_aus_classification)
aus_classifications

In [None]:
# create a dataset with aus classifications
# assign() builds a new frame (aligned on the index) instead of writing through
# .loc into a frame that was derived from another one, which avoids a possible
# SettingWithCopyWarning and keeps the cell safe to re-run.
df_clean = df_clean.assign(**{'age restrict aus': aus_classifications})
df_clean.head()

In [None]:
# List the columns of the cleaned frame
df_clean.columns

In [None]:
# Keep only the rows for which an Australian classification was found
df_aus = df_clean.dropna(subset=['age restrict aus'])
df_aus.shape

In [151]:
## save as CSVs
# df_clean.to_csv('df_clean.csv')
# df_aus.to_csv('df_aus.csv')