# Importing Dependences

In [None]:
!pip install gdown

In [8]:
import pandas as pd
import numpy as np 
import os
import gdown
import zipfile
import shutil
import re

In [3]:
gdrive_url = 'https://drive.google.com/uc?id=1jmSta-CD03w341lOzNjT_1qAeNDt2zL2'
output_path = 'archive.zip'
output_folder_path = "archive"
extract_to_folder = "c:\\temp_extract"

# Loading Dataset

In [None]:
def extract_zip(zip_file_path, extract_to):
    # Extract the zip file if it exists and is valid
    try:
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            zip_ref.extractall(extract_to)
            print(f"Extracted all files to {extract_to}")
    except FileNotFoundError:
        print(f"Error: File {zip_file_path} does not exist.")
    except zipfile.BadZipFile:
        print("Error: The file is not a valid zip file.")
    except Exception as e:
        print(f"An error occurred: {e}")

def download_and_extract(gdrive_url, output_path, extract_to_folder, output_folder_path):
    # Download the file
    try:
        gdown.download(gdrive_url, output_path, quiet=False)
        print(f"File downloaded and saved as {output_path}")
        
        # Extract the downloaded zip file
        extract_zip(output_path, extract_to_folder)

        # Move the extracted folder and clean up
        shutil.move(extract_to_folder, output_folder_path)
        os.remove(output_path)
        print(f"Moved extracted files to {output_folder_path} and removed {output_path}")
    except Exception as e:
        print(f"An error occurred during the download or extraction process: {e}")

download_and_extract(gdrive_url, output_path, extract_to_folder, output_folder_path)

# Data Exploration

In [6]:
# Define the correct path for the screenplay data
texts_path = os.path.join(output_folder_path, "screenplay_data", "data", "raw_texts", "raw_texts")

# Initialize dictionary to store file names and contents
screenplays = {}

# List and iterate over all files in the folder
for file_name in os.listdir(texts_path):
    file_path = os.path.join(texts_path, file_name)
    
    # Ensure the path is an actual file before reading
    if os.path.isfile(file_path):
        
        try:
            # Read and store file content
            with open(file_path, 'r', encoding='latin-1') as f:
                screenplays[file_name] = f.read()
        except Exception as e:
            print(f"Error reading {file_name}: {e}")

# Print a sample of the first ten files to check
for i, (file_name, content) in enumerate(screenplays.items()):
    if i == 10:
        break
    print(f"Example of {file_name}:\n")
    print(content[:100])  # Print the first 100 characters as a sample
    print("-" * 50)

Example of 10 Cloverfield Lane_1179933.txt:

The Cellar

by
Josh Campbell & Matt Stuecken
DARKNESS

And then --

A GUNNED ENGINE --

BLURRED HEA
--------------------------------------------------
Example of 10 Things I Hate About You_0147800.txt:


                               TEN THINGS I HATE ABOUT YOU
          
                written by Ka
--------------------------------------------------
Example of 101 Days of 101 Dalmatians_0249328.txt:

107
#2
40] _DALMATIANS MARCH 17, 1995

EXT. SKY. FULL MOON

A huge, yellow moon. CAMERA TILTS DOWN 
--------------------------------------------------
Example of 12 Angry Men_0118528.txt:

PLEASE COPY AND RETURN |

âââ_ââââ_

 

TWELVE ANGRY MEN

by Reginald Rose

THE WRITIN
--------------------------------------------------
Example of 12 Monkeys_0114746.txt:


				TWELVE MONKEYS
	    
		          An original screenplay by

				David Peoples
  				     &amp;

--------------------------------------------------
Example of 12 Yea

In [7]:
# Set display option for better visibility
pd.set_option('display.max_columns', 25)

# Define the path to the metadata CSV file
csv_path = os.path.join(output_folder_path, 'movie_metadata', 'movie_meta_data.csv')

# Ensure the file exists before trying to read
if os.path.exists(csv_path):
    # Read the CSV file
    meta_df = pd.read_csv(csv_path)
    
    # Print the column names
    print(meta_df.columns)
    
    # Display the first few rows of the dataframe
    print(meta_df.head())
else:
    print(f"File {csv_path} does not exist.")

Index(['imdbid', 'title', 'akas', 'year', 'metascore', 'imdb user rating',
       'number of imdb user votes', 'awards', 'opening weekend', 'producers',
       'budget', 'script department', 'production companies', 'writers',
       'directors', 'casting directors', 'cast', 'countries', 'age restrict',
       'plot', 'plot outline', 'keywords', 'genres', 'taglines', 'synopsis'],
      dtype='object')
   imdbid                   title  \
0  120770  A Night at the Roxbury   
1  132512          At First Sight   
2  118661            The Avengers   
3  215545              Bamboozled   
4  118715        The Big Lebowski   

                                                akas  year  metascore  \
0  Une nuit au Roxbury (France), Movida en el Rox...  1998         26   
1  Sight Unseen (United States), Premier regard (...  1999         40   
2  Chapeau melon et bottes de cuir (France), Mit ...  1998         12   
3  The Very Black Show (France), It's Showtime (G...  2000         54   
4  El gr

The columns of interest include:

- Title
- Age restriction
- Year: This could be useful for analyzing shifts in cultural norms over time. For instance, certain curse words may have led to an MA rating in the 1960s but not in the 2020s.
- Budget and Opening weekend: These may be important for studying the effect of movie classification on its financial performance.
- IMDb ID: This might be relevant for linking additional data from the IMDb database.

In [10]:
# Extract movie titles and IDs in one step using list comprehension
movie_titles, ids = zip(*[(f.split("_", 1)[0], f.split("_", 1)[1].split(".", 1)[0]) for f in screenplays.keys()])

# Print the first 10 titles and IDs
for i, (title, id) in enumerate(zip(movie_titles, ids)):
    if i == 10:
        break
    print(f"Title: {title}  ID: {id}")

Title: 10 Cloverfield Lane  ID: 1179933
Title: 10 Things I Hate About You  ID: 0147800
Title: 101 Days of 101 Dalmatians  ID: 0249328
Title: 12 Angry Men  ID: 0118528
Title: 12 Monkeys  ID: 0114746
Title: 12 Years a Slave  ID: 2024544
Title: 127 Hours  ID: 1542344
Title: 13 13 13  ID: 2991516
Title: 1408  ID: 0450385
Title: 1492 Conquest of Paradise  ID: 0103594


# Dataset Preprocessing

In [18]:
# Create a DataFrame directly from screenplays, extracting ids from the keys
screenplays_df = pd.DataFrame({
    'imdbid': [os.path.splitext(f.split("_", 1)[1])[0].replace(".txt", "") for f in screenplays.keys()],
    'screenplay': screenplays.values()
})

screenplays_df.head()

Unnamed: 0,imdbid,screenplay
0,1179933,The Cellar\n\nby\nJosh Campbell & Matt Stuecke...
1,147800,\n TEN THINGS I ...
2,249328,"107\n#2\n40] _DALMATIANS MARCH 17, 1995\n\nEX..."
3,118528,PLEASE COPY AND RETURN |\n\nâââ_âââ...
4,114746,\n\t\t\t\tTWELVE MONKEYS\n\t \n\t\t ...


In [19]:
print(meta_df.info())
print(screenplays_df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2858 entries, 0 to 2857
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   imdbid                     2858 non-null   int64 
 1   title                      2858 non-null   object
 2   akas                       2652 non-null   object
 3   year                       2858 non-null   int64 
 4   metascore                  2858 non-null   int64 
 5   imdb user rating           2858 non-null   int64 
 6   number of imdb user votes  2858 non-null   int64 
 7   awards                     2243 non-null   object
 8   opening weekend            1739 non-null   object
 9   producers                  2640 non-null   object
 10  budget                     1624 non-null   object
 11  script department          2220 non-null   object
 12  production companies       2682 non-null   object
 13  writers                    2696 non-null   object
 14  director

In [22]:
# Ensure 'imdbid' is of the correct type in both DataFrames before merging, if needed
if screenplays_df['imdbid'].dtype != 'int':
    screenplays_df['imdbid'] = screenplays_df['imdbid'].astype(int)

# Merge with metadata on 'imdbid'
df = meta_df.merge(screenplays_df, on='imdbid')

df.head()

Unnamed: 0,imdbid,title,akas,year,metascore,imdb user rating,number of imdb user votes,awards,opening weekend,producers,budget,script department,...,directors,casting directors,cast,countries,age restrict,plot,plot outline,keywords,genres,taglines,synopsis,screenplay
0,120770,A Night at the Roxbury,"Une nuit au Roxbury (France), Movida en el Rox...",1998,26,6,56537,,United States:,"Marie Cantin, Erin Fraser, Amy Heckerling, Ste...","$17,000,000 (estimated)",,...,John Fortenberry,Jeff Greenberg,"Will Ferrell, Chris Kattan, Raquel Gardner, Vi...",United States,"Argentina:13, Australia:M, Brazil:14, Canada:P...",Two dim-witted brothers dream of owning their ...,"The Roxbury Guys, Steve and Doug Butabi, want ...","woman-on-top, nightclub, car-accident, 1990s, ...","Comedy, Music, Romance",Score!,,\n\n\t\t\t A NIGHT AT THE ROXBURY \n\n\n\t\...
1,132512,At First Sight,"Sight Unseen (United States), Premier regard (...",1999,40,6,12922,,United States:,"Rob Cowan, Roger Paradiso, Irwin Winkler","$60,000,000 (estimated)",,...,Irwin Winkler,"Kerry Barden, Billy Hopkins, Suzanne Smith","Val Kilmer, Mira Sorvino, Kelly McGillis, Stev...",United States,"Argentina:13, Australia:M, Canada:PG::(Alberta...",A blind man has an operation to regain his sig...,First Sight is true to the title from start to...,"visual-agnosia, brother-sister-relationship, r...","Drama, Romance","Only Love Can Bring You To Your Senses., Scien...",,AT FIRST SIGHT\n\nEXT. VALLEY - DUSK \nGold li...
2,118661,The Avengers,"Chapeau melon et bottes de cuir (France), Mit ...",1998,12,3,40784,"FMCJ Award 1998, Golden Reel Award 1999, Razzi...","United States: $10,305,957, 16 Aug 1998","Susan Ekins, Jerry Weintraub","$60,000,000 (estimated)","Sharon Mansfield, Anna Worley",...,Jeremiah S. Chechik,Susie Figgis,"Ralph Fiennes, Uma Thurman, Sean Connery, Patr...",United States,"Argentina:13, Australia:PG, Brazil:10, Canada:...",Two British Agents team up to stop Sir August ...,"British Ministry Agent John Steed, under direc...","good-versus-evil, heroine, evil-man, villain, ...","Action, Adventure, Sci-Fi, Thriller","Mrs. Peel, we're needed., Extraordinary crimes...",,\n\n\t\t\t\t\tTHE AVENGERS\n\n\t\t\t\tScreenpl...
3,215545,Bamboozled,"The Very Black Show (France), It's Showtime (G...",2000,54,6,10373,"Golden Berlin Bear 2001, Black Reel 2001, Imag...",United States:,"Jon Kilik, Spike Lee, Kisha Imani Cameron","$10,000,000 (estimated)","Shari L. Carpenter, Carolyn De Sousa",...,Spike Lee,Aisha Coley,"Damon Wayans, Savion Glover, Jada Pinkett Smit...",United States,"Australia:MA, Finland:K-15, France:Tous public...",A frustrated African-American TV writer propos...,"Dark, biting satire of the television industry...","television-industry, african-american, referen...","Comedy, Drama, Music",Starring the great negroe actors,"In a New York City residence, Pierre Delacroix...",\t\t\t\tBamboozled\n\n\t\t\t\tby\n\n\t\t\t\tSp...
4,118715,The Big Lebowski,"El gran Lebowski (Spain), O Grande Lebowski (P...",1998,71,8,724388,"Honorable Mention 1998, ACCA 1998, Golden Berl...","United States: $5,533,844, 08 Mar 1998","Tim Bevan, John Cameron, Ethan Coen, Eric Fell...","$15,000,000 (estimated)",T. Kukovinski,...,"Joel Coen, Ethan Coen",John S. Lyons,"Jeff Bridges, John Goodman, Julianne Moore, St...","United States, United Kingdom","Argentina:16, Argentina:18::(cable rating), Au...","Jeff ""The Dude"" Lebowski, mistaken for a milli...","When ""the dude"" Lebowski is mistaken for a mil...","rug, nihilism, pornographer, bowling-alley, de...","Comedy, Crime, Sport",Hay quienes tratan de ganarse la vida sin move...,A tumbleweed rolls up a hillside just outside ...,\n\n\t\t\tTHE BIG LEBOWSKI\n\nWe are floating ...


In [23]:
# Create a lean version of the dataframe with only relevant columns for predicting age restrict classification
df_lean = df[['imdbid', 'title', 'year', 'opening weekend', 'budget', 'age restrict', 'genres', 'screenplay']]
df_lean.head()

Unnamed: 0,imdbid,title,year,opening weekend,budget,age restrict,genres,screenplay
0,120770,A Night at the Roxbury,1998,United States:,"$17,000,000 (estimated)","Argentina:13, Australia:M, Brazil:14, Canada:P...","Comedy, Music, Romance",\n\n\t\t\t A NIGHT AT THE ROXBURY \n\n\n\t\...
1,132512,At First Sight,1999,United States:,"$60,000,000 (estimated)","Argentina:13, Australia:M, Canada:PG::(Alberta...","Drama, Romance",AT FIRST SIGHT\n\nEXT. VALLEY - DUSK \nGold li...
2,118661,The Avengers,1998,"United States: $10,305,957, 16 Aug 1998","$60,000,000 (estimated)","Argentina:13, Australia:PG, Brazil:10, Canada:...","Action, Adventure, Sci-Fi, Thriller",\n\n\t\t\t\t\tTHE AVENGERS\n\n\t\t\t\tScreenpl...
3,215545,Bamboozled,2000,United States:,"$10,000,000 (estimated)","Australia:MA, Finland:K-15, France:Tous public...","Comedy, Drama, Music",\t\t\t\tBamboozled\n\n\t\t\t\tby\n\n\t\t\t\tSp...
4,118715,The Big Lebowski,1998,"United States: $5,533,844, 08 Mar 1998","$15,000,000 (estimated)","Argentina:16, Argentina:18::(cable rating), Au...","Comedy, Crime, Sport",\n\n\t\t\tTHE BIG LEBOWSKI\n\nWe are floating ...


In [24]:
df_lean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2853 entries, 0 to 2852
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   imdbid           2853 non-null   int64 
 1   title            2853 non-null   object
 2   year             2853 non-null   int64 
 3   opening weekend  1736 non-null   object
 4   budget           1623 non-null   object
 5   age restrict     2524 non-null   object
 6   genres           2841 non-null   object
 7   screenplay       2853 non-null   object
dtypes: int64(2), object(6)
memory usage: 178.4+ KB


In [27]:
# Assess missing values in 'age restrict' and print the shape in a single step
missing_values = df_lean['age restrict'].isnull().sum()
df_lean.shape, missing_values

((2853, 8), 329)

In [28]:
# Drop rows with missing values in 'age restrict' and check the shape in a single step
df_clean = df_lean.dropna(subset=['age restrict'])
df_clean.shape, df_clean['age restrict'].isnull().sum()

((2524, 8), 0)

In [29]:
# Compile the pattern once outside the function to avoid repeated compilation
pattern = re.compile(r', Australia:(G|PG|M|MA|MA15\+|R), ')

def find_aus_classification(string):
    match = pattern.search(string)  # Use the precompiled pattern
    return match.group(1) if match else pd.NA

aus_classifications = df_clean['age restrict'].apply(find_aus_classification)
aus_classifications

0           M
1           M
2          PG
3        <NA>
4          MA
        ...  
2848     <NA>
2849       PG
2850    MA15+
2851       PG
2852    MA15+
Name: age restrict, Length: 2524, dtype: object

In [30]:
# Create a dataset with Australian classifications
df_clean['age restrict aus'] = aus_classifications
df_clean.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['age restrict aus'] = aus_classifications


Unnamed: 0,imdbid,title,year,opening weekend,budget,age restrict,genres,screenplay,age restrict aus
0,120770,A Night at the Roxbury,1998,United States:,"$17,000,000 (estimated)","Argentina:13, Australia:M, Brazil:14, Canada:P...","Comedy, Music, Romance",\n\n\t\t\t A NIGHT AT THE ROXBURY \n\n\n\t\...,M
1,132512,At First Sight,1999,United States:,"$60,000,000 (estimated)","Argentina:13, Australia:M, Canada:PG::(Alberta...","Drama, Romance",AT FIRST SIGHT\n\nEXT. VALLEY - DUSK \nGold li...,M
2,118661,The Avengers,1998,"United States: $10,305,957, 16 Aug 1998","$60,000,000 (estimated)","Argentina:13, Australia:PG, Brazil:10, Canada:...","Action, Adventure, Sci-Fi, Thriller",\n\n\t\t\t\t\tTHE AVENGERS\n\n\t\t\t\tScreenpl...,PG
3,215545,Bamboozled,2000,United States:,"$10,000,000 (estimated)","Australia:MA, Finland:K-15, France:Tous public...","Comedy, Drama, Music",\t\t\t\tBamboozled\n\n\t\t\t\tby\n\n\t\t\t\tSp...,
4,118715,The Big Lebowski,1998,"United States: $5,533,844, 08 Mar 1998","$15,000,000 (estimated)","Argentina:16, Argentina:18::(cable rating), Au...","Comedy, Crime, Sport",\n\n\t\t\tTHE BIG LEBOWSKI\n\nWe are floating ...,MA


In [31]:
# Drop rows with missing values in 'age restrict aus' and output the shape
df_aus = df_clean.dropna(subset=['age restrict aus'])
df_aus.shape

(1891, 9)

# Modelling