# Code is split into 2 main parts
# 1. Data visualisation
# 2. NLP using gpt2-medium

# Data visualisation

In [None]:
# Basic Libraries
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt 
sb.set() 
plt.show()

In [None]:
moviesdata = pd.read_csv("tmdb_5000_movies.csv")
moviesdata.head()

In [None]:
moviesdata.shape

In [None]:
movies = moviesdata.loc[:, ["title","budget", "revenue", "production_companies", "genres"]]
# dataframe.loc[rows, columns]
movies.head()

In [None]:
# Budgets/revenues of at most 500,000 are treated as missing or unreliable.
# The `<=` test already covers the zero placeholders, so a separate `== 0`
# pass is not needed.
low_revenue = movies['revenue'] <= 500000
low_budget = movies['budget'] <= 500000
indexes_rev = movies.loc[low_revenue].index.tolist()
# Combining the two conditions lists each offending row exactly once,
# even when both of its figures are too small.
indexes_budget = movies.loc[low_budget | low_revenue].index.tolist()

# Stand-alone series with the offending rows removed
revenue = movies['revenue'].drop(indexes_budget)
budget = movies['budget'].drop(indexes_budget)
profit = revenue - budget

# Apply the same filter to the frame itself and add the derived column
movies = movies.drop(index=indexes_budget)
movies["profit"] = movies["revenue"] - movies["budget"]
movies.head(84)

In [None]:
# Pairwise correlations between profit, budget and revenue
df = pd.DataFrame({'Profit': profit, 'Budget': budget, 'Revenue': revenue})
corr = df.corr()

# Annotated heatmap on a fixed [-1, 1] colour scale
sb.heatmap(corr, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.show()

In [None]:
movies.isnull().sum()

In [None]:
movies.duplicated().sum()

In [None]:
movies.drop_duplicates(inplace=True)
movies.duplicated().sum()

In [None]:
import ast
def convert_cast(obj):
    """Return the 'name' of every entry in a stringified list of dicts.

    `obj` is the text form of a Python list of dictionaries;
    `ast.literal_eval` parses it back into real objects without executing
    arbitrary code.
    """
    return [entry['name'] for entry in ast.literal_eval(obj)]

In [None]:
def convert_company(obj):
    """Return a list holding only the first entry's 'name', or [] if none.

    `obj` is the text form of a Python list of dictionaries.
    """
    for company in ast.literal_eval(obj):
        # Only the first listed entry is kept, so stop right away
        return [company['name']]
    return []

In [None]:
movies['genres'] = movies['genres'].apply(convert_cast)
movies.head()

In [None]:
movies['production_companies'] = movies['production_companies'].apply(convert_company)
movies.head()

In [None]:
# One column per genre position: genres_1 ... genres_N, where N is the length
# of the longest genre list; shorter lists are padded with None.
genre_lists = movies['genres']
max_items = genre_lists.map(len).max()
for position in range(max_items):
    movies[f'genres_{position + 1}'] = [
        names[position] if position < len(names) else None
        for names in genre_lists
    ]

# The list column is no longer needed once it has been spread out
movies = movies.drop(columns='genres')
movies.head()

In [None]:
# Profit value below which 90% of the movies fall
quantile = movies['profit'].quantile(0.90)

# Keep only the movies whose profit is strictly above that cut-off
is_top_earner = movies['profit'].gt(quantile)
top_10_percent = movies.loc[is_top_earner]
top_10_percent

In [None]:
movies.shape

In [None]:
top_10_percent.reset_index()

In [None]:
# Stack every genres_N column into one series of genre names. The columns are
# discovered rather than hard-coded, so this keeps working when the longest
# genre list is not exactly seven entries long.
genre_columns = [name for name in top_10_percent.columns if name.startswith('genres_')]
top_10_percent_genres = top_10_percent[genre_columns].melt().dropna()['value']
top_10_percent_genres

In [None]:
# Bar chart of genre frequency among the top 10% movies, most common first
genre_order = top_10_percent_genres.value_counts().index
f, axes = plt.subplots(figsize=(17, 10))
sb.countplot(x=top_10_percent_genres, order=genre_order, ax=axes)
axes.tick_params(axis='x', rotation=45)
axes.set_title('Top 10% movies Genres')
axes.set_ylabel('Number of movies')
axes.set_xlabel('Genre')

In [None]:
top_10_percent_genres_counts = top_10_percent_genres.value_counts()
top_3_genres = top_10_percent_genres_counts.head(3)
print(top_3_genres)

In [None]:
movies = top_10_percent.sort_values(by='profit', ascending=False)
movies.head()

In [None]:
# A movie qualifies when any of its seven genre slots holds one of these genres
target_genres = ["Adventure", "Action", "Comedy"]
genre_columns = [f"genres_{slot}" for slot in range(1, 8)]
mask = movies[genre_columns].isin(target_genres).any(axis=1)

filtered_movies = movies[mask]
filtered_movies.head()

In [None]:
filtered_movies.shape

In [None]:
credits = pd.read_csv("tmdb_5000_credits.csv")
credits.head()

In [None]:
# Join the genre-filtered movies with their credits on the shared 'title' column.
# NOTE(review): the join key is the title, so a title that occurs more than
# once on either side will produce extra rows — confirm this is acceptable.
merged_df = pd.merge(filtered_movies, credits, on='title') # Merge the data based on the title

# Display the first rows of the merged dataframe
merged_df.head()

In [None]:
merged_df = merged_df.drop('movie_id', axis=1) # axis = 1 ensure the method operates along the columns, and not the rows.

merged_df.head()

In [None]:
def director(obj):
    """Return a one-element list with the first 'Director' name, or [].

    `obj` is the text form of a list of crew dictionaries, e.g.
    "job": "Director", "name": "Andrew Stanton".
    """
    for member in ast.literal_eval(obj):
        if member['job'] != 'Director':
            continue
        # Only the first director listed is reported
        return [member['name']]
    return []

In [None]:
merged_df['crew'] = merged_df['crew'].apply(director) # Change it into a list
merged_df.head()

In [None]:
def convert_cast(obj):
    """Return the 'name' of at most the first three entries.

    `obj` is the text form of a Python list of dictionaries. Entries after
    the third are never inspected.
    """
    # zip() stops as soon as range(3) is exhausted, which caps the result at 3
    return [member['name'] for _, member in zip(range(3), ast.literal_eval(obj))]

In [None]:
merged_df['cast'] = merged_df['cast'].apply(convert_cast)
merged_df.head()

In [None]:
# Take the first name of each 'crew' list as the 'director' column;
# rows whose list is empty get None.
merged_df['director'] = merged_df['crew'].apply(lambda names: names[0] if names else None)

# Drop the original 'crew' column
merged_df = merged_df.drop(columns='crew')
merged_df.head()

In [None]:
# Per-genre tallies: director name -> number of movies in that genre
action_counts = {}
adventure_counts = {}
comedy_counts = {}
counts_by_genre = {
    "Action": action_counts,
    "Adventure": adventure_counts,
    "Comedy": comedy_counts,
}

# The loop variable is `director_name` (not `director`) so that the
# director() helper function is not overwritten by a string.
for _, row in merged_df.iterrows():
    director_name = row["director"]
    if pd.isna(director_name):
        # No director recorded for this movie; nothing to attribute
        continue
    for i in range(1, 8):
        tally = counts_by_genre.get(row[f"genres_{i}"])
        if tally is not None:
            tally[director_name] = tally.get(director_name, 0) + 1

# Three most frequent directors per genre, highest count first
top_action_directors = sorted(action_counts, key=action_counts.get, reverse=True)[:3]
top_adventure_directors = sorted(adventure_counts, key=adventure_counts.get, reverse=True)[:3]
top_comedy_directors = sorted(comedy_counts, key=comedy_counts.get, reverse=True)[:3]

# Print out the top 3 directors for each genre
rankings = (
    ("Action", top_action_directors, action_counts),
    ("Adventure", top_adventure_directors, adventure_counts),
    ("Comedy", top_comedy_directors, comedy_counts),
)
for position, (genre, names, tally) in enumerate(rankings):
    # Blank line between sections, but not before the first one
    prefix = "" if position == 0 else "\n"
    print(f"{prefix}Top 3 Directors with the Most {genre} Movies:")
    for rank, name in enumerate(names, start=1):
        print(f"{rank}. {name} ({tally[name]} {genre} movies)")

In [None]:
# Flatten the three top-3 rankings into one table: a row per (director, genre).
# The comprehension keeps its loop names local, so the director() helper
# function is not overwritten.
rankings = (
    ('Action', top_action_directors, action_counts),
    ('Adventure', top_adventure_directors, adventure_counts),
    ('Comedy', top_comedy_directors, comedy_counts),
)
counts = [
    {'Director': name, 'Genre': genre, 'Count': tally[name]}
    for genre, names, tally in rankings
    for name in names
]
# Explicit columns keep the frame usable even when no ranking has entries
counts_df = pd.DataFrame(counts, columns=['Director', 'Genre', 'Count'])

counts_df

In [None]:
# One bar chart per genre, side by side on a shared y-axis
f, axes = plt.subplots(1, 3, figsize=(18, 6), sharey = True)

panels = (('Action', 'blue'), ('Adventure', 'green'), ('Comedy', 'red'))
for ax, (genre, colour) in zip(axes, panels):
    genre_df = counts_df[counts_df['Genre'] == genre]
    sb.barplot(x="Director", y="Count", data=genre_df, ax=ax, color=colour)
    ax.set_xlabel('Director')
    ax.set_ylabel('Movies')
    ax.set_title(f'Count of {genre} Movies For Director')

plt.suptitle("Top 3 Directors with the Most Movies in Each Genre")

In [None]:
# Spread the cast names over cast_1 ... cast_3, padding with None where a
# movie lists fewer than three names.
cast_lists = merged_df['cast']
for position in range(3):
    merged_df[f'cast_{position + 1}'] = [
        names[position] if position < len(names) else None
        for names in cast_lists
    ]

# Drop the original 'cast' column
merged_df = merged_df.drop(columns='cast')
merged_df.head()

In [None]:
# Per-genre tallies for the "cast_1" column: actor name -> number of movies
action_counts = {}
adventure_counts = {}
comedy_counts = {}
counts_by_genre = {
    "Action": action_counts,
    "Adventure": adventure_counts,
    "Comedy": comedy_counts,
}

for _, row in merged_df.iterrows():
    actor = row["cast_1"]
    # A missing cast slot is None rather than "", so test for both
    if pd.isna(actor) or actor == "":
        continue
    for i in range(1, 8):
        tally = counts_by_genre.get(row[f"genres_{i}"])
        if tally is not None:
            tally[actor] = tally.get(actor, 0) + 1

# Three most frequent actors per genre, highest count first
top_action_actors = sorted(action_counts, key=action_counts.get, reverse=True)[:3]
top_adventure_actors = sorted(adventure_counts, key=adventure_counts.get, reverse=True)[:3]
top_comedy_actors = sorted(comedy_counts, key=comedy_counts.get, reverse=True)[:3]

# Print out the top 3 actors for each genre
rankings = (
    ("Action", top_action_actors, action_counts),
    ("Adventure", top_adventure_actors, adventure_counts),
    ("Comedy", top_comedy_actors, comedy_counts),
)
for position, (genre, names, tally) in enumerate(rankings):
    # Blank line between sections, but not before the first one
    prefix = "" if position == 0 else "\n"
    print(f"{prefix}Top 3 Actors in the Most {genre} Movies:")
    for rank, name in enumerate(names, start=1):
        print(f"{rank}. {name} ({tally[name]} {genre} movies)")

In [None]:
# Flatten the three top-3 rankings into one table: a row per (actor, genre)
rankings = (
    ('Action', top_action_actors, action_counts),
    ('Adventure', top_adventure_actors, adventure_counts),
    ('Comedy', top_comedy_actors, comedy_counts),
)
counts = [
    {'Actor': name, 'Genre': genre, 'Count': tally[name]}
    for genre, names, tally in rankings
    for name in names
]
counts_df = pd.DataFrame(counts)

counts_df

In [None]:
# One bar chart per genre, side by side on a shared y-axis
f, axes = plt.subplots(1, 3, figsize=(18, 6), sharey = True)

panels = (('Action', 'blue'), ('Adventure', 'green'), ('Comedy', 'red'))
for ax, (genre, colour) in zip(axes, panels):
    genre_df = counts_df[counts_df['Genre'] == genre]
    sb.barplot(x="Actor", y="Count", data=genre_df, ax=ax, color=colour)
    ax.set_xlabel('Actor')
    ax.set_ylabel('Movies')
    ax.set_title(f'Count of {genre} Movies For Top 3 Actors 1')

plt.suptitle("Top 3 Actor 1 by Genre")

In [None]:
# Per-genre tallies for the "cast_2" column: actor name -> number of movies
action_counts = {}
adventure_counts = {}
comedy_counts = {}
counts_by_genre = {
    "Action": action_counts,
    "Adventure": adventure_counts,
    "Comedy": comedy_counts,
}

for _, row in merged_df.iterrows():
    actor = row["cast_2"]
    # A missing cast slot is None rather than "", so test for both
    if pd.isna(actor) or actor == "":
        continue
    for i in range(1, 8):
        tally = counts_by_genre.get(row[f"genres_{i}"])
        if tally is not None:
            tally[actor] = tally.get(actor, 0) + 1

# Three most frequent actors per genre, highest count first
top_action_actors = sorted(action_counts, key=action_counts.get, reverse=True)[:3]
top_adventure_actors = sorted(adventure_counts, key=adventure_counts.get, reverse=True)[:3]
top_comedy_actors = sorted(comedy_counts, key=comedy_counts.get, reverse=True)[:3]

# Print out the top 3 actors for each genre
rankings = (
    ("Action", top_action_actors, action_counts),
    ("Adventure", top_adventure_actors, adventure_counts),
    ("Comedy", top_comedy_actors, comedy_counts),
)
for position, (genre, names, tally) in enumerate(rankings):
    # Blank line between sections, but not before the first one
    prefix = "" if position == 0 else "\n"
    print(f"{prefix}Top 3 Actors 2 in the Most {genre} Movies:")
    for rank, name in enumerate(names, start=1):
        print(f"{rank}. {name} ({tally[name]} {genre} movies)")

In [None]:
# Flatten the three top-3 rankings into one table: a row per (actor, genre)
rankings = (
    ('Action', top_action_actors, action_counts),
    ('Adventure', top_adventure_actors, adventure_counts),
    ('Comedy', top_comedy_actors, comedy_counts),
)
counts = [
    {'Actor': name, 'Genre': genre, 'Count': tally[name]}
    for genre, names, tally in rankings
    for name in names
]
counts_df = pd.DataFrame(counts)

counts_df

In [None]:
# One bar chart per genre, side by side on a shared y-axis
f, axes = plt.subplots(1, 3, figsize=(18, 6),sharey = True)

panels = (('Action', 'blue'), ('Adventure', 'green'), ('Comedy', 'red'))
for ax, (genre, colour) in zip(axes, panels):
    genre_df = counts_df[counts_df['Genre'] == genre]
    sb.barplot(x="Actor", y="Count", data=genre_df, ax=ax, color=colour)
    ax.set_xlabel('Actor')
    ax.set_ylabel('Movies')
    ax.set_title(f'Count Of {genre} Movies For Top 3 Actors 2')

plt.suptitle("Top 3 Actor 2 by Genre")

In [None]:
# Per-genre tallies for the "cast_3" column: actor name -> number of movies
action_counts = {}
adventure_counts = {}
comedy_counts = {}
counts_by_genre = {
    "Action": action_counts,
    "Adventure": adventure_counts,
    "Comedy": comedy_counts,
}

for _, row in merged_df.iterrows():
    actor = row["cast_3"]
    # A missing cast slot is None rather than "", so test for both
    if pd.isna(actor) or actor == "":
        continue
    for i in range(1, 8):
        tally = counts_by_genre.get(row[f"genres_{i}"])
        if tally is not None:
            tally[actor] = tally.get(actor, 0) + 1

# Three most frequent actors per genre, highest count first
top_action_actors = sorted(action_counts, key=action_counts.get, reverse=True)[:3]
top_adventure_actors = sorted(adventure_counts, key=adventure_counts.get, reverse=True)[:3]
top_comedy_actors = sorted(comedy_counts, key=comedy_counts.get, reverse=True)[:3]

# Print out the top 3 actors for each genre
rankings = (
    ("Action", top_action_actors, action_counts),
    ("Adventure", top_adventure_actors, adventure_counts),
    ("Comedy", top_comedy_actors, comedy_counts),
)
for position, (genre, names, tally) in enumerate(rankings):
    # Blank line between sections, but not before the first one
    prefix = "" if position == 0 else "\n"
    print(f"{prefix}Top 3 Actors 3 in the Most {genre} Movies:")
    for rank, name in enumerate(names, start=1):
        print(f"{rank}. {name} ({tally[name]} {genre} movies)")

In [None]:
# Flatten the three top-3 rankings into one table: a row per (actor, genre)
rankings = (
    ('Action', top_action_actors, action_counts),
    ('Adventure', top_adventure_actors, adventure_counts),
    ('Comedy', top_comedy_actors, comedy_counts),
)
counts = [
    {'Actor': name, 'Genre': genre, 'Count': tally[name]}
    for genre, names, tally in rankings
    for name in names
]
counts_df = pd.DataFrame(counts)

counts_df

In [None]:
# One bar chart per genre, side by side on a shared y-axis
f, axes = plt.subplots(1, 3, figsize=(18, 6), sharey = True)

panels = (('Action', 'blue'), ('Adventure', 'green'), ('Comedy', 'red'))
for ax, (genre, colour) in zip(axes, panels):
    genre_df = counts_df[counts_df['Genre'] == genre]
    sb.barplot(x="Actor", y="Count", data=genre_df, ax=ax, color=colour)
    ax.set_xlabel('Actor')
    ax.set_ylabel('Movies')
    ax.set_title(f'Count Of {genre} Movies For Top 3 Actors 3')

plt.suptitle("Top 3 Actor 3 by Genre")

In [None]:
# Per-genre tallies: production company name -> number of movies
action_counts, adventure_counts, comedy_counts = {}, {}, {}
tallies = {
    "Action": action_counts,
    "Adventure": adventure_counts,
    "Comedy": comedy_counts,
}

# Walk every row of merged_df and credit each of its listed companies once
# per matching genre slot
for _, row in merged_df.iterrows():
    companies = row['production_companies']
    for slot in range(1, 8):
        tally = tallies.get(row[f"genres_{slot}"])
        if tally is None:
            continue
        for company in companies:
            tally[company] = tally.get(company, 0) + 1

# Three most frequent companies per genre, highest count first
top_action_companies = sorted(action_counts, key=action_counts.get, reverse=True)[:3]
top_adventure_companies = sorted(adventure_counts, key=adventure_counts.get, reverse=True)[:3]
top_comedy_companies = sorted(comedy_counts, key=comedy_counts.get, reverse=True)[:3]

# Print out the top 3 production companies for each genre
rankings = (
    ("Action", top_action_companies, action_counts),
    ("Adventure", top_adventure_companies, adventure_counts),
    ("Comedy", top_comedy_companies, comedy_counts),
)
for position, (genre, names, tally) in enumerate(rankings):
    # Blank line between sections, but not before the first one
    prefix = "" if position == 0 else "\n"
    print(f"{prefix}Top 3 Production Companies with the Most {genre} Movies:")
    for rank, name in enumerate(names, start=1):
        print(f"{rank}. {name} ({tally[name]} {genre} movies)")

In [None]:
# Flatten the three top-3 rankings into one table: a row per (company, genre)
rankings = (
    ('Action', top_action_companies, action_counts),
    ('Adventure', top_adventure_companies, adventure_counts),
    ('Comedy', top_comedy_companies, comedy_counts),
)
counts = [
    {'Company': name, 'Genre': genre, 'Count': tally[name]}
    for genre, names, tally in rankings
    for name in names
]
counts_df = pd.DataFrame(counts)

counts_df

In [None]:
# One bar chart per genre, side by side on a shared y-axis
f, axes = plt.subplots(1, 3, figsize=(18, 6), sharey = True)

panels = (('Action', 'blue'), ('Adventure', 'green'), ('Comedy', 'red'))
for ax, (genre, colour) in zip(axes, panels):
    genre_df = counts_df[counts_df['Genre'] == genre]
    sb.barplot(x="Company", y="Count", data=genre_df, ax=ax, color=colour)
    ax.set_xlabel('Companies')
    ax.set_ylabel('Movies')
    # These charts show production companies, so the title says so
    ax.set_title(f'Count Of {genre} Movies For Top 3 Companies')
    # Company names are long; rotate the tick labels so they do not overlap
    ax.tick_params(axis='x', rotation=90)

plt.suptitle("Top 3 Production Companies by Genre")

# NLP using gpt2-medium

In [None]:
merged_df.head()

In [None]:
top_movies = merged_df['title'].tolist()

# Genres of interest; a movie can belong to several of them
genre_words = ['Adventure', 'Action', 'Comedy']
titles_by_genre = {word: [] for word in genre_words}

for _, row in merged_df.iterrows():
    # Everything found in the seven genre slots of this movie
    match_genres = {row[f'genres_{slot}'] for slot in range(1, 8)}
    for word in genre_words:
        if word in match_genres:
            titles_by_genre[word].append(row['title'])

adventure_list = titles_by_genre['Adventure']
action_list = titles_by_genre['Action']
comedy_list = titles_by_genre['Comedy']

# Print the resulting lists
print("combined:", top_movies)
print()
print("adventure:", adventure_list)
print()
print("action:", action_list)
print()
print("comedy:", comedy_list)

In [None]:
pip install bs4

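Once the run with "top_movies" is done, replace "top_movies" in the cell below with each of the other 3 lists of movie titles (adventure_list, action_list, comedy_list) in turn and run it again.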

In [None]:
import requests
from bs4 import BeautifulSoup

# Download the script of every movie in `top_movies` and append it to one
# text file. Movies whose page is unavailable, has no script block, or has
# fewer than 1000 words are skipped.
# NOTE: the file is opened in append mode, so re-running this cell adds to
# the existing file instead of replacing it.
count = 0
with open("movie_transcripts_with_numbers.txt", "a", encoding = "utf-8") as f:
    for movie in top_movies:
        # Format the URL for the movie script page
        url = f"http://www.imsdb.com/scripts/{movie.replace(' ', '-')}.html"

        # Send a request to the URL; a timeout keeps one dead connection
        # from hanging the whole run
        try:
            response = requests.get(url, timeout=30)
        except requests.RequestException as error:
            print(f"Failed to retrieve script for {movie}: {error}")
            continue
        if response.status_code != 200:
            print(f"Failed to retrieve script for {movie}")
            continue

        # Use BeautifulSoup to parse the HTML response text
        soup = BeautifulSoup(response.text, "html.parser")

        # The script lives in <td class="scrtext">; find() returns None when
        # the page has no such cell
        script_cell = soup.find("td", {"class": "scrtext"})
        if script_cell is None:
            print(f"No script text found for {movie}")
            continue
        script_text = script_cell.get_text()

        # Check if the script has less than 1000 words, and skip the movie if it does
        word_count = len(script_text.split())
        if word_count < 1000:
            print(f"{movie} skipped due to less than 1000 words")
            continue

        # Append the script text to the file
        f.write(f"Movie Title: {movie}\n\n")
        f.write(script_text)
        f.write("\n\n")

        count += 1  # one more movie scraped successfully

print(f"Done! Scraped {count} movies.")

In [None]:
  # Open the input and output files
with open('movie_transcripts_with_numbers.txt', 'r') as infile, open('movie_transcripts.txt', 'w') as outfile:
    # Loop through each line in the input file
    for line in infile:
        # Remove any digits from the line
        line = ''.join(c for c in line if not c.isdigit())
        # Write the updated line to the output file
        outfile.write(line)

In [None]:
with open("movie_transcripts.txt", "r") as f:
    lines = f.readlines()

with open("movie_transcripts.txt", "w") as f:
    for line in lines:
        f.write(line.strip() + "\n")

In [None]:
with open("movie_transcripts.txt", "r") as f:
    lines = f.readlines()

with open("movie_transcripts.txt", "w") as f:
    for line in lines:
        if line.replace(".", "").strip() != "":
            f.write(line)

1. Create the folders storage, storage/data and storage/models in the notebook's working directory (the code below uses these relative paths)
2. Move the cleaned "movie_transcripts.txt" into the storage/data folder


In [None]:
pip install transformers
pip install torch

In [None]:
import logging
import os
import pickle

import torch
import torch.nn as nn
import transformers
from torch.utils.data import DataLoader, Dataset, RandomSampler, SequentialSampler
from transformers import (
    GPT2Config,
    GPT2LMHeadModel,
    GPT2PreTrainedModel,
    GPT2Tokenizer,
    PreTrainedModel,
    PreTrainedTokenizer,
)

MODEL_CLASSES = {"gpt2": (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer)}

FILE_PATH = os.path.join("storage", "data", "movie_transcripts.txt")

logger = logging.getLogger(__name__)

class ScriptData(Dataset):
    """Dataset of fixed-length token-id blocks cut from a single text file.

    The file is tokenized once and sliced into consecutive blocks; the result
    is pickled next to the source file so later runs can load it directly.

    Args:
        tokenizer: Tokenizer used to turn the text into token ids.
        file_path: Path of the text file; it must already exist.
        block_size: Requested block length, reduced by the tokenizer's
            ``model_max_length - max_len_single_sentence`` before slicing.
        overwrite_cache: When true, ignore an existing cache file and rebuild it.
    """

    def __init__(
        self,
        tokenizer: PreTrainedTokenizer,
        file_path: str,
        block_size=512,
        overwrite_cache=False,
    ):
        assert os.path.isfile(file_path)

        # presumably the number of special tokens added per sequence — TODO confirm
        reserved = tokenizer.model_max_length - tokenizer.max_len_single_sentence
        block_size -= reserved

        directory, filename = os.path.split(file_path)

        # The cache name encodes the effective block size
        # (change if args are added at later point)
        cached_features_file = os.path.join(
            directory, f"gpt2_{block_size}_{filename}"
        )

        if overwrite_cache or not os.path.exists(cached_features_file):
            logger.info(f"Creating features from file {filename} at {directory}")

            with open(file_path, encoding="utf-8") as source:
                text = source.read()
                logger.debug("Succesfully read text from file")

            token_ids = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

            # Only full blocks are kept; a shorter trailing remainder is dropped
            block_starts = range(0, len(token_ids) - block_size + 1, block_size)
            self.examples = [
                tokenizer.build_inputs_with_special_tokens(
                    token_ids[start : start + block_size]
                )
                for start in block_starts
            ]

            logger.info(f"Saving features into cached file {cached_features_file}")
            with open(cached_features_file, "wb") as cache:
                pickle.dump(self.examples, cache, protocol=pickle.HIGHEST_PROTOCOL)
        else:
            logger.info(
                f"Loading features from your cached file {cached_features_file}"
            )
            # NOTE: unpickling can execute code; only load cache files you created
            with open(cached_features_file, "rb") as cache:
                self.examples = pickle.load(cache)
                logger.debug("Loaded examples from cache")

    def __len__(self):
        """Return the number of stored blocks."""
        return len(self.examples)

    def __getitem__(self, item):
        """Return block `item` as a tensor of dtype long."""
        block = self.examples[item]
        return torch.tensor(block, dtype=torch.long)

In [None]:
import torch
from torch.utils.data import Dataset
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup, WEIGHTS_NAME, CONFIG_NAME
import numpy as np
import os
import random

In [None]:
# Prefer the GPU whenever CUDA is usable; otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# Load the pretrained "gpt2-medium" tokenizer, then the matching language
# model, and place the model on the selected device in the same statement.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
model = GPT2LMHeadModel.from_pretrained("gpt2-medium").to(device)

In [None]:
# Location of the training text: <storage>/<data>/movie_transcripts.txt,
# joined with the platform's path separator.
FILE_PATH = os.path.join(
    os.path.join("storage", "data"),
    "movie_transcripts.txt",
)

In [None]:
# Build the dataset from FILE_PATH and wrap it in a shuffling loader that
# yields 4 examples per iteration.
dataset = ScriptData(tokenizer=tokenizer, file_path=FILE_PATH)
script_loader = DataLoader(
    dataset=dataset,
    batch_size=4,
    shuffle=True,
)

In [None]:
# Training hyperparameters.
# BATCH_SIZE is the number of loader iterations accumulated before each
# optimizer step in the training loop, not the DataLoader batch size.
BATCH_SIZE, EPOCHS = 7, 1
LEARNING_RATE = 2e-5
WARMUP_STEPS = 10_000

In [None]:
# Put the model on the target device in training mode and create the
# optimizer, the learning-rate schedule and the running counters used by the
# training loop.
model = model.to(device)
model.train()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# The scheduler is stepped once per optimizer step, i.e. once every
# BATCH_SIZE loader iterations, and the accumulation counter carries over
# from one epoch to the next.
total_optimizer_steps = max(1, (len(script_loader) * EPOCHS) // BATCH_SIZE)

# The schedule needs the real number of training steps: with -1 the linear
# decay factor is clamped to 0 as soon as warmup ends, which pins the
# learning rate at 0 for the remainder of training.
# NOTE(review): if WARMUP_STEPS exceeds total_optimizer_steps the learning
# rate never reaches LEARNING_RATE -- confirm the warmup length against the
# dataset size.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=total_optimizer_steps,
)

script_count = 0   # loader iterations accumulated since the last optimizer step
sum_loss = 0.0     # running loss since the last report
batch_count = 0    # optimizer steps since the last report/checkpoint

In [None]:
# Directory that receives the fine-tuned weights, the model config and the
# tokenizer vocabulary files.
output_dir = "./storage/models"

In [None]:
def _save_checkpoint(model, tokenizer, output_dir):
    """Write model weights, model config and tokenizer vocabulary to ``output_dir``."""
    # torch.save does not create missing parent directories.
    os.makedirs(output_dir, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(output_dir, WEIGHTS_NAME))
    model.config.to_json_file(os.path.join(output_dir, CONFIG_NAME))
    tokenizer.save_vocabulary(output_dir)


# Fine-tuning loop with manual gradient accumulation: gradients from
# BATCH_SIZE consecutive loader iterations are accumulated before a single
# optimizer/scheduler step. Every 30 optimizer steps the running loss is
# printed, one sample is generated and a checkpoint is written.
for epoch in range(EPOCHS):
    print(f"EPOCH {epoch} started" + '=' * 30)
    for script in script_loader:
        # Transfer the batch once and reuse it as both inputs and labels.
        script = script.to(device)
        outputs = model(script, labels=script)

        # With labels supplied, the first output element is the loss.
        # NOTE(review): the loss is not divided by BATCH_SIZE, so the
        # accumulated gradient is a sum rather than a mean -- confirm intended.
        loss = outputs[0]
        loss.backward()
        sum_loss = sum_loss + loss.detach()

        script_count = script_count + 1
        if script_count == BATCH_SIZE:
            script_count = 0
            batch_count += 1
            optimizer.step()
            scheduler.step()
            # The optimizer was built from model.parameters(), so this clears
            # every gradient of the model.
            optimizer.zero_grad()

        if batch_count == 30:
            model.eval()
            print(f"sum loss {sum_loss}")
            # Seed generation with a random token id instead of a prompt.
            sample_outputs = model.generate(
                bos_token_id=random.randint(1, 30000),
                do_sample=True,
                top_k=50,
                max_length=1000,
                top_p=0.95,
                num_return_sequences=1,
            )

            print("Output:\n" + 100 * '-')
            for i, sample_output in enumerate(sample_outputs):
                print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

            _save_checkpoint(model, tokenizer, output_dir)

            batch_count = 0
            sum_loss = 0.0
            model.train()

# Persist the final weights as well: the in-loop checkpoint only fires after
# every 30th optimizer step, and the following cell reloads the model from
# output_dir, which would otherwise be stale or missing.
_save_checkpoint(model, tokenizer, output_dir)

In [None]:
# Reload the model and the tokenizer from the files stored in output_dir,
# replacing the in-memory objects used so far.
model = GPT2LMHeadModel.from_pretrained(output_dir)
tokenizer = GPT2Tokenizer.from_pretrained(output_dir)

In [None]:
# Switch the reloaded model to evaluation mode (a later cell calls
# model.train() again before the second training run).
model.eval()

In [None]:
# Rebuild the dataset with the reloaded tokenizer and wrap it in a shuffling
# loader that yields 4 examples per iteration.
dataset = ScriptData(tokenizer=tokenizer, file_path=FILE_PATH)
script_loader = DataLoader(
    dataset=dataset,
    batch_size=4,
    shuffle=True,
)

In [None]:
# Hyperparameters for the second training run (three epochs this time).
# BATCH_SIZE is the number of loader iterations accumulated before each
# optimizer step in the training loop, not the DataLoader batch size.
BATCH_SIZE, EPOCHS = 7, 3
LEARNING_RATE = 2e-5
WARMUP_STEPS = 10_000

In [None]:
# Put the reloaded model on the target device in training mode and create a
# fresh optimizer, learning-rate schedule and running counters for the
# second training run.
model = model.to(device)
model.train()
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# The scheduler is stepped once per optimizer step, i.e. once every
# BATCH_SIZE loader iterations, and the accumulation counter carries over
# from one epoch to the next.
total_optimizer_steps = max(1, (len(script_loader) * EPOCHS) // BATCH_SIZE)

# The schedule needs the real number of training steps: with -1 the linear
# decay factor is clamped to 0 as soon as warmup ends, which pins the
# learning rate at 0 for the remainder of training.
# NOTE(review): if WARMUP_STEPS exceeds total_optimizer_steps the learning
# rate never reaches LEARNING_RATE -- confirm the warmup length against the
# dataset size.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=WARMUP_STEPS,
    num_training_steps=total_optimizer_steps,
)

script_count = 0   # loader iterations accumulated since the last optimizer step
sum_loss = 0.0     # running loss since the last report
batch_count = 0    # optimizer steps since the last report/checkpoint

In [None]:
# Second fine-tuning run with manual gradient accumulation: gradients from
# BATCH_SIZE consecutive loader iterations are accumulated before a single
# optimizer/scheduler step. Every 30 optimizer steps the running loss is
# printed, one sample is generated and a checkpoint is written.
for epoch in range(EPOCHS):
    print(f"EPOCH {epoch} started" + '=' * 30)
    for script in script_loader:
        # Transfer the batch once and reuse it as both inputs and labels.
        script = script.to(device)
        outputs = model(script, labels=script)

        # With labels supplied, the first output element is the loss.
        # NOTE(review): the loss is not divided by BATCH_SIZE, so the
        # accumulated gradient is a sum rather than a mean -- confirm intended.
        loss = outputs[0]
        loss.backward()
        sum_loss = sum_loss + loss.detach()

        script_count = script_count + 1
        if script_count == BATCH_SIZE:
            script_count = 0
            batch_count += 1
            optimizer.step()
            scheduler.step()
            # The optimizer was built from model.parameters(), so this clears
            # every gradient of the model.
            optimizer.zero_grad()

        if batch_count == 30:
            model.eval()
            print(f"sum loss {sum_loss}")
            # Seed generation with a random token id instead of a prompt.
            sample_outputs = model.generate(
                bos_token_id=random.randint(1, 30000),
                do_sample=True,
                top_k=50,
                max_length=1000,
                top_p=0.95,
                num_return_sequences=1,
            )

            print("Output:\n" + 100 * '-')
            for i, sample_output in enumerate(sample_outputs):
                print("{}: {}".format(i, tokenizer.decode(sample_output, skip_special_tokens=True)))

            # torch.save does not create missing parent directories.
            os.makedirs(output_dir, exist_ok=True)
            output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
            output_config_file = os.path.join(output_dir, CONFIG_NAME)

            torch.save(model.state_dict(), output_model_file)
            model.config.to_json_file(output_config_file)
            tokenizer.save_vocabulary(output_dir)

            batch_count = 0
            sum_loss = 0.0
            model.train()

In [None]:
# Persist the final state of the model: weights, config and tokenizer
# vocabulary all go into output_dir.
# torch.save does not create missing parent directories, and nothing
# guarantees a checkpoint was already written there during training.
os.makedirs(output_dir, exist_ok=True)

output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
output_config_file = os.path.join(output_dir, CONFIG_NAME)

torch.save(model.state_dict(), output_model_file)
model.config.to_json_file(output_config_file)
tokenizer.save_vocabulary(output_dir)

In [None]:
import transformers
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import random
from transformers import AutoModel, AutoTokenizer, AutoModelWithLMHead
import os
import json

def load_model(model_dir=None):
    """Load a GPT-2 tokenizer and language model from a directory.

    Args:
        model_dir: Directory passed to ``from_pretrained``. When ``None``,
            ``'./storage/models/'`` is used.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    checkpoint_dir = './storage/models/' if model_dir is None else model_dir
    # Tokenizer first, then the model, so a failure surfaces in the same order.
    loaded_tokenizer = GPT2Tokenizer.from_pretrained(checkpoint_dir)
    loaded_model = GPT2LMHeadModel.from_pretrained(checkpoint_dir)
    return loaded_model, loaded_tokenizer

def generate(model, tokenizer, input_text=None, num_samples=1, max_length=1000):
    """Sample text from a language model, optionally continuing a prompt.

    Sampling uses ``do_sample=True`` with ``top_k=50`` and ``top_p=0.95``.

    Args:
        model: Object exposing ``eval()`` and ``generate(**kwargs)``.
        tokenizer: Object exposing ``encode(text, return_tensors='pt')`` and
            ``decode(ids, skip_special_tokens=True)``.
        input_text: Prompt to continue. When ``None`` or empty, generation is
            seeded with a random ``bos_token_id`` in ``[1, 50000]`` instead.
        num_samples: Passed to the model as ``num_return_sequences``.
        max_length: Passed to the model as ``max_length``.

    Returns:
        A list with one decoded string per generated sequence.
    """
    model.eval()

    # Settings shared by the prompted and the unprompted call.
    sampling_kwargs = dict(
        do_sample=True,
        top_k=50,
        max_length=max_length,
        top_p=0.95,
        num_return_sequences=num_samples,
    )

    if input_text:
        input_ids = tokenizer.encode(input_text, return_tensors='pt')
        # The tokenizer does not know where the model lives; move the prompt
        # next to the model so generation also works when it sits on a GPU.
        model_device = getattr(model, "device", None)
        if model_device is not None:
            input_ids = input_ids.to(model_device)
        output = model.generate(input_ids=input_ids, **sampling_kwargs)
    else:
        output = model.generate(
            bos_token_id=random.randint(1, 50000),
            **sampling_kwargs,
        )

    return [
        tokenizer.decode(sample, skip_special_tokens=True)
        for sample in output
    ]

In [None]:
# Load the model and tokenizer from load_model's default directory.
model, tokenizer = load_model()

In [None]:
# Prompt text handed to generate() as the start of the sampled output.
context = "The quick brown fox jumps over the lazy dog."

In [None]:
# Sample a continuation of the prompt, capped at 1000 tokens in total.
sample = generate(
    model,
    tokenizer,
    input_text=context,
    max_length=1000,
)

In [None]:
# Print every generated text on its own line.
for generated_text in sample:
    print(generated_text)