In [641]:
import pandas as pd

In [642]:
import seaborn as sns
import matplotlib.pyplot as plt

In [643]:
movies = pd.read_csv('Horror Movies IMDb.csv')

In [644]:
movies.head(10)

Unnamed: 0,Movie Title,Movie Year,Runtime,Genre,Rating,Director,Votes,Gross
0,Alien,1979,117,"Horror, Sci-Fi",8.5,Ridley Scott,905275,$78.90M
1,Psycho,1960,109,"Horror, Mystery, Thriller",8.5,Alfred Hitchcock,689068,$32.00M
2,The Shining,1980,146,"Drama, Horror",8.4,Stanley Kubrick,1051582,$44.02M
3,The Thing,1982,109,"Horror, Mystery, Sci-Fi",8.2,John Carpenter,439793,$13.78M
4,Tumbbad,2018,104,"Drama, Fantasy, Horror",8.2,Rahi Anil Barve,53297,
5,The Exorcist,1973,122,Horror,8.1,William Friedkin,422330,$232.91M
6,Diabolique,1955,117,"Crime, Drama, Horror",8.1,Henri-Georges Clouzot,67456,$1.09M
7,Rosemary's Baby,1968,137,"Drama, Horror",8.0,Roman Polanski,223968,
8,What Ever Happened to Baby Jane?,1962,134,"Drama, Horror, Thriller",8.0,Robert Aldrich,58904,$4.05M
9,The Cabinet of Dr. Caligari,1920,67,"Horror, Mystery, Thriller",8.0,Robert Wiene,66749,


In [645]:
# Convert 'Gross' values to numerical format
movies['Gross'] = movies['Gross'].str.replace('$', '').str.replace('M', '').astype(float) * 1e6

movies['Gross'] = movies['Gross'].fillna(0).astype(int)

In [646]:
# Create a new column 'Subgenre' and initialize it with empty strings
movies['Subgenre'] = ''

# Identify rows where 'Genre' is not 'Horror'
non_horror_mask = movies['Genre'] != 'Horror'

# Copy the non-horror genres from 'Genre' to 'Subgenre'
movies.loc[non_horror_mask, 'Subgenre'] = movies.loc[non_horror_mask, 'Genre']

# Set 'Genre' as 'Horror' for all rows
movies['Genre'] = 'Horror'

# Remove 'Horror' or ', Horror' from the 'Subgenre' column
movies['Subgenre'] = movies['Subgenre'].str.replace(r'\bHorror\b|\s*,\s*Horror\b', '', regex=True)

movies['Subgenre'] = movies['Subgenre'].str.replace(r'^,', '', regex=True)

In [647]:
movies[['Movie Title', 'Genre', 'Subgenre']].head()

Unnamed: 0,Movie Title,Genre,Subgenre
0,Alien,Horror,Sci-Fi
1,Psycho,Horror,"Mystery, Thriller"
2,The Shining,Horror,Drama
3,The Thing,Horror,"Mystery, Sci-Fi"
4,Tumbbad,Horror,"Drama, Fantasy"


In [648]:
movies['Votes'] = movies['Votes'].str.replace(',', '')  # Remove commas from the string
movies['Votes'] = movies['Votes'].astype(int)  # Convert the column to integer

In [649]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 836 entries, 0 to 835
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Movie Title  836 non-null    object 
 1   Movie Year   836 non-null    int64  
 2   Runtime      836 non-null    int64  
 3   Genre        836 non-null    object 
 4   Rating       836 non-null    float64
 5   Director     836 non-null    object 
 6   Votes        836 non-null    int64  
 7   Gross        836 non-null    int64  
 8   Subgenre     836 non-null    object 
dtypes: float64(1), int64(4), object(4)
memory usage: 58.9+ KB


In [650]:
movies.head()

Unnamed: 0,Movie Title,Movie Year,Runtime,Genre,Rating,Director,Votes,Gross,Subgenre
0,Alien,1979,117,Horror,8.5,Ridley Scott,905275,78900000,Sci-Fi
1,Psycho,1960,109,Horror,8.5,Alfred Hitchcock,689068,32000000,"Mystery, Thriller"
2,The Shining,1980,146,Horror,8.4,Stanley Kubrick,1051582,44020000,Drama
3,The Thing,1982,109,Horror,8.2,John Carpenter,439793,13780000,"Mystery, Sci-Fi"
4,Tumbbad,2018,104,Horror,8.2,Rahi Anil Barve,53297,0,"Drama, Fantasy"


In [651]:
import plotly.graph_objects as go

# Count the frequency of each movie year
year_counts = movies['Movie Year'].value_counts()

# Sort the movie years in ascending order
sorted_years = sorted(year_counts.index)

# Create the area chart
fig = go.Figure(data=[
    go.Scatter(x=sorted_years, y=year_counts[sorted_years], fill='tozeroy', fillcolor='red')
])

# Set labels and title
fig.update_layout(
    xaxis=dict(title='Years'),
    yaxis=dict(title='Frequency'),
    title='Cumulative Distribution of Horror Movies Over the Years',
)

# Display the plot
fig.show()


In [652]:
import plotly.graph_objects as go

# Get the top 5 longest and shortest movies
top_longest_movies = movies.nlargest(5, 'Runtime')
top_shortest_movies = movies.nsmallest(5, 'Runtime')

# Combine the longest and shortest movies into a single DataFrame
combined_movies = top_longest_movies.append(top_shortest_movies)

# Create the grouped bar chart
fig = go.Figure(data=[
    go.Bar(
        name='Longest Movies',
        x=top_longest_movies['Movie Title'],
        y=top_longest_movies['Runtime'],
        marker_color='red'
    ),
    go.Bar(
        name='Shortest Movies',
        x=top_shortest_movies['Movie Title'],
        y=-top_shortest_movies['Runtime'],
        marker_color='black'
    )
])

fig.update_layout(
    barmode='group',
    title='Comparison of Runtime: Top 5 Longest vs Shortest Movies',
    xaxis_title='Movie Title',
    yaxis_title='Runtime (minutes)',
    showlegend=True
)

fig.show()


In [653]:
import plotly.graph_objects as go

# Get the Top 5 Best User Rated Movies in descending order
top_rated_movies = movies.nlargest(5, 'Rating')[::-1]

# Create a bar chart for the top 5 best movies
fig = go.Figure(data=go.Bar(
    x=top_rated_movies['Rating'],
    y=top_rated_movies['Movie Title'],
    orientation='h',
    marker_color='red'
))

fig.update_layout(
    title='Top 5 Best User Rated Movies',
    xaxis_title='Rating',
    yaxis_title='Movie Title'
)

fig.show()

# Top 5 Worst User Rated Movies in descending order
top_shortest_movies = movies.nsmallest(5, 'Rating').sort_values('Rating', ascending=False)

# Create a bar chart for the top 5 worst movies
fig = go.Figure(data=go.Bar(
    x=top_shortest_movies['Rating'],
    y=top_shortest_movies['Movie Title'],
    orientation='h',
    marker_color='red'
))

fig.update_layout(
    title='Top 5 Worst User Rated Movies',
    xaxis_title='Rating',
    yaxis_title='Movie Title'
)

fig.show()




In [654]:
# Split subgenres and explode to separate rows
movies['Subgenre'] = movies['Subgenre'].str.split(', ')
movies_exploded = movies.explode('Subgenre')

# Count the occurrence of each subgenre
subgenre_counts = movies_exploded['Subgenre'].value_counts().reset_index()
subgenre_counts.columns = ['Subgenre', 'Count']

In [660]:
# Create a bar chart
fig = go.Figure(data=go.Bar(
    x=subgenre_counts['Subgenre'],
    y=subgenre_counts['Count'],
    marker_color='red'
))

fig.update_layout(
    title='Count of Movies by Subgenre',
    xaxis_title='Subgenre',
    yaxis_title='Count'
)

fig.show()

In [659]:
# Explode the subgenres into separate rows
exploded_movies = movies.explode('Subgenre')

# Calculate the sum of gross by subgenre
subgenre_gross = exploded_movies.groupby('Subgenre')['Gross'].sum().reset_index()

# Sort the DataFrame by the gross in descending order
subgenre_gross = subgenre_gross.sort_values('Gross', ascending=False)

# Create a bar chart for the gross by subgenre
fig_gross = go.Figure(data=go.Bar(
    x=subgenre_gross['Subgenre'],
    y=subgenre_gross['Gross'],
    marker_color='red'
))

fig_gross.update_layout(
    title='Total Gross by Subgenre',
    xaxis_title='Subgenre',
    yaxis_title='Gross',
)

fig_gross.show()

In [671]:
# Create a histogram of ratings
fig = go.Figure(data=go.Histogram(x=movies['Rating'], nbinsx=22, marker_color='red'))

# Set labels and title
fig.update_layout(
    xaxis_title='Rating',
    yaxis_title='Frequency',
    title='Distribution of Ratings',
    xaxis_range=[1, 10]
)

# Display the plot
fig.show()

In [676]:
# Exclude movies with a gross of 0
non_zero_gross_movies = movies[movies['Gross'] != 0]

# Get the highest and lowest grossing movies
highest_grossing_movie = non_zero_gross_movies.nlargest(1, 'Gross')
lowest_grossing_movie = non_zero_gross_movies.nsmallest(1, 'Gross')

# Create the scatter plot
fig = go.Figure()

# Add the highest grossing movie as a scatter point
fig.add_trace(go.Scatter(
    x=[highest_grossing_movie['Movie Title'].values[0]],
    y=[highest_grossing_movie['Gross'].values[0]],
    mode='markers',
    name='Highest Grossing',
    marker=dict(color='green', size=10)
))

# Add the lowest grossing movie as a scatter point
fig.add_trace(go.Scatter(
    x=[lowest_grossing_movie['Movie Title'].values[0]],
    y=[lowest_grossing_movie['Gross'].values[0]],
    mode='markers',
    name='Lowest Grossing',
    marker=dict(color='red', size=10)
))

# Set labels and title
fig.update_layout(
    xaxis_title='Movie Title',
    yaxis_title='Gross Revenue',
    title='Highest Grossing vs Lowest Grossing Movie'
)

# Display the plot
fig.show()

In [678]:
# Group the movies by director and calculate the total gross for each director
director_gross = movies.groupby('Director')['Gross'].sum().nlargest(5)

# Create the bar chart
fig = go.Figure(data=go.Bar(
    x=director_gross.index,
    y=director_gross.values,
    marker_color='red'
))

# Set labels and title
fig.update_layout(
    title='Top 5 Directors with Highest Gross',
    xaxis_title='Director',
    yaxis_title='Total Gross'
)

# Display the plot
fig.show()

In [702]:
def print_movie_ratings(director_name):
    # Filter the movies directed by the specified director
    director_movies = movies[movies['Director'] == director_name]

    # Sort the movies by their ratings in descending order
    director_movies = director_movies.sort_values('Gross', ascending=False)

    # Print the movie titles and their ratings
    for index, row in director_movies.iterrows():
        print(f"Movie Title: {row['Movie Title']}, Gross: {row['Gross']}")

# Call the function for each director
directors = ['Andy Muschietti', 'Genndy Tartakovsky', 'Sam Raimi',
             'Jordan Peele', 'Wes Craven']

for director in directors:
    print(f"Movies directed by {director}:")
    print_movie_ratings(director)
    print()

Movies directed by Andy Muschietti:
Movie Title: It I, Gross: 327480000
Movie Title: It Chapter Two, Gross: 211590000
Movie Title: Mama I, Gross: 71630000

Movies directed by Genndy Tartakovsky:
Movie Title: Hotel Transylvania 2, Gross: 169700000
Movie Title: Hotel Transylvania 3: Summer Vacation, Gross: 167510000
Movie Title: Hotel Transylvania, Gross: 148310000

Movies directed by Sam Raimi:
Movie Title: Doctor Strange in the Multiverse of Madness, Gross: 411330000
Movie Title: Drag Me to Hell, Gross: 42100000
Movie Title: The Gift, Gross: 12010000
Movie Title: Army of Darkness, Gross: 11500000
Movie Title: Evil Dead II, Gross: 5920000
Movie Title: The Evil Dead, Gross: 2400000

Movies directed by Jordan Peele:
Movie Title: Get Out I, Gross: 176040000
Movie Title: Us II, Gross: 175080000
Movie Title: Nope, Gross: 123280000

Movies directed by Wes Craven:
Movie Title: Scream, Gross: 103050000
Movie Title: Scream 2, Gross: 101360000
Movie Title: Scream 3, Gross: 89140000
Movie Title: S

In [680]:
# Count the number of movies for each director
director_movie_count = movies['Director'].value_counts().nlargest(5)

# Create the bar chart
fig = go.Figure(data=go.Bar(
    x=director_movie_count.index,
    y=director_movie_count.values,
    marker_color='red'
))

# Set labels and title
fig.update_layout(
    title='Top 5 Directors with Most Horror Movies',
    xaxis_title='Director',
    yaxis_title='Number of Movies'
)

# Display the plot
fig.show()

In [683]:
# Get the top 5 directors with the most horror movies
top_directors = movies['Director'].value_counts().nlargest(5).index.tolist()

# Filter the movies to include only the top directors
top_director_movies = movies[movies['Director'].isin(top_directors)]

# Group the movies by director and calculate the sum of gross earnings
director_gross = top_director_movies.groupby('Director')['Gross'].sum()

# Create the grouped bar chart
fig = go.Figure()

for director in top_directors:
    director_movies = top_director_movies[top_director_movies['Director'] == director]
    fig.add_trace(go.Bar(
        x=[director],
        y=[director_movies['Gross'].sum()],
        name=director
    ))

# Set labels and title
fig.update_layout(
    title='Top 5 Directors: Horror Movie Count and Gross Earnings',
    xaxis_title='Director',
    yaxis_title='Gross Earnings'
)

# Display the plot
fig.show()








In [701]:
def print_movie_ratings(director_name):
    # Filter the movies directed by the specified director
    director_movies = movies[movies['Director'] == director_name]

    # Sort the movies by their ratings in descending order
    director_movies = director_movies.sort_values('Gross', ascending=False)

    # Print the movie titles and their ratings
    for index, row in director_movies.iterrows():
        print(f"Movie Title: {row['Movie Title']}, Gross: {row['Gross']}")

# Call the function for each director
directors = ['John Carpenter', 'Wes Craven', 'David cronenberg',
             'James Wan', 'Guillermo del Toro']

for director in directors:
    print(f"Movies directed by {director}:")
    print_movie_ratings(director)
    print()

Movies directed by John Carpenter:
Movie Title: Halloween, Gross: 47000000
Movie Title: The Fog, Gross: 21380000
Movie Title: Christine, Gross: 21200000
Movie Title: Vampires, Gross: 20240000
Movie Title: Prince of Darkness, Gross: 14180000
Movie Title: The Thing, Gross: 13780000
Movie Title: They Live, Gross: 13010000
Movie Title: Village of the Damned, Gross: 9420000
Movie Title: In the Mouth of Madness, Gross: 8950000
Movie Title: Ghosts of Mars, Gross: 8430000
Movie Title: The Ward, Gross: 0

Movies directed by Wes Craven:
Movie Title: Scream, Gross: 103050000
Movie Title: Scream 2, Gross: 101360000
Movie Title: Scream 3, Gross: 89140000
Movie Title: Scream 4, Gross: 38180000
Movie Title: A Nightmare on Elm Street, Gross: 25500000
Movie Title: The Hills Have Eyes, Gross: 25000000
Movie Title: The People Under the Stairs, Gross: 24200000
Movie Title: The Serpent and the Rainbow, Gross: 19600000
Movie Title: Cursed, Gross: 19300000
Movie Title: New Nightmare, Gross: 18090000
Movie Ti

In [686]:
# Get the top 5 directors with the highest rated movies
top_directors = movies.groupby('Director')['Rating'].mean().nlargest(5).index.tolist()

# Filter the movies to include only the top directors
top_director_movies = movies[movies['Director'].isin(top_directors)]

# Calculate the average rating for each director
director_avg_rating = top_director_movies.groupby('Director')['Rating'].mean()

# Create the bar plot
fig = go.Figure(data=go.Bar(
    x=top_directors,
    y=director_avg_rating,
    marker_color='red'
))

# Set labels and title
fig.update_layout(
    title='Top 5 Directors: Average Movie Rating',
    xaxis=dict(
        title='Director',
        tickangle=45
    ),
    yaxis=dict(
        title='Average Rating'
    )
)

# Display the plot
fig.show()

In [700]:
def print_movie_ratings(director_name):
    # Filter the movies directed by the specified director
    director_movies = movies[movies['Director'] == director_name]

    # Sort the movies by their ratings in descending order
    director_movies = director_movies.sort_values('Rating', ascending=False)

    # Print the movie titles and their ratings
    for index, row in director_movies.iterrows():
        print(f"Movie Title: {row['Movie Title']}, Rating: {row['Rating']}")

# Call the function for each director
directors = ['Marwan Hamed', 'Alfred Hitchcock', 'Henri-Georges Clouzot',
             'Rahi Anil Barave', 'Stanley Kubrick']

for director in directors:
    print(f"Movies directed by {director}:")
    print_movie_ratings(director)
    print()


Movies directed by Marwan Hamed:
Movie Title: The Blue Elephant, Rating: 8.0

Movies directed by Alfred Hitchcock:
Movie Title: Psycho, Rating: 8.5
Movie Title: The Birds, Rating: 7.6

Movies directed by Henri-Georges Clouzot:
Movie Title: Diabolique, Rating: 8.1

Movies directed by Rahi Anil Barave:

Movies directed by Stanley Kubrick:
Movie Title: The Shining, Rating: 8.4

