In [None]:
import ast  # for parsing strings into dictionaries
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Read the raw movie metadata CSV into a DataFrame and report its (rows, columns) size.
movies_metadata_file_path = '/kaggle/input/movie-genre/movies_metadata.csv'
movies_df = pd.read_csv(
    filepath_or_buffer=movies_metadata_file_path,
    low_memory=False,
)
print(movies_df.shape)

In [None]:
# Peek at the first record to see which fields a movie row carries.
first_movie = movies_df.iloc[0]
print(first_movie)

In [None]:
# Tally every distinct value of the 'adult' column; dropna=False keeps NaN in the tally.
adult_value_counts = movies_df['adult'].value_counts(dropna=False)
print("Values count adult column:")
print(adult_value_counts)

In [None]:
# Bad data: rows whose 'adult' field is neither the string 'True' nor the string 'False'.
has_valid_adult_flag = movies_df['adult'].isin(['True', 'False'])
invalid_adult_values_df = movies_df.loc[~has_valid_adult_flag]

print("Rows with invalid 'adult' values:")
invalid_adult_values_df.head()

In [None]:
# Keep only non-adult films (the adult subset is too small a sample to be useful).
# Comparing against the string 'False' also drops every row whose 'adult' value is
# anything else, including malformed values.
# .copy() makes the result an independent DataFrame, so adding columns to it later
# does not write into a slice of the original frame (SettingWithCopyWarning).
movies_df = movies_df[movies_df['adult'] == 'False'].copy()

In [None]:
# The 'genres' column holds a string representation rather than a parsed object;
# print the first one to see its shape.
first_genres_raw = movies_df['genres'].iloc[0]
print(first_genres_raw)

In [None]:
def genre_transform(x):
    """Extract the genre names from one raw ``genres`` value.

    Args:
        x: One of
            * a string containing a Python-literal sequence of dicts that each
              have a ``'name'`` key,
            * an already-parsed list/tuple of such dicts, or
            * a missing value (``None`` / NaN).

    Returns:
        list: The ``'name'`` of every genre entry, in input order. An empty list
        is returned for missing values and for input that cannot be parsed; in
        the latter case the problem is also printed so it is not lost silently.
    """
    try:
        if isinstance(x, str):
            # literal_eval only evaluates Python literals, never arbitrary code.
            genres = ast.literal_eval(x)
        elif isinstance(x, (list, tuple)):
            # Already parsed. A null check must not be used as a truth test here:
            # on a sequence it returns an element-wise array, not a single bool.
            genres = x
        elif pd.isnull(x):
            return []
        else:
            raise TypeError(f"unsupported genres value of type {type(x).__name__}")
        return [genre['name'] for genre in genres]
    except (ValueError, SyntaxError, TypeError, KeyError, MemoryError, RecursionError) as e:
        # Best effort: report the offending value and fall back to "no genres".
        print(f"Error for input {x}: {e}")
        return []

In [None]:
# Parse every raw 'genres' value into a list of genre names.
movies_df['transformed_genres'] = movies_df['genres'].map(genre_transform)

In [None]:
# Compare the raw genre strings with the parsed name lists side by side.
movies_df.loc[:, ['genres', 'transformed_genres']].head()

In [None]:
# How many movies have a missing (NaN) overview?
overview_is_missing = movies_df['overview'].isnull()
nan_overview_count = overview_is_missing.sum()
print(f"Number of movies with NaN overview: {nan_overview_count}")

In [None]:
# Keep only the rows that have an overview, then renumber the index from 0.
has_overview = movies_df['overview'].notna()
movies_df = movies_df.loc[has_overview].reset_index(drop=True)

In [None]:
# Count overviews that are a literal Python None object.
# `Series == None` is evaluated element-wise and yields False for every row, so it
# can never detect None; an identity check per element is needed instead.
none_overview_count = movies_df['overview'].map(lambda value: value is None).sum()

print(f"Number of movies with None overview: {none_overview_count}")

In [None]:
# Trim leading and trailing whitespace from every overview.
stripped_overviews = movies_df['overview'].str.strip()
movies_df['overview'] = stripped_overviews

In [None]:
# How many overviews are exactly the empty string?
overview_is_empty = movies_df['overview'].eq('')
empty_string_overview_count = overview_is_empty.sum()
print(f"Number of movies with empty string overview: {empty_string_overview_count}")

In [None]:
# Keep only rows whose overview is not the empty string, then renumber the index.
overview_is_non_empty = movies_df['overview'].ne('')
movies_df = movies_df.loc[overview_is_non_empty].reset_index(drop=True)

In [None]:
# Number of genres attached to each movie.
movies_df['genre_count'] = movies_df['transformed_genres'].map(len)
# True when a genre name occurs more than once in a movie's list
# (the set of names is then shorter than the list itself).
movies_df['has_duplicates'] = movies_df['transformed_genres'].map(
    lambda genre_names: len(set(genre_names)) != len(genre_names)
)

In [None]:
# Display movies flagged as having a repeated genre; an empty table means there are none.
movies_df.loc[movies_df['has_duplicates']].head()

In [None]:
# Discard movies that have no genre at all, then renumber the index.
has_any_genre = movies_df['genre_count'].ne(0)
movies_df = movies_df.loc[has_any_genre].reset_index(drop=True)

## Genre overview

In [None]:
# Genre occurrences: how many movies carry each genre, most frequent first.
# (Counter is imported in the notebook's first cell.)

# Flatten the per-movie genre lists into one long list of genre names.
all_genres = [genre for genre_list in movies_df['transformed_genres'] for genre in genre_list]
genre_counts = Counter(all_genres)

# (genre, count) pairs ordered by descending count.
sorted_genres = sorted(genre_counts.items(), key=lambda pair: pair[1], reverse=True)

for genre, count in sorted_genres:
    print(f"{genre}: {count}")

In [None]:
# Count how often each exact combination of genres occurs.
# (Counter is imported in the notebook's first cell.)
# Sorting the names makes a combination independent of the order in which a movie
# lists its genres; a tuple is used because Counter keys must be hashable.
genre_sets = movies_df['transformed_genres'].apply(lambda names: tuple(sorted(names)))
genre_set_counts = Counter(genre_sets)

# (combination, count) pairs ordered by descending count.
sorted_genre_set_counts = sorted(genre_set_counts.items(), key=lambda pair: pair[1], reverse=True)

# Only the ten most frequent combinations are printed.
print("Counts of Unique Genre Combinations:")
for genres, count in sorted_genre_set_counts[:10]:
    print(f"{genres}: {count}")

## Length exploration

In [None]:
# Character length of every overview, followed by summary statistics.
overview_as_text = movies_df['overview'].astype(str)
movies_df['overview_length'] = overview_as_text.map(len)

print("\nOverview length statistics:")
print(f"Average length: {movies_df['overview_length'].mean()}")
print(f"Minimum length: {movies_df['overview_length'].min()}")
print(f"Maximum length: {movies_df['overview_length'].max()}")

In [None]:
# Preview overviews that are at most 50 characters long.
is_short_overview = movies_df['overview_length'].le(50)
short_overview_movies = movies_df.loc[is_short_overview]

short_overview_movies['overview'].head()

In [None]:
# Keep only movies whose overview is longer than 50 characters, then renumber the index.
is_long_enough = movies_df['overview_length'].gt(50)
movies_df = movies_df.loc[is_long_enough].reset_index(drop=True)

In [None]:
# Whitespace-delimited word count of every overview, followed by summary statistics.
overview_as_text = movies_df['overview'].astype(str)
movies_df['overview_word_count'] = overview_as_text.map(lambda text: len(text.split()))

print("\nOverview word count statistics:")
print(f"Average word count: {movies_df['overview_word_count'].mean()}")
print(f"Minimum word count: {movies_df['overview_word_count'].min()}")
print(f"Maximum word count: {movies_df['overview_word_count'].max()}")


In [None]:
# Share of the remaining movies whose overview has at least 17 words.
total_movies = len(movies_df)

# Summing a boolean mask counts the rows where it is True.
has_17_words_or_more = movies_df['overview_word_count'].ge(17)
movies_with_17_words_or_more = int(has_17_words_or_more.sum())

percentage = (movies_with_17_words_or_more / total_movies) * 100

print(f"Percentage of movies with 17 words or more in overview: {percentage:.2f}%")

In [None]:
# Horizontal bar chart of how many movies carry each genre.
# (matplotlib.pyplot is imported as plt in the notebook's first cell.)
# The genre names are taken from the rows currently in movies_df, so the chart
# reflects the data as it stands when this cell runs, after all row filters.
genre_series = movies_df['transformed_genres'].explode(ignore_index=True)
# Ascending order puts the most frequent genre at the top of a horizontal bar chart.
genre_counts_sorted = genre_series.value_counts().sort_values(ascending=True)

plt.figure(figsize=(10, 8))
plt.barh(genre_counts_sorted.index, genre_counts_sorted.values, color='skyblue')
plt.xlabel('Number of Movies')
plt.ylabel('Genre')
plt.title('Movie Genre Distribution')
plt.show()


In [None]:
# Bar chart: how many movies have 1, 2, 3, ... genres.
# (matplotlib.pyplot is imported as plt in the notebook's first cell.)
genre_count_series = movies_df['genre_count']

# Movies per genre count, ordered by the genre count itself (the index).
genre_count_counts = genre_count_series.value_counts().sort_index()

fig, ax = plt.subplots(figsize=(10, 8))
ax.bar(genre_count_counts.index, genre_count_counts.values, color='skyblue')
ax.set_xlabel('Number of Genres')
ax.set_ylabel('Number of Movies')
ax.set_title('Number of Movies per Genre Count')
ax.set_xticks(genre_count_counts.index)  # one tick per observed genre count
ax.grid(axis='y')
plt.show()


In [None]:
# Same bar chart of movies per genre count, drawn with a logarithmic y-axis
# so that the small bars remain visible next to the large ones.
fig, ax = plt.subplots(figsize=(10, 8))
ax.bar(genre_count_counts.index, genre_count_counts.values, color='skyblue')
ax.set_xlabel('Number of Genres')
ax.set_ylabel('Number of Movies')
ax.set_title('Number of Movies per Genre Count')
ax.set_xticks(genre_count_counts.index)
ax.set_yscale('log')
ax.grid(axis='y')
plt.show()

In [None]:
# Print the number of movies for each genre count as plain numbers.
print(genre_count_counts)

In [None]:
# List every column currently in the DataFrame, one name per line.
for column_name in movies_df.columns.tolist():
    print(column_name)

In [None]:
# Columns retained in the exported dataset, in output order.
columns_to_keep = [
    'id',
    'imdb_id',
    'original_language',
    'overview',
    'poster_path',
    'title',
    'tagline',
    'vote_average',
    'vote_count',
    'transformed_genres',
    'genre_count',
    'overview_length',
    'overview_word_count',
]

# Drop every other column.
movies_df = movies_df.loc[:, columns_to_keep]

In [None]:
# Write the filtered dataset to CSV; index=False leaves the row index out of the file.
file_path = '/kaggle/working/movies_metadata_filtered.csv'
movies_df.to_csv(path_or_buf=file_path, index=False)