In [2]:
import os
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import seaborn as sns
from collections import Counter, defaultdict
import community.community_louvain as community_louvain
import numpy as np
import re

In [3]:
# Root of the project checkout. Defaults to the original author's location;
# set the SOCIALGRAPHS_ROOT environment variable to run the notebook on another
# machine without editing this cell.
PROJECT_ROOT = os.environ.get(
    "SOCIALGRAPHS_ROOT",
    "C:/Users/nerea/Documents/MasterDTU/SocialGraphs_fall24/Projects/socialGraphs_fall24",
)

# Directory whose files are parsed for synopses below (file name = movie name).
dir_path = f"{PROJECT_ROOT}/net_movies_info/net_movies_info"
# Keyword table; it is indexed by its 'Keyword' column further down.
csv = f"{PROJECT_ROOT}/FinalProject/csv/unified_dictionary_percent.csv"
df = pd.read_csv(csv)

In [4]:
def extract_synopsis(file_path):
    """Return the synopsis text stored in the file at ``file_path``.

    The file is read as UTF-8 and searched for ``Synopsis:`` followed by
    optional whitespace and a bracketed section. The text between ``[`` and
    the first following ``]`` is returned; it may span several lines.

    Returns:
        The captured synopsis, or an empty string when no such section exists.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        raw_text = handle.read()

    # Non-greedy capture: stops at the first closing bracket. DOTALL lets
    # the synopsis contain newlines.
    found = re.search(r'Synopsis:\s*\[(.*?)\]', raw_text, re.DOTALL)
    return found.group(1) if found else ""

# Names of every entry found directly under dir_path.
files = os.listdir(dir_path)

# Build {movie name: raw synopsis}. The movie name is the file name with its
# extension stripped.
synopses = {}
for file in files:
    file_path = os.path.join(dir_path, file)
    if not os.path.isfile(file_path):
        continue  # ignore sub-directories and other non-file entries
    synopsis = extract_synopsis(file_path)
    movie_name = os.path.splitext(file)[0]
    synopses[movie_name] = synopsis

def clean_text(text):
    """Normalise ``text`` for keyword matching.

    The text is lower-cased and every character that is neither a word
    character (letters, digits, underscore) nor whitespace is dropped.
    Whitespace, including newlines and tabs, is kept as is.
    """
    lowered = text.lower()
    return re.sub(r'[^\w\s]', '', lowered)

# Clean all synopses
# Each value is overwritten with its cleaned form. No key is added or removed,
# so assigning into the dict while iterating over items() is safe.
for movie, synopsis in synopses.items():
    synopses[movie] = clean_text(synopsis)

In [5]:
def calculate_genre_percentages(synopses, keyword_genre_mapping):
    """Compute each genre's percentage share for ONE synopsis.

    Args:
        synopses: Text of a single synopsis. The plural parameter name is
            kept only so existing callers keep working; this is not the dict
            of all synopses.
        keyword_genre_mapping: ``{keyword: {genre: weight}}``. Every
            occurrence of a keyword in the synopsis adds that keyword's
            weights to the matching genres (repeated words count each time).

    Returns:
        A list of ``(genre, percentage)`` tuples, in the order the genres were
        first encountered, where ``percentage`` is the genre's accumulated
        weight divided by the total accumulated weight, times 100. The list
        is empty when no keyword matched or when the total weight is 0.
    """
    # Use the argument. The previous body read a notebook-global `synopsis`,
    # which only worked when the caller's loop variable had that exact name.
    words = clean_text(synopses).split()  # all words, duplicates included

    genre_counts = Counter()
    for word in words:
        for genre, value in keyword_genre_mapping.get(word, {}).items():
            genre_counts[genre] += value

    total = sum(genre_counts.values())
    if not total:
        # Nothing matched (or every matched weight was 0): there is no share to
        # report, and dividing would raise ZeroDivisionError.
        return []

    return [(genre, (count / total) * 100) for genre, count in genre_counts.items()]

In [6]:
# Convert the DataFrame into a dictionary for quick lookups.
# Shape: {keyword: {column name: value}} for every column other than 'Keyword';
# calculate_genre_percentages treats each inner dict as genre -> weight.
keyword_genre_mapping = df.set_index('Keyword').to_dict(orient='index')

# Result of calculate_genre_percentages for every movie, keyed by movie name.
movie_genre_percentages = {}

for movie_name, synopsis in synopses.items():
    genre_percentages = calculate_genre_percentages(synopsis, keyword_genre_mapping)
    movie_genre_percentages[movie_name] = genre_percentages

In [7]:
def save_csv_percentages(movie_genre_percentages, output_file):
    """Write a one-row-per-movie genre summary to ``output_file`` as CSV.

    Args:
        movie_genre_percentages: Dict mapping a movie name to a sequence of
            ``(genre, percentage)`` pairs.
        output_file: Destination path of the CSV (written without the index).

    The CSV has the columns "Movie", "Top Genre", "Top Percentage" and
    "All Genres". "All Genres" lists every pair as ``genre: xx.xx%`` from the
    highest to the lowest percentage. A movie without any pair gets ``None``,
    ``0`` and an empty string. A confirmation line is printed when done.
    """
    rows = []
    for title, shares in movie_genre_percentages.items():
        # Highest percentage first; equal percentages keep their incoming order.
        ranked = sorted(shares, key=lambda pair: pair[1], reverse=True)

        if ranked:
            top_genre, top_share = ranked[0][0], ranked[0][1]
        else:
            top_genre, top_share = None, 0

        breakdown = ", ".join(
            f"{genre}: {percentage:.2f}%" for genre, percentage in ranked
        )
        rows.append({
            "Movie": title,
            "Top Genre": top_genre,
            "Top Percentage": top_share,
            "All Genres": breakdown,
        })

    pd.DataFrame(rows).to_csv(output_file, index=False)
    print(f"Genre percentages saved to {output_file}")
    

In [8]:
# Destination CSV for these results; the relative path resolves against the
# current working directory.
output_file = "movie_genre_percentages_top_prueba.csv"

# Show the container type as a quick sanity check, then export.
print(type(movie_genre_percentages))
save_csv_percentages(movie_genre_percentages, output_file)

<class 'dict'>
Genre percentages saved to movie_genre_percentages_top_prueba.csv


BOTTOM MOVIES

In [11]:
# Root of the project checkout. Defaults to the original author's location;
# set the SOCIALGRAPHS_ROOT environment variable to run this cell on another
# machine without editing it.
PROJECT_ROOT = os.environ.get(
    "SOCIALGRAPHS_ROOT",
    "C:/Users/nerea/Documents/MasterDTU/SocialGraphs_fall24/Projects/socialGraphs_fall24",
)

# Directory whose files are parsed for synopses below (file name = movie name).
dir_path = f"{PROJECT_ROOT}/bottom_net_movies_info"
# Keyword table; it is indexed by its 'Keyword' column further down.
csv = f"{PROJECT_ROOT}/FinalProject/csv/unified_dictionary_percent.csv"
df = pd.read_csv(csv)

# Names of every entry found directly under dir_path.
files = os.listdir(dir_path)

# Build {movie name: raw synopsis}. The movie name is the file name with its
# extension stripped.
synopses = {}
for file in files:
    file_path = os.path.join(dir_path, file)
    if not os.path.isfile(file_path):
        continue  # ignore sub-directories and other non-file entries
    synopsis = extract_synopsis(file_path)
    movie_name = os.path.splitext(file)[0]
    synopses[movie_name] = synopsis

# NOTE(review): this repeats the clean_text defined in an earlier cell; the
# later definition shadows the earlier one, so the two must stay in sync.
def clean_text(text):
    """Normalise ``text`` for keyword matching.

    The text is lower-cased and every character that is neither a word
    character (letters, digits, underscore) nor whitespace is dropped.
    Whitespace, including newlines and tabs, is kept as is.
    """
    lowered = text.lower()
    return re.sub(r'[^\w\s]', '', lowered)

# Clean all synopses
# Each value is overwritten with its cleaned form. No key is added or removed,
# so assigning into the dict while iterating over items() is safe.
for movie, synopsis in synopses.items():
    synopses[movie] = clean_text(synopsis)

# Convert the DataFrame into a dictionary for quick lookups.
# Shape: {keyword: {column name: value}} for every column other than 'Keyword';
# calculate_genre_percentages treats each inner dict as genre -> weight.
keyword_genre_mapping = df.set_index('Keyword').to_dict(orient='index')

# Result of calculate_genre_percentages for every movie, keyed by movie name.
# Rebinding the name here discards whatever dict it referred to before.
movie_genre_percentages = {}

for movie_name, synopsis in synopses.items():
    genre_percentages = calculate_genre_percentages(synopsis, keyword_genre_mapping)
    movie_genre_percentages[movie_name] = genre_percentages

# Export the results computed in this cell. The relative path resolves against
# the current working directory.
output_file = "movie_genre_percentages_bottom_prueba.csv"
save_csv_percentages(movie_genre_percentages, output_file)

Genre percentages saved to movie_genre_percentages_bottom_prueba.csv
