# Similarity calculation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import ast
from datasketch import MinHash, MinHashLSH

## Dataset Preprocessing (Paula's part)

Load the dataset that Raquel processed (joining a few different datasets and adding the sentiment analysis)

In [None]:
# Read the merged dataset once and keep a pristine copy next to the working
# frame, so later cells can transform `main_complete` without losing the
# original values.
main_complete_og = pd.read_csv('data/main_complete.csv')
main_complete = main_complete_og.copy()
main_complete.head()

To calculate the similarity we use minhashing, which requires the features to be tokenized first. We wanted to use several features for calculating the Jaccard similarity:
- genre
- cast
- crew
- sentiment
- platforms (one-hot encoded)
- production companies
- production countries
- IMDb votes
- IMDb rating

To tokenize the features, we created sets for each of them with all the strings of each category inside

### Genres

In [None]:
# Share of movies whose `genres` entry is missing
nulls = main_complete['genres'].isna().sum()
print(
    "This percentage of movies have no genre:",
    nulls / len(main_complete) * 100,
    "%",
)

In [None]:
# Tokenize `genres`: split the comma-separated string into a set of stripped
# genre names; missing values become an empty set.
# Empty tokens (e.g. produced by a trailing comma) are dropped so that a movie
# without any real genre ends up with an empty set -- the same clean-up that is
# applied to production_companies / production_countries.
main_complete['genres'] = main_complete['genres'].apply(
    lambda x: ({genre.strip() for genre in x.split(',')} - {''}) if pd.notnull(x) else set()
)

We also did a small analysis of the genres to understand what we were working with.

In [None]:
# Collect the distinct genres found in the per-movie genre sets.
# The loop variable is deliberately NOT called `list`: that name would shadow
# the builtin for every later cell of the notebook.
genres = set()
for genre_set in main_complete['genres']:
    # skip anything that is not a tokenized set (e.g. a missing value)
    if not isinstance(genre_set, set):
        continue
    # strip defensively in case a name still carries surrounding spaces
    genres.update(genre.strip() for genre in genre_set)
print("There are ", len(genres), "different genres in the dataset")
print("The genres are:", genres)

# Count how many movies carry each genre
genres_count = {genre: 0 for genre in genres}
for genre_set in main_complete['genres']:
    if not isinstance(genre_set, set):
        continue
    for genre in genre_set:
        genres_count[genre.strip()] += 1

# Bar chart of movies per genre. Bars are ordered by descending count (ties
# broken alphabetically) because set/dict iteration order over strings can
# change between kernel restarts, which would reshuffle the figure.
genres_sorted = sorted(genres_count.items(), key=lambda item: (-item[1], item[0]))
plt.figure(figsize=(20, 10))
plt.bar(
    [genre for genre, _ in genres_sorted],
    [count for _, count in genres_sorted],
)
plt.xticks(rotation=90)
plt.xlabel('Genres')
plt.ylabel('Number of movies')
plt.title('Number of movies per genre')
plt.show()

### Cast and crew

We reproduced the same procedure that we did for the genres. In this case, for the crew, we joined the columns of director and producers.

In [None]:
# Tokenize `cast`: split the comma-separated string into a set of stripped
# actor names; missing values become an empty set.
# Empty tokens are dropped so that a movie without any real cast member ends up
# with an empty set and is counted correctly by the "no cast" check -- the same
# clean-up that is applied to production_companies / production_countries.
main_complete['cast'] = main_complete['cast'].apply(
    lambda x: ({actor.strip() for actor in x.split(',')} - {''}) if pd.notnull(x) else set()
)

In [None]:
# Number and share of movies whose cast set is empty
nulls = main_complete['cast'].map(len).eq(0).sum()
print(
    "This percentage of movies have no cast:",
    nulls / len(main_complete) * 100,
    "%",
)

print(nulls)

In [None]:
# Build a single `crew` column from the director and producers columns.
columns_to_merge = ['director', 'producers']

# Tokenize each source column into a set of stripped names; missing values
# become an empty set and empty tokens (e.g. from a trailing comma) are dropped
# so they cannot make an empty crew look populated.
for column in columns_to_merge:
    main_complete[column] = main_complete[column].apply(
        lambda x: ({person.strip() for person in x.split(',')} - {''}) if pd.notnull(x) else set()
    )

# Union of both sets per movie. Zipping the two columns avoids a row-wise
# DataFrame.apply(axis=1), which builds a Series object for every row.
main_complete['crew'] = [
    directors | producers
    for directors, producers in zip(main_complete['director'], main_complete['producers'])
]

# Drop the individual columns (reassignment instead of inplace=True)
main_complete = main_complete.drop(columns=columns_to_merge)

In [None]:
# Share of movies whose crew set is empty
nulls = main_complete['crew'].map(len).eq(0).sum()
print(
    "This percentage of movies have no crew:",
    nulls / len(main_complete) * 100,
    "%",
)

### Production companies and countries

The same procedure was followed with these two features

In [None]:
# Tokenize `production_companies` in one pass: comma-separated string -> set of
# stripped company names without the empty-string token; missing -> empty set.
main_complete['production_companies'] = main_complete['production_companies'].map(
    lambda raw: ({name.strip() for name in raw.split(',')} - {''}) if pd.notnull(raw) else set()
)

In [None]:
# Share of movies whose production_companies set is empty
nulls = main_complete['production_companies'].map(len).eq(0).sum()
print(
    "This percentage of movies have no production companies:",
    nulls / len(main_complete) * 100,
)

In [None]:
# Tokenize `production_countries` in one pass: comma-separated string -> set of
# stripped country names without the empty-string token; missing -> empty set.
main_complete['production_countries'] = main_complete['production_countries'].map(
    lambda raw: ({name.strip() for name in raw.split(',')} - {''}) if pd.notnull(raw) else set()
)

In [None]:
# Share of movies whose production_countries set is empty
nulls = main_complete['production_countries'].map(len).eq(0).sum()
print(
    "This percentage of movies have no production countries:",
    nulls / len(main_complete) * 100,
)

### Sentiment analysis

After calculating the sentiment for each movie, to take it into account for the minhashing, we decided to create three categories depending on the values. This was done because, for this method, integers don't work as well as strings (TO REVIEW).

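To find the thresholds we used the mean and the standard deviation of the sentiment values: scores more than one standard deviation below the mean are labelled "low", scores more than one standard deviation above the mean are labelled "high", and everything in between is labelled "medium".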

In [None]:
# Summary statistics of the raw sentiment score
sentiment_min, sentiment_max = main_complete['sentiment'].min(), main_complete['sentiment'].max()
sentiment_mean, sentiment_std = main_complete['sentiment'].mean(), main_complete['sentiment'].std()
sentiment_mid = main_complete['sentiment'].median()

print(
    f"Range of sentiment values: {sentiment_min} to {sentiment_max} "
    f"with mean {sentiment_mean} and standard deviation {sentiment_std} "
    f"and median {sentiment_mid}"
)

In [None]:
# Distribution of the sentiment score with its mean marked.
# The working frame in this notebook is `main_complete`; no `df` is defined, so
# referring to `df` raises NameError on a fresh kernel.
# Missing scores are dropped before plotting so they cannot interfere with the
# bin computation (the mean already ignores them).
sentiment_values = main_complete['sentiment'].dropna()
plt.hist(sentiment_values, bins=20)
plt.axvline(sentiment_values.mean(), color='r', linestyle='dashed', linewidth=1, label='Mean Sentiment')
plt.xlabel('Sentiment')
plt.ylabel('Number of movies')
plt.title('Distribution of sentiment values')
plt.legend()
plt.show()

In [None]:
# Category boundaries: one standard deviation either side of the mean sentiment
low_threshold, high_threshold = sentiment_mean - sentiment_std, sentiment_mean + sentiment_std


def categorize_sentiment(value):
    """Map a sentiment score to 'low', 'medium' or 'high'.

    Scores strictly above `high_threshold` are 'high', scores strictly below
    `low_threshold` are 'low', and everything else is 'medium'.

    NOTE(review): a NaN score fails both comparisons and therefore also comes
    out as 'medium' -- confirm that this is intended.
    """
    if value > high_threshold:
        return 'high'
    if value < low_threshold:
        return 'low'
    return 'medium'


# Label every movie with its sentiment category
main_complete['sentiment_category'] = main_complete['sentiment'].map(categorize_sentiment)

# Show the score next to the derived category
main_complete[['title', 'sentiment', 'sentiment_category']].head()

### Dataset cleaning

Finally, before doing the minhashing, some movies were removed from the dataset, either because 4 or more of the features of interest were empty or because their status was not "Released".

In [None]:
# Remove movies for which 4 or more of the features of interest are empty
columns = ['genres', 'cast', 'crew', 'production_companies', 'production_countries', 'sentiment_category']

initial_count = len(main_complete)

# For every movie, count how many of `columns` hold a value of length 0 and
# keep only the rows where fewer than 4 of them are empty.
main_complete = main_complete[
    sum(main_complete[col].map(len).eq(0) for col in columns) < 4
]
final_count = len(main_complete)

print("Number of movies removed: ", initial_count - final_count, "from a total of ", initial_count)
print("Percentage of movies removed: ", (initial_count - final_count) / initial_count * 100)

In [None]:
# Keep only the movies whose 'status' equals 'Released' and report how many were dropped
initial_count = len(main_complete)
main_complete = main_complete.loc[main_complete['status'].eq('Released')]
final_count = len(main_complete)

print("Number of movies removed: ", initial_count - final_count, "from a total of ", initial_count)
print("Percentage of movies removed: ", (initial_count - final_count) / initial_count * 100)

## Minhashing

Load the small dataset that is the result after cleaning and processing it.

In [2]:
# Load the small dataset from its CSV file into `data`.
# NOTE(review): set-valued columns such as `genres` presumably come back from
# the CSV as plain strings and need to be parsed (e.g. with ast.literal_eval)
# before they can be used as sets -- confirm against the cells that follow.
data = pd.read_csv('data/small_main_complete.csv')

In [3]:
# Preview the first rows to check which columns were loaded
data.head()

Unnamed: 0,id,title,genres,sentiment_category,Netflix,Amazon,Hulu,Apple,HBO,crew,production_companies,production_countries,imdb_votes,imdb_rating
0,2,Ariel,"{'Romance', 'Crime', 'Comedy', 'Drama'}",medium,0.0,1.0,1.0,0.0,0.0,{'Aki Kaurismäki'},{'Villealfa Filmproductions'},{'Finland'},8735.0,7.4
1,3,Shadows in Paradise,"{'Romance', 'Comedy', 'Drama'}",medium,0.0,1.0,1.0,0.0,0.0,"{'Mika Kaurismäki', 'Aki Kaurismäki'}",{'Villealfa Filmproductions'},{'Finland'},7484.0,7.5
2,5,Four Rooms,{'Comedy'},medium,0.0,1.0,0.0,1.0,0.0,"{'Robert Rodriguez', 'Alexandre Rockwell', 'Al...","{'Miramax', 'A Band Apart'}",{'United States of America'},112484.0,6.7
3,6,Judgment Night,"{'Crime', 'Action', 'Thriller'}",medium,0.0,1.0,0.0,0.0,0.0,"{'Marilyn Vance', 'Gene Levy', 'Stephen Hopkin...","{'JVC', 'Largo Entertainment', 'Universal Pict...",{'United States of America'},19268.0,6.6
4,8,Life in Loops (A Megacities RMX),{'Documentary'},high,0.0,0.0,0.0,0.0,0.0,"{'Ulrich Gehmacher', 'Timo Novotny'}",{'inLoops'},{'Austria'},284.0,8.2
