# Match the CSV titles in order to eradicate any duplicate values and to narrow down our dataset

# In [None]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Locations of the three scraped movie CSVs; point these at your own files
csv_file1 = 'ScrapedCSVs/top_100_movies.csv'
csv_file2 = 'ScrapeLOC/movies.csv'
csv_file3 = 'ScrapeRottenTomatoes/moviesRT.csv'

# Load each CSV into its own DataFrame, in the same order as the paths above
df1, df2, df3 = [
    pd.read_csv(csv_path) for csv_path in (csv_file1, csv_file2, csv_file3)
]

# Function to find fuzzy matches using FuzzyWuzzy
def fuzzy_match(movie_title, titles_list):
    """Return the entry of ``titles_list`` that best matches ``movie_title``.

    Candidates are scored through ``process.extractOne`` with its default
    scorer.

    Args:
        movie_title: Title to look up. Non-string values (for example the NaN
            pandas uses for an empty cell) are returned unchanged, because
            FuzzyWuzzy can only process strings.
        titles_list: Candidate titles to choose from.

    Returns:
        The best-scoring candidate, or ``movie_title`` itself when it is not a
        string or when no candidate is available.
    """
    if not isinstance(movie_title, str):
        return movie_title
    # extractOne yields None rather than a (title, score) tuple when there is
    # nothing to choose from, so it must not be indexed blindly.
    best = process.extractOne(movie_title, titles_list)
    return movie_title if best is None else best[0]

# Build the candidate pool from the titles of all three DataFrames. Rows with a
# missing title are skipped because FuzzyWuzzy can only score strings, and
# repeats are dropped (keeping first-seen order) so no title is scored twice.
title_pool = pd.concat([df1['Title'], df2['Title'], df3['Title']], ignore_index=True)
all_movie_titles = list(dict.fromkeys(title_pool.dropna().astype(str)))

# Find the best fuzzy match for every titled row of each DataFrame; rows without
# a title get a missing Matched_Title instead of aborting the whole run.
# NOTE(review): every title is itself part of the candidate pool, so its best
# match is normally itself or an equally-scoring variant - confirm this is the
# intended behaviour before relying on Matched_Title for de-duplication.
for frame in (df1, df2, df3):
    titled = frame.loc[frame['Title'].notna(), 'Title'].astype(str)
    frame['Matched_Title'] = titled.apply(fuzzy_match, args=(all_movie_titles,))

# Stack the three DataFrames (each now carrying Matched_Title) into one table
combined_df = pd.concat([df1, df2, df3], ignore_index=True)

# Write the combined DataFrame to a new CSV file
output_csv = 'combined_movies_fuzzy.csv'
combined_df.to_csv(output_csv, index=False)

print(f"Fuzzy data matching completed. The combined data is saved to {output_csv}.")

# In [None]:
import pandas as pd
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Locations of the three scraped movie CSVs; point these at your own files
csv_file1 = 'ScrapedCSVs/top_100_movies.csv'
csv_file2 = 'ScrapeLOC/movies.csv'
csv_file3 = 'ScrapeRottenTomatoes/moviesRT.csv'

# Load each CSV into its own DataFrame, in the same order as the paths above
df1, df2, df3 = [
    pd.read_csv(csv_path) for csv_path in (csv_file1, csv_file2, csv_file3)
]

# Function to find fuzzy matches using FuzzyWuzzy's token-sort ratio.
# NOTE: despite the "jaccard" in the name (kept so existing callers still work),
# the scorer is fuzz.token_sort_ratio, not a Jaccard set similarity.
def fuzzy_match_jaccard(movie_title, titles_list):
    """Return the entry of ``titles_list`` that best matches ``movie_title``.

    Candidates are scored through ``process.extractOne`` using
    ``fuzz.token_sort_ratio`` as the scorer.

    Args:
        movie_title: Title to look up. Non-string values (for example the NaN
            pandas uses for an empty cell) are returned unchanged, because
            FuzzyWuzzy can only process strings.
        titles_list: Candidate titles to choose from.

    Returns:
        The best-scoring candidate, or ``movie_title`` itself when it is not a
        string or when no candidate is available.
    """
    if not isinstance(movie_title, str):
        return movie_title
    # extractOne yields None rather than a (title, score) tuple when there is
    # nothing to choose from, so it must not be indexed blindly.
    best = process.extractOne(movie_title, titles_list, scorer=fuzz.token_sort_ratio)
    return movie_title if best is None else best[0]

# Build the candidate pool from the titles of all three DataFrames. Rows with a
# missing title are skipped because FuzzyWuzzy can only score strings, and
# repeats are dropped (keeping first-seen order) so no title is scored twice.
title_pool = pd.concat([df1['Title'], df2['Title'], df3['Title']], ignore_index=True)
all_movie_titles = list(dict.fromkeys(title_pool.dropna().astype(str)))

# Find the best token-sort match for every titled row of each DataFrame; rows
# without a title get a missing Matched_Title instead of aborting the whole run.
# NOTE(review): every title is itself part of the candidate pool, so its best
# match is normally itself or an equally-scoring variant - confirm this is the
# intended behaviour before relying on Matched_Title for de-duplication.
for frame in (df1, df2, df3):
    titled = frame.loc[frame['Title'].notna(), 'Title'].astype(str)
    frame['Matched_Title'] = titled.apply(fuzzy_match_jaccard, args=(all_movie_titles,))

# Stack the three DataFrames (each now carrying Matched_Title) into one table
combined_df = pd.concat([df1, df2, df3], ignore_index=True)

# Write the combined DataFrame to a new CSV file
output_csv = 'combined_movies_jaccard.csv'
combined_df.to_csv(output_csv, index=False)

print(f"Jaccard similarity matching completed. The combined data is saved to {output_csv}.")