In [None]:
import json
import re

import pandas as pd

# Audit the previously merged file: 'Title' holds the user's movie title and
# 'title' holds the title it was paired with; flag rows where they disagree.
df = pd.read_csv('The fittest about my owl.csv')
print(f"Total records: {len(df)}")


def _normalized(value):
    """Return *value* as a lowercased, stripped string, or '' when missing."""
    if pd.notna(value):
        return str(value).lower().strip()
    return ''


mismatched = []
for idx, row in df.iterrows():
    title1 = _normalized(row['Title'])
    title2 = _normalized(row['title'])

    # Rows with a missing title, or with identical titles, are not mismatches.
    if not title1 or not title2 or title1 == title2:
        continue

    # Word-overlap ratio relative to the longer of the two titles.
    words1, words2 = set(title1.split()), set(title2.split())
    similarity = len(words1 & words2) / max(len(words1), len(words2))
    if similarity >= 0.5:
        continue

    mismatched.append(dict(
        Row=idx + 1,
        Your_Movie=row['Title'],
        TMDB_Movie=row['title'],
        Similarity=similarity,
    ))

# Report the total and preview the first five offenders.
print(f"Found {len(mismatched)} mismatched movies:")
for m in mismatched[:5]:
    print(f"Row {m['Row']}: {m['Your_Movie']} → {m['TMDB_Movie']} (Similarity: {m['Similarity']:.2f})")

# Source datasets for re-matching: the basic movie list is read as a
# tab-separated, UTF-16 encoded text file; the TMDB file is read with
# read_csv defaults.
df_basic = pd.read_csv('movie.data.txt', sep='\t', encoding='utf-16')
df_tmdb = pd.read_csv('tmdb_5000_movies.csv')

# Apostrophes are dropped (so "Schindler's" -> "schindlers"); every other
# punctuation mark acts as a word separator.
_APOSTROPHES = re.compile(r"['\u2019]")
_WORD = re.compile(r"\w+")


def _title_tokens(text):
    """Return the set of lowercase word tokens in *text*, ignoring punctuation."""
    return set(_WORD.findall(_APOSTROPHES.sub("", text.lower())))


def find_correct_match(target_title, tmdb_df):
    """Find the row of *tmdb_df* whose title best matches *target_title*.

    Each row's 'title' and 'original_title' values (whichever columns exist
    and are non-null) are compared case-insensitively against the target.
    An exact string match returns immediately with score 1.0. Otherwise the
    score is the number of shared word tokens divided by the token count of
    the longer title, and the first row reaching the highest score wins.

    Tokens are compared with punctuation removed, so "The Godfather: Part II"
    and "The Godfather Part II" share all four words instead of three.

    Args:
        target_title: Title to look up; NaN/None or blank yields no match.
        tmdb_df: DataFrame searched via its 'title'/'original_title' columns.

    Returns:
        A ``(row, score)`` tuple where ``row`` is the matching row as a
        Series, or ``(None, 0)`` when no candidate shares any word.
    """
    # A missing title would otherwise be stringified and matched as "nan".
    if pd.isna(target_title):
        return None, 0

    target_lower = str(target_title).lower().strip()
    if not target_lower:
        return None, 0

    # Loop-invariant: tokenize the target once, not once per candidate.
    target_words = _title_tokens(target_lower)

    title_cols = [col for col in ('title', 'original_title')
                  if col in tmdb_df.columns]

    best_pos = None
    best_score = 0

    # Walk only the title columns; building a full row Series for every
    # record (as iterrows does) is needed only for the row that is returned.
    for pos, candidates in enumerate(zip(*(tmdb_df[col] for col in title_cols))):
        for raw_title in candidates:
            if pd.isna(raw_title):
                continue
            tmdb_title = str(raw_title).lower().strip()

            if tmdb_title == target_lower:
                return tmdb_df.iloc[pos], 1.0

            tmdb_words = _title_tokens(tmdb_title)
            common = target_words & tmdb_words
            if common:
                score = len(common) / max(len(target_words), len(tmdb_words))
                # Strict '>' keeps the earliest row on ties.
                if score > best_score:
                    best_score = score
                    best_pos = pos

    if best_pos is None:
        return None, best_score
    return tmdb_df.iloc[best_pos], best_score

print("Creating correct merged dataset...")

# Output column -> source column in the basic dataset.
_BASIC_COLUMNS = {
    'Movie #': 'Movie #',
    'Title_basic': 'Title',
    'Year_basic': 'Year',
    'Duration_basic': 'Duration',
    'Directors_basic': 'Directors',
    'Actors_basic': 'Actors',
    'Genres_basic': 'Genres',
    'Plot_Summary': 'Plot Summary',
}
# Output column -> field of the matched TMDB row ('' when the field is absent).
_TMDB_COLUMNS = {
    'Title_TMDB': 'title',
    'Release_Date': 'release_date',
    'Budget': 'budget',
    'Revenue': 'revenue',
    'Runtime': 'runtime',
    'Vote_Average': 'vote_average',
    'Vote_Count': 'vote_count',
    'Overview': 'overview',
}

correct_matches = []
for idx, row in df_basic.iterrows():
    movie_title = row['Title']
    print(f"Matching: {movie_title}")

    match, score = find_correct_match(movie_title, df_tmdb)

    # Only candidates scoring above 0.6 are accepted into the merged output.
    if match is None or not score > 0.6:
        print(f"No match found (Best similarity: {score:.2f})")
        continue

    merged_record = {out: row[src] for out, src in _BASIC_COLUMNS.items()}
    merged_record.update(
        (out, match.get(src, '')) for out, src in _TMDB_COLUMNS.items()
    )
    merged_record['Match_Score'] = score
    # Scores in (0.6, 0.7] are kept but labelled as partial matches.
    merged_record['Match_Status'] = 'Matched' if score > 0.7 else 'Partial'

    correct_matches.append(merged_record)
    print(f"Found: {match.get('title', 'N/A')} (Similarity: {score:.2f})")

if not correct_matches:
    print("No sufficient matches found")
else:
    df_correct = pd.DataFrame(correct_matches)
    # utf-8-sig writes a BOM at the start of the CSV.
    df_correct.to_csv('correctly_matched_movies.csv', index=False, encoding='utf-8-sig')
    print(f"Saved {len(correct_matches)} correctly matched movies to: correctly_matched_movies.csv")

Total records: 20
Found 19 mismatched movies:
Row 1: The Shawshank Redemption → The Dark Knight (Similarity: 0.33)
Row 2: The Godfather → Inception (Similarity: 0.00)
Row 3: The Dark Knight → The Lord of the Rings: The Fellowship of the Ring (Similarity: 0.17)
Row 4: The Godfather Part II → The Lord of the Rings: The Return of the King (Similarity: 0.17)
Row 5: 12 Angry Men → Seventh Son (Similarity: 0.00)
Creating correct merged dataset...
Matching: The Shawshank Redemption
Found: The Shawshank Redemption (Similarity: 1.00)
Matching: The Godfather
Found: The Godfather (Similarity: 1.00)
Matching: The Dark Knight
Found: The Dark Knight (Similarity: 1.00)
Matching: The Godfather Part II
Found: The Hangover Part II (Similarity: 0.75)
Matching: 12 Angry Men
Found: 12 Angry Men (Similarity: 1.00)
Matching: Schindler's List
Found: Schindler's List (Similarity: 1.00)
Matching: The Lord of the Rings: The Return of the King
Found: The Lord of the Rings: The Return of the King (Similarity: 1.00