In [None]:
import pandas as pd
import numpy as np
import random
import nltk
import re
from tqdm.notebook import tqdm
import requests
from bs4 import BeautifulSoup
import time
from difflib import SequenceMatcher

In [None]:
# Fetch the actual lyrics of the new album, one AZLyrics page per song.
songs = ['The Fate Of Ophelia', 'Elizabeth Taylor', 'Opalite', 'Father Figure', 'Eldest Daughter', 'Ruin The Friendship', 
        'Actually Romantic', 'wihlit', 'Wood', 'CANCELLED', 'Honey', 'The Life Of A Showgirl']

# Maps song title -> lyrics text (one lyric line per '\n'); songs that fail to load are simply absent.
lyrics = {}

for position, song in enumerate(tqdm(songs, desc="Fetching lyrics")):
    # Pause between consecutive requests regardless of whether the previous one
    # succeeded, so failures do not turn into rapid-fire requests; no pause is
    # needed before the first request or after the last one.
    if position > 0:
        time.sleep(np.random.uniform(15, 25))

    # The page name is the title lowercased with the spaces removed.
    s_lower = song.replace(" ", "").lower()
    page_url = f'https://www.azlyrics.com/lyrics/taylorswift/{s_lower}.html'
    try:
        # A timeout keeps a stalled connection from hanging the whole cell.
        page = requests.get(page_url, timeout=30)
        # Report HTTP errors (404, rate limiting, ...) explicitly instead of
        # trying to parse an error page as if it were a lyrics page.
        page.raise_for_status()
        html = BeautifulSoup(page.text, "html.parser")

        # The lyrics are taken from the first <div> after the first <b> that
        # follows the "ringtone" <div>; any missing link in that chain means the
        # page does not have the expected layout.
        ringtone_div = html.find('div', class_='ringtone')
        song_title_tag = ringtone_div.find_next('b') if ringtone_div is not None else None
        lyrics_div = song_title_tag.find_next('div') if song_title_tag is not None else None
        if lyrics_div is None:
            print(f'Lyrics not found for {song}')
            continue

        # Collapse runs of blank lines so each lyric line sits on its own line.
        lyric = re.sub(r'\n+', '\n', lyrics_div.get_text(separator="\n").strip()).strip()

        lyrics[song] = lyric
    except Exception as e:
        # Best effort: report the failure and move on to the next song.
        print(f'Error for {song}: {e}')

Fetching lyrics:   0%|          | 0/12 [00:00<?, ?it/s]

In [15]:
# Predicted phrases from the first approach; they are compared against the
# fetched lyrics further down.
phrases_approach_1 = [
    "And you come up and I'm just like",
    "People went down like",
    "Like I play instruments",
    "It's just like it's just like, it's about it",
    "But it's like I just didn't know this",
    "He's just like he's just a maniac",
    "I just love it",
    "And he's like, I got you",
    "So it's like, we're going, we're going out.",
    "And you know what?",
]

# Predicted phrases from the second approach.
phrases_approach_2 = [
    "But you did it tonight",
    "So we got all...",
    "Do you know what I mean?",
    "You won't be able to get to bed till four in the morning after this.",
    "The Life of a Showgirl featuring Sabrina Carpenter.",
    "This represents the end of my night.",
    "Life is more upbeat.",
    "And there's, there's a poem in this.",
    "And my day ends with me in a bathtub, not usually in a bedazzled dress.",
    "You got two more in a row.",
    "Track seven, Actually Romantic.",
]

In [None]:
def normalize_text(text):
    """Normalize text for comparison.

    Lowercases the text, removes punctuation (every character that is neither
    a word character nor whitespace), collapses runs of spaces/tabs into a
    single space and strips leading/trailing whitespace. Newlines are kept, so
    multi-line input retains its line structure.

    Args:
        text: String to normalize.

    Returns:
        The normalized string.
    """
    without_punctuation = re.sub(r'[^\w\s]', '', text.lower())
    # Dropping punctuation that sat between spaces (e.g. " - ") leaves doubled
    # spaces behind, which would defeat substring comparisons against a phrase
    # written with single spaces. Collapse horizontal whitespace only.
    return re.sub(r'[^\S\n]+', ' ', without_punctuation).strip()

def _line_similarity(phrase_norm, phrase_words, line_norm):
    """Return a 0..1 similarity score between a normalized phrase and a normalized lyric line."""
    if len(phrase_words) >= 3:
        shared_words = set(phrase_words) & set(line_norm.split())
        # When at least 60% of the phrase's words occur in the line, score by
        # word overlap (order-insensitive).
        if len(shared_words) >= len(phrase_words) * 0.6:
            return len(shared_words) / len(phrase_words)
    # Short phrases, or too little word overlap: character-level similarity.
    return SequenceMatcher(None, phrase_norm, line_norm).ratio()


def check_phrases_in_lyrics_improved(lyrics_dict, phrases_to_check, threshold=0.7):
    """
    Check if predicted phrases appear in actual lyrics.

    Each phrase is compared line by line against every song. A phrase that
    occurs inside a line on whole-word boundaries is an exact match; otherwise
    the most similar line whose score reaches ``threshold`` is reported as a
    partial match. A progress line is printed for every phrase.

    Args:
        lyrics_dict: Dictionary mapping song title -> lyrics text with one
            lyric line per '\\n'. Songs with empty lyrics are skipped.
        phrases_to_check: List of predicted phrases
        threshold: Minimum similarity score for partial matches

    Returns:
        Dict with three lists:
            'exact_matches':   dicts with 'phrase', 'song', 'line', 'match_type'
            'partial_matches': dicts with 'phrase', 'song', 'line', 'similarity',
                               'match_type'
            'no_matches':      the phrases (strings) that matched nothing
    """

    results = {
        'exact_matches': [],
        'partial_matches': [],
        'no_matches': []
    }

    # Normalize every lyric line once instead of once per phrase. Whitespace is
    # collapsed so that removed punctuation cannot leave doubled spaces that
    # would break the comparisons below.
    prepared_songs = []
    for song_title, song_lyrics in lyrics_dict.items():
        if not song_lyrics:
            continue
        prepared_lines = [
            (line.strip(), ' '.join(normalize_text(line).split()))
            for line in song_lyrics.split('\n')
        ]
        prepared_songs.append((song_title, prepared_lines))

    for phrase in phrases_to_check:
        phrase_words = normalize_text(phrase).split()
        phrase_norm = ' '.join(phrase_words)

        if not phrase_norm:
            # A phrase with no word characters normalizes to '', which is a
            # substring of every line and would be reported as an exact match.
            results['no_matches'].append(phrase)
            print(f"❌ NOT FOUND: '{phrase}'")
            continue

        # Padding both sides with spaces makes the containment test respect
        # word boundaries ("know what" must not match "know whatever").
        padded_phrase = f" {phrase_norm} "
        exact_match = None
        best_match = None
        best_similarity = 0

        for song_title, prepared_lines in prepared_songs:
            for raw_line, line_norm in prepared_lines:
                # Exact match: the whole phrase occurs in this line
                if padded_phrase in f" {line_norm} ":
                    exact_match = {
                        'phrase': phrase,
                        'song': song_title,
                        'line': raw_line,
                        'match_type': 'exact'
                    }
                    break

                similarity = _line_similarity(phrase_norm, phrase_words, line_norm)

                # Keep the best partial candidate seen so far across all songs
                if similarity > best_similarity and similarity >= threshold:
                    best_similarity = similarity
                    best_match = {
                        'phrase': phrase,
                        'song': song_title,
                        'line': raw_line,
                        'similarity': similarity,
                        'match_type': 'partial'
                    }

            # The first exact match wins; no need to scan further songs
            if exact_match is not None:
                break

        if exact_match is not None:
            results['exact_matches'].append(exact_match)
            print(f"✅ EXACT: '{phrase}' in '{exact_match['song']}'")
            print(f"   Line: '{exact_match['line'][:60]}...'")
        elif best_match:
            results['partial_matches'].append(best_match)
            print(f"🔄 PARTIAL ({best_match['similarity']:.1%}): '{phrase}'")
            print(f"   Best match in '{best_match['song']}': '{best_match['line'][:60]}...'")
        else:
            results['no_matches'].append(phrase)
            print(f"❌ NOT FOUND: '{phrase}'")

    return results


def find_phrase_fragments(lyrics_dict, phrases_to_check, min_words=2):
    """
    Search the lyrics for fragments of phrases and print what was found.

    A fragment is any contiguous run of at least ``min_words`` words of a
    normalized phrase. A fragment counts as found in a song when it occurs
    within a single lyric line on whole-word boundaries; for each fragment only
    the first song containing it is reported.

    Args:
        lyrics_dict: Dictionary mapping song title -> lyrics text with one
            lyric line per '\\n'. Songs with empty lyrics are skipped.
        phrases_to_check: List of phrases to break into fragments.
        min_words: Minimum number of words in a fragment (values below 1 are
            treated as 1).

    Returns:
        None. Results are printed.
    """
    print("\n" + "="*60)
    print("SEARCHING FOR PHRASE FRAGMENTS")
    print("="*60)

    # A fragment needs at least one word; an empty fragment is meaningless.
    min_words = max(min_words, 1)

    # Normalize each song once (not once per fragment). Every line is
    # whitespace-collapsed and padded with spaces so that a padded fragment can
    # only match whole words ("like i" must not match inside "like it").
    padded_lines_by_song = {}
    for song_title, song_lyrics in lyrics_dict.items():
        if not song_lyrics:
            continue
        padded_lines_by_song[song_title] = [
            " " + " ".join(normalize_text(line).split()) + " "
            for line in song_lyrics.split('\n')
        ]

    for phrase in phrases_to_check:
        words = normalize_text(phrase).split()

        if len(words) < min_words:
            continue

        # Generate all possible contiguous subsequences
        fragments_found = []

        for i in range(len(words) - min_words + 1):
            for j in range(i + min_words, len(words) + 1):
                fragment = ' '.join(words[i:j])
                padded_fragment = f" {fragment} "

                for song_title, padded_lines in padded_lines_by_song.items():
                    if any(padded_fragment in padded_line for padded_line in padded_lines):
                        fragments_found.append({
                            'fragment': fragment,
                            'song': song_title,
                            'original_phrase': phrase
                        })
                        # Report only the first song that contains this fragment
                        break

        if fragments_found:
            print(f"\n📍 Original: '{phrase}'")
            for f in fragments_found:
                print(f"   Fragment '{f['fragment']}' found in '{f['song']}'")

# Main analysis: run both phrase lists against the fetched lyrics, look for
# shared fragments, then print per-approach totals.
def _print_section(title, lead=""):
    """Print a section title framed by two 60-character '=' rules."""
    print(lead + "="*60)
    print(title)
    print("="*60)


_print_section("CHECKING APPROACH 1 PHRASES")
results_1 = check_phrases_in_lyrics_improved(lyrics, phrases_approach_1)

_print_section("CHECKING APPROACH 2 PHRASES", lead="\n")
results_2 = check_phrases_in_lyrics_improved(lyrics, phrases_approach_2)

# Fragment search over the phrases of both approaches combined
find_phrase_fragments(lyrics, phrases_approach_1 + phrases_approach_2)

# Per-approach summary
_print_section("SUMMARY", lead="\n")

for name, results in (("Approach 1", results_1), ("Approach 2", results_2)):
    exact_count = len(results['exact_matches'])
    partial_count = len(results['partial_matches'])
    missing_count = len(results['no_matches'])
    total = exact_count + partial_count + missing_count

    # Percentages are relative to all phrases checked; guard against an empty list.
    if total > 0:
        exact_pct = exact_count/total*100
        partial_pct = partial_count/total*100
    else:
        exact_pct = 0
        partial_pct = 0

    print(f"\n{name}:")
    print(f"  Exact matches: {exact_count} ({exact_pct:.0f}%)")
    print(f"  Partial matches: {partial_count} ({partial_pct:.0f}%)")
    print(f"  Not found: {missing_count}")

CHECKING APPROACH 1 PHRASES
❌ NOT FOUND: 'And you come up and I'm just like'
❌ NOT FOUND: 'People went down like'
❌ NOT FOUND: 'Like I play instruments'
❌ NOT FOUND: 'It's just like it's just like, it's about it'
❌ NOT FOUND: 'But it's like I just didn't know this'
❌ NOT FOUND: 'He's just like he's just a maniac'
🔄 PARTIAL (75.0%): 'I just love it'
   Best match in 'CANCELLED': 'I like it, I love it...'
❌ NOT FOUND: 'And he's like, I got you'
❌ NOT FOUND: 'So it's like, we're going, we're going out.'
🔄 PARTIAL (75.0%): 'And you know what?'
   Best match in 'Elizabeth Taylor': 'And I think you know why...'

CHECKING APPROACH 2 PHRASES
❌ NOT FOUND: 'But you did it tonight'
🔄 PARTIAL (75.0%): 'So we got all...'
   Best match in 'Opalite': 'We give it all we got (Give it all we got)...'
❌ NOT FOUND: 'Do you know what I mean?'
❌ NOT FOUND: 'You won't be able to get to bed till four in the morning after this.'
❌ NOT FOUND: 'The Life of a Showgirl featuring Sabrina Carpenter.'
❌ NOT FOUND: 'T