In [None]:
!pip install langdetect unidecode

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m37.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: langdetect
  Building wheel for langdetect (setup.py) ... [?25ldone
[?25h  Created wheel for langdetect: filename=langdetect-1.0.9-py3-none-any.whl size=993241 sha256=91b369d402a62171e74dab0a2b325ab9113b5bd564061a7965f0004c1b7f29f9
  Stored in directory: /root/.cache/pip/wheels/d1/c1/d9/7e068de779d863bc8f8fc9467d85e25cfe47fa5051fff1a1bb
Successfully built langdetect
Installing collected packages: unidecode, langdetect
Successfully installed langdetect-1.0.9 unidecode-1.3.8

[1m[[0m[34;49mnotice[0m[1;3

In [None]:
!pip install langdetect

[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Standard library
import re

# Third-party
import nltk
import pandas as pd
import unidecode
from langdetect import DetectorFactory, detect, detect_langs
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download the NLTK resources needed for tokenizing, POS tagging and
# lemmatizing (only needed once; later runs report "already up-to-date").
for nltk_resource in ('omw-1.4', 'punkt', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

# Fix the langdetect seed so language detection gives consistent results
DetectorFactory.seed = 0

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
# Function to detect languages
def detect_languages(text):
    """
    Detects the candidate languages of a text with langdetect's detect_langs.

    Args:
    - text (str): The text to analyse.

    Returns:
    - str: The string form of the list returned by detect_langs (the format
      has_high_en_prob parses, i.e. comma-separated "code:probability"
      entries inside square brackets), or 'Unknown' if detection fails.
    """
    try:
        return str(detect_langs(text))
    except Exception:
        # Best effort: any detection failure is reported as 'Unknown'.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate, so a long .apply() can still be interrupted.
        return 'Unknown'

# Function to check if 'en' has a probability greater than a threshold (0.5 by default)
def has_high_en_prob(lang_str, threshold=0.5):
    """
    Tells whether English was detected with a probability above `threshold`.

    Args:
    - lang_str (str): Output of detect_languages: either 'Unknown' or a string
      of comma-separated "code:probability" entries inside square brackets.
    - threshold (float): Probability that 'en' must strictly exceed.
      Defaults to 0.5.

    Returns:
    - bool: True if an entry whose code is exactly 'en' has a probability
      greater than `threshold`; False otherwise (including for 'Unknown',
      non-string input and entries that cannot be parsed).
    """
    if not isinstance(lang_str, str) or lang_str == 'Unknown':
        return False
    for entry in lang_str.split(','):
        code, sep, prob_text = entry.strip('[] ').partition(':')
        # Match the code exactly rather than by prefix, and skip entries that
        # have no "code:probability" shape instead of raising.
        if not sep or code.strip() != 'en':
            continue
        try:
            prob = float(prob_text)
        except ValueError:
            continue
        if prob > threshold:
            return True
    return False

In [None]:
# Initialize the lemmatizer: a single module-level WordNetLemmatizer instance
# that clean_and_lemmatize_lyrics reuses for every token.
lemmatizer = WordNetLemmatizer()

# Helper function to convert NLTK POS tags to WordNet POS tags
def get_wordnet_pos(tag):
    """
    Maps an NLTK POS tag to the matching WordNet POS constant.

    The choice is made on the tag's first letter: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb. Any other tag maps to noun.

    Args:
    - tag (str): The POS tag of a token.

    Returns:
    - The corresponding wordnet.ADJ / VERB / NOUN / ADV constant.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    return wordnet.NOUN  # Default to noun

def clean_and_lemmatize_lyrics(lyrics):
    """
    Cleans and lemmatizes song lyrics by removing unwanted text, normalizing, and lemmatizing words.

    Transliteration to ASCII runs first, so typographic punctuation (curly
    quotes, long dashes) is already in its ASCII form when the punctuation
    filter runs and is stripped exactly like the plain characters. Doing it
    afterwards would reintroduce punctuation that the tokenizer then splits on.

    Args:
    - lyrics (str): The lyrics to be cleaned and lemmatized.

    Returns:
    - str: The cleaned and lemmatized lyrics, lower-cased, with tokens joined
      by single spaces.
    """
    # Step 1: Transliterate to ASCII before any filtering
    cleaned_lyrics = unidecode.unidecode(lyrics)

    # Step 2: Remove unwanted phrases, section titles, and punctuation
    # "<digits> Contributors ... Lyrics" header and "See ... Get tickets ... $<digits>" blurb
    cleaned_lyrics = re.sub(r"\d+\sContributors.*?Lyrics", "", cleaned_lyrics, flags=re.DOTALL)
    cleaned_lyrics = re.sub(r"See .*?Get tickets.*?\$\d+", "", cleaned_lyrics, flags=re.DOTALL)
    # "Embed" is dropped only when no lowercase letter follows, so words such
    # as "Embedded" are not truncated; (...) and [...] groups, digits and
    # basic punctuation are removed as well.
    cleaned_lyrics = re.sub(r"Embed(?![a-z])|\(.*?\)|\[.*?\]|\d+|[\",\'\-?!.]", "", cleaned_lyrics)
    cleaned_lyrics = re.sub(r'_+\s*', '', cleaned_lyrics)

    # Step 3: Normalize text by collapsing whitespace (newlines included) and lower-casing
    cleaned_lyrics = re.sub(r'\s+', ' ', cleaned_lyrics).strip().lower()

    # Step 4: Lemmatize the cleaned lyrics, using each token's POS tag
    tokens = word_tokenize(cleaned_lyrics)
    tagged_tokens = pos_tag(tokens)
    lemmatized_tokens = [
        lemmatizer.lemmatize(token, get_wordnet_pos(tag))
        for token, tag in tagged_tokens
    ]

    return ' '.join(lemmatized_tokens)

def normalize_text(text):
    """
    Transforms text with special fonts into normal fonts.

    The text is transliterated to ASCII, runs of repeated 'a' / 'h' are
    removed, everything that is not an ASCII letter or whitespace is dropped,
    and whitespace is collapsed to single spaces.

    Args:
    - text (str): The text to be normalized.

    Returns:
    - str: The normalized text.
    """
    # Transliterate first: after the letter-only filter below there would be
    # nothing left for unidecode to convert, and accented or special-font
    # letters would be deleted instead of mapped to plain letters.
    cleaned_text = unidecode.unidecode(text)
    # NOTE(review): this removes every run of 2+ 'a' or 'h', including inside
    # ordinary words (e.g. "withhold" -> "witold"); kept as in the original.
    cleaned_text = re.sub(r'(a{2,}|h{2,})+', '', cleaned_text)
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', cleaned_text)
    # Collapse whitespace last so removed characters do not leave double spaces
    return re.sub(r'\s+', ' ', cleaned_text).strip()

In [None]:
# Load the songs, drop rows without lyrics, renumber the index and keep only
# the two columns used by the rest of the notebook.
song_df_with_lyrics = (
    pd.read_csv("/song_df_with_lyrics.csv")
    .dropna(subset=["lyrics"])
    .reset_index(drop=True)
    [['spotify_id', 'lyrics']]
)

In [None]:
# Run language detection on every lyric and store the result as a new column
detected_languages = song_df_with_lyrics['lyrics'].apply(detect_languages)
song_df_with_lyrics['detected_languages'] = detected_languages

# Keep only the rows where 'en' has a probability > 0.5
is_english = song_df_with_lyrics['detected_languages'].apply(has_high_en_prob)
song_df_with_lyrics = song_df_with_lyrics[is_english]

In [None]:
# Clean and lemmatize the raw "lyrics" column, then normalize that cleaned
# text; each stage is kept in its own column.
cleaned_lyrics = song_df_with_lyrics['lyrics'].apply(clean_and_lemmatize_lyrics)
song_df_with_lyrics['cleaned_lyrics'] = cleaned_lyrics

normalized_lyrics = song_df_with_lyrics['cleaned_lyrics'].apply(normalize_text)
song_df_with_lyrics['normalized_lyrics'] = normalized_lyrics

In [None]:
song_df_with_lyrics = song_df_with_lyrics['normalized_lyrics'].tolist()

In [None]:
song_df_with_lyrics[3]

'a day in falsettoland dr mendel at work you go out on the street and theres all these people ask for a handout you go home you open your mail and it full of people ask for a donation i dont get it then you turn on the tv and they want money for i dont understand starve child in ethiopia i just want to be leave alone in the s everyone have heart in the s we be all a part of the same team in the s we have a new world to start could this oh god dont say it is could this be the new world we start here i sit brokenhearted and do i wait for the promotion or do i take this ibm job yeah well caroline hmm i dont get it ive be leave behind half my patient  yuppie pagan model on the ronald reagans now the world be too pathetic and i dont get it at all oh im in a deep quandary about my career what do you think i should do time up aww at least there trina at home trina in bed trina obsessing and sort of caress my head with her foot i once think it be sweet but i dont anymore now i just snore cause

In [None]:
# Save the processed data. `.to_csv` is a DataFrame method, so
# `song_df_with_lyrics` must still be a DataFrame (not a list) when this runs.
# NOTE(review): this is the same path the notebook reads its input from, so
# the source CSV is overwritten by this cell — confirm that is intended.
song_df_with_lyrics.to_csv('/song_df_with_lyrics.csv')