# Lyrics Project - Spiced 2022 - Valentin Lorenzen

### Downloading and cleaning the lyrics

In [1]:
# loading libraries

import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

import requests
import re
import os

from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

import sys
import argparse

import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer

import csv
import pickle



#### Lady Gaga Lyrics

In [160]:
# fetch the lyrics.com artist page that links to the Lady Gaga songs

all_of_gaga = requests.get(
    url='https://www.lyrics.com/artist/Lady-Gaga',
).text

In [161]:
# collect the relative "/lyric/..." hrefs found in the artist page

# each hit is "/lyric/" plus the nine characters that follow it
gaga_songlinks = re.findall('a href="(/lyric/.{9})', all_of_gaga)

In [162]:
# drop repeated links while keeping the first-seen order

gaga_songlinks = [*dict.fromkeys(gaga_songlinks)]

In [None]:
# turn the relative gaga links into absolute URLs

# The scraped links already start with "/lyric/", so the host must not end in
# a slash, and requests needs an explicit scheme to accept the URL.
gaga_songlinks_final = ["https://www.lyrics.com" + x for x in gaga_songlinks]

In [166]:
# keep only the first 50 song links

gaga_songlinks_final = gaga_songlinks_final[:50]

In [None]:
# download lyrics: save the HTML of every gaga song page as gaga_lyrics/<song number>.txt

# open() does not create missing folders, so make sure the target exists
os.makedirs("gaga_lyrics", exist_ok=True)

for songlink in gaga_songlinks_final:
    # re.search gives one match; findall() returned a list, which produced
    # file names such as "['12345678'].txt"
    id_match = re.search(r'\d{6,9}', songlink)
    if id_match is None:
        print(f"skipping {songlink}: no song number found")
        continue
    song_number = id_match.group()
    print(songlink)
    # request the page before opening the file, so a failed request does not
    # leave an empty file behind
    song_html = requests.get(songlink).text
    with open(f"gaga_lyrics/{song_number}.txt", "w") as f:
        f.write(song_html)
    print(song_number)

In [5]:
# put lyrics into list: one entry per downloaded gaga song page

gaga_lyrics = []

for fn in os.listdir('gaga_lyrics/'):
    # the downloads are saved as .txt; ignore anything else in the folder
    # (hidden system files, checkpoint directories, ...)
    if not fn.endswith('.txt'):
        continue
    # the with-block closes the file handle again after reading
    with open(os.path.join('gaga_lyrics', fn)) as page:
        text = page.read()
    gaga_soup = BeautifulSoup(text, 'html.parser')
    # all <pre class="lyric-body"> elements of the page
    lyric = gaga_soup.find_all('pre', {"class": "lyric-body"})
    gaga_lyrics.append(lyric)


In [6]:
# clean up lyrics in list

# non-greedy, so every "<...>" tag is matched on its own
cleaner = re.compile('<.*?>')


def cleanhtml(raw_html):
    """Return raw_html with every match of the tag pattern removed."""
    return cleaner.sub('', raw_html)

# plain-text version of every gaga lyric: lower-case letters and spaces only
gaga_lyrics_clean = []

for x in gaga_lyrics:
    # turn the parsed entry into text and strip the HTML tags
    x_clean = cleanhtml(str(x))
    # raw string: "\s" inside a normal literal is an invalid escape sequence
    x_clean = re.sub(r'[^A-Za-z\s]+', '', x_clean)
    # apostrophes are already removed by the substitution above, so only the
    # line breaks still need replacing
    gaga_lyrics_clean.append(x_clean.lower().replace('\n', ' '))


#### Thundercat Lyrics

In [4]:
# fetch the lyrics.com artist page that links to the Thundercat songs

all_of_thundercat = requests.get(
    url='https://www.lyrics.com/artist/Thundercat/2127533',
).text

In [5]:
# collect the relative "/lyric/..." hrefs found in the artist page

# each hit is "/lyric/" plus the nine characters that follow it
thundercat_songlinks = re.findall('a href="(/lyric/.{9})', all_of_thundercat)

In [223]:
# drop repeated links while keeping the first-seen order

thundercat_songlinks = [*dict.fromkeys(thundercat_songlinks)]

In [7]:
# turn the relative thundercat links into absolute URLs

# The scraped links already start with "/lyric/", so the host must not end in
# a slash, and requests needs an explicit scheme to accept the URL.
thundercat_songlinks_final = ["https://www.lyrics.com" + x for x in thundercat_songlinks]


In [225]:
# keep only the first 50 song links

thundercat_songlinks_final = thundercat_songlinks_final[:50]

In [None]:
# download lyrics: save the HTML of every thundercat song page as thundercat_lyrics/<song number>.txt

# open() does not create missing folders, so make sure the target exists
os.makedirs("thundercat_lyrics", exist_ok=True)

for songlink in thundercat_songlinks_final:
    # re.search gives one match; findall() returned a list, which produced
    # file names such as "['12345678'].txt"
    id_match = re.search(r'\d{6,9}', songlink)
    if id_match is None:
        print(f"skipping {songlink}: no song number found")
        continue
    song_number = id_match.group()
    print(songlink)
    # request the page before opening the file, so a failed request does not
    # leave an empty file behind
    song_html = requests.get(songlink).text
    with open(f"thundercat_lyrics/{song_number}.txt", "w") as f:
        f.write(song_html)
    print(song_number)

In [3]:
# put lyrics into list: one entry per downloaded thundercat song page

thundercat_lyrics = []

for fn in os.listdir('thundercat_lyrics/'):
    # the downloads are saved as .txt; ignore anything else in the folder
    # (hidden system files, checkpoint directories, ...)
    if not fn.endswith('.txt'):
        continue
    # the with-block closes the file handle again after reading
    with open(os.path.join('thundercat_lyrics', fn)) as page:
        text = page.read()
    thundercat_soup = BeautifulSoup(text, 'html.parser')
    # all <pre class="lyric-body"> elements of the page
    lyric = thundercat_soup.find_all('pre', {"class": "lyric-body"})
    thundercat_lyrics.append(lyric)


In [4]:
# clean up lyrics in list

# non-greedy, so every "<...>" tag is matched on its own
cleaner = re.compile('<.*?>')


def cleanhtml(raw_html):
    """Return raw_html with every match of the tag pattern removed."""
    without_tags = cleaner.sub('', raw_html)
    return without_tags

# plain-text version of every thundercat lyric: lower-case letters and spaces only
thundercat_lyrics_clean = []

for x in thundercat_lyrics:
    # turn the parsed entry into text and strip the HTML tags
    x_clean = cleanhtml(str(x))
    # raw string: "\s" inside a normal literal is an invalid escape sequence
    x_clean = re.sub(r'[^A-Za-z\s]+', '', x_clean)
    # apostrophes are already removed by the substitution above, so only the
    # line breaks still need replacing
    thundercat_lyrics_clean.append(x_clean.lower().replace('\n', ' '))


### Combining lyrics to corpus, tokenizing and lemmatizing

In [7]:
# merge the lyrics of both artists into one list, Thundercat entries first

corpus = [*thundercat_lyrics_clean, *gaga_lyrics_clean]

In [None]:
# download the nltk data used for lemmatizing

# WordNetLemmatizer looks words up in the WordNet corpus; without it the
# lemmatizing step fails with a LookupError on a fresh installation
nltk.download('wordnet')
nltk.download('omw-1.4')

In [13]:
# tokenize and lemmatize the corpus

tokenizer = TreebankWordTokenizer()
lemmatizer = WordNetLemmatizer()

# every document is split into tokens, each token is lemmatized, and the
# lemmas are joined back into one space-separated string
clean_corpus = [
    " ".join(map(lemmatizer.lemmatize, tokenizer.tokenize(text=doc)))
    for doc in corpus
]

### Bag of Words

In [None]:
# build the stop-word list that is handed to the vectorizer

nltk.download('stopwords')

from nltk.corpus import stopwords

# NOTE(review): the lyrics were stripped of every non-letter character, so
# contractions show up as e.g. "dont"; stop words that contain an apostrophe
# will presumably never match them - confirm whether that is intended.
STOPWORDS = stopwords.words('english')

In [None]:
# create labels: one per document, in the same order as the corpus
# (Thundercat lyrics first, then Lady Gaga)

# Derive the counts from the data instead of assuming exactly 50 songs each;
# a skipped download or an extra file would otherwise mislabel documents or
# make fit() fail on mismatched lengths.
LABELS = (
    ["Thundercat"] * len(thundercat_lyrics_clean)
    + ["Lady Gaga"] * len(gaga_lyrics_clean)
)

In [25]:
# create pipeline: tf-idf features followed by a multinomial naive Bayes classifier

vectorizer_step = ('tf-idf', TfidfVectorizer(stop_words=STOPWORDS))
# the step is named 'LR' although the estimator is MultinomialNB
classifier_step = ('LR', MultinomialNB())

steps = [vectorizer_step, classifier_step]

pipeline = Pipeline(steps=steps)

In [None]:
# train the pipeline on the lemmatized lyrics and their artist labels

pipeline.fit(X=clean_corpus, y=LABELS)

### Function, csv-export and model-export

In [46]:
# Function to give probability of lyric

def give_artist(lyric):
    """Tell which artist the fitted pipeline predicts for a lyric.

    Parameters
    ----------
    lyric : str
        Text to classify. Any falsy value (empty string, None) is treated
        as missing input.

    Returns
    -------
    str
        A sentence naming the predicted artist and the predicted probability
        of that artist as a rounded percentage, or a notice when no input
        was given.
    """
    if not lyric:
        return "You did not give an input"
    who_wrote = pipeline.predict([lyric])[0]
    probabilities = pipeline.predict_proba([lyric])[0]
    # predict_proba columns follow pipeline.classes_; look the column up
    # instead of hard-coding which index belongs to which artist
    class_index = list(pipeline.classes_).index(who_wrote)
    certainty = round(100 * probabilities[class_index])
    return f"This was probably written by: {who_wrote} , with a certainty of: {certainty}%"

In [58]:
# export clean_corpus: one csv row per document

with open('clean_corpus.csv', 'w+', newline ='') as corpus_file:
    # a single writer serves all rows
    row_writer = csv.writer(corpus_file)
    # every document is split at commas, each piece becomes one field
    row_writer.writerows(document.split(',') for document in clean_corpus)

In [None]:
# export model

# fit() returns the fitted pipeline itself, so model_export is the trained model
model_export = pipeline.fit(clean_corpus, LABELS)
with open("naive_classifier.bin", "wb") as file:
    # dump the fitted pipeline; the name NB_clf used here before is not
    # defined anywhere and raised a NameError
    pickle.dump(model_export, file)