# 1. Goal
<hr style="border:2px solid gray">

In this project, we will build a text classification model on song lyrics. The task is to predict the artist from a piece of text.

# 2. Data Collection
<hr style="border:2px solid gray">

Use the code from [this GitHub repository](https://github.com/tamimibrahim17/List-of-user-agents) to collect user agents and save them in a local folder. We can then read those files and pick a random user agent.

In [1]:
import random, requests, time, re, importlib, string
from difflib import SequenceMatcher
import pandas as pd
from bs4 import BeautifulSoup

from sklearn import set_config
set_config(transform_output='pandas')

# import customized module
import generic_functions as gf

# Reload the module so that edits to generic_functions.py are picked up without
# restarting the kernel. The trailing semicolon suppresses the module repr, which
# would otherwise print the absolute local path of the file into the cell output.
importlib.reload(gf);

<module 'generic_functions' from '/Users/nadine/Documents/Spiced_Academy/github/vectorized-vegeta-student-code/week_04/project_code/generic_functions.py'>

### Create the list of artists and their URLs

In [2]:
# Artist overview pages on lyrics.com; entries are paired by position with `artist_list`.
# Alternative pair tried earlier:
#   'https://www.lyrics.com/artist/Maroon-5/529962'        -> 'maroon_5'
#   'https://www.lyrics.com/artist/Backstreet-Boys/199819' -> 'backstreet_boys'
url_list = [
    'https://www.lyrics.com/artist/Spice-Girls/199833',
    'https://www.lyrics.com/artist/My-Chemical-Romance/533805',
]

# Snake-case artist keys, used further down to build file and folder names.
artist_list = [
    'spice_girls',
    'my_chemical_romance',
]

### Scrape html page of each artist and write to file

In [4]:
# Download each artist's overview page and store the raw HTML for offline parsing.
for artist, artist_url in zip(artist_list, url_list):
    # A timeout keeps the cell from hanging forever on an unresponsive server.
    response = requests.get(artist_url, headers=gf.get_header(), timeout=30)
    # Fail fast: without the artist page nothing downstream can work, and an
    # HTTP error page must not be saved as if it were the song listing.
    response.raise_for_status()
    # NOTE(review): get_info() reads '<artist>/<artist>.html' while this writes
    # '<artist>.html' - confirm how gf.write_file / gf.read_file resolve paths.
    gf.write_file(response.text, f'{artist}.html')

### Read created artist files (get list of songs and links)

In [6]:
def get_info(artist):
    """Collect the song links listed on an artist's saved overview page.

    The HTML is loaded with ``gf.read_file`` from ``<artist>/<artist>.html``.
    Every ``<a>`` tag whose ``href`` contains ``/lyric/`` becomes one candidate
    row; rows sharing the same formatted title are collapsed to the first one.

    Parameters
    ----------
    artist : str
        Artist key (e.g. ``'spice_girls'``), used for the file path and stored
        unchanged in the ``artist`` column.

    Returns
    -------
    pandas.DataFrame
        Columns ``link`` (base URL + href), ``title`` (``gf.format_title`` of
        the anchor text) and ``artist``, with a fresh 0..n-1 index.
    """
    base_url = 'https://www.lyrics.com'
    page_html = gf.read_file(f'{artist}/{artist}.html')

    # parse the stored page
    soup = BeautifulSoup(markup=page_html, features='html.parser')

    def points_to_lyrics(href):
        # anchors without an href are passed in as None and must be rejected
        return href and '/lyric/' in href

    anchors = soup.find_all('a', href=points_to_lyrics)

    # one [link, title, artist] record per matching anchor
    records = [
        [base_url + anchor.get('href'), gf.format_title(anchor.text), artist]
        for anchor in anchors
    ]
    songs = pd.DataFrame(records, columns=['link', 'title', 'artist'])

    # keep only the first occurrence of each title and renumber the rows
    return songs.drop_duplicates(subset=['title'], keep='first', ignore_index=True)
    

In [7]:
# initial time
ti = time.time()

# Parse every artist page first and concatenate once at the end. Growing a
# DataFrame with pd.concat inside the loop copies all previous rows on every
# iteration, and concatenating onto an empty DataFrame is deprecated in recent pandas.
artist_frames = [get_info(artist) for artist in artist_list]

# pd.concat raises on an empty list, so keep the original "empty frame" result
# when no artists are configured.
df_all_songs = pd.concat(artist_frames, ignore_index=True) if artist_frames else pd.DataFrame()

# final time
tf = time.time()

# time taken
print(f"time taken: {round(tf-ti,2)} sec")

time taken: 1.34 sec


In [8]:
# Inspect the combined song table: one row per distinct title per artist,
# with the columns link, title and artist.
df_all_songs

Unnamed: 0,link,title,artist
0,https://www.lyrics.com/lyric/35092433/Spice+Gi...,Wannabe,spice_girls
1,https://www.lyrics.com/lyric/35074610/Spice+Gi...,Mama,spice_girls
2,https://www.lyrics.com/lyric/35134498/Spice+Gi...,In_The_Summertime,spice_girls
3,https://www.lyrics.com/lyric/35134496/Spice+Gi...,Spice_Up_Your_Life,spice_girls
4,https://www.lyrics.com/lyric/30734178/Spice+Gi...,Stop,spice_girls
...,...,...,...
135,https://www.lyrics.com/lyric/28903492/My+Chemi...,The_World_Is_Ugly,my_chemical_romance
136,https://www.lyrics.com/lyric/28903491/My+Chemi...,The_Light_Behind_Your_Eyes,my_chemical_romance
137,https://www.lyrics.com/lyric/28902952/My+Chemi...,Kiss_The_Ring,my_chemical_romance
138,https://www.lyrics.com/lyric/28902951/My+Chemi...,Make_Room!!!!,my_chemical_romance


### Loop through each title and link, to scrape the website and save the lyrics of each song

In [9]:
# Start the stopwatch for the whole download run.
ti = time.time()

for row_id, song in df_all_songs.iterrows():
    page = requests.get(song['link'], headers=gf.get_header())
    # File name pattern: <artist>/<row index>_<title>.html
    lyrics_path = f"{song['artist']}/{row_id}_{song['title']}.html"
    print(lyrics_path)
    gf.write_file(page.text, lyrics_path)
    # Pause a random 1-3 seconds between requests.
    pause_seconds = random.randint(1,3)
    time.sleep(pause_seconds)

# Stop the stopwatch and report the elapsed time.
tf = time.time()
print(f"time taken: {round(tf-ti,2)} sec")

spice_girls/0_Wannabe.html
spice_girls/1_Mama.html
spice_girls/2_In_The_Summertime.html
spice_girls/3_Spice_Up_Your_Life.html
spice_girls/4_Stop.html
spice_girls/5_2_Become_1.html
spice_girls/6_Who_Do_You_Think_You_Are.html
spice_girls/7_2_Become_One.html
spice_girls/8_Christmas_Wrapping.html
spice_girls/9_Too_Much.html
spice_girls/10_Sleigh_Ride.html
spice_girls/11_Say_You'll_Be_There.html
spice_girls/12_Move_Over.html
spice_girls/13_Viva_Forever.html
spice_girls/14_Let_Love_Lead_The_Way.html
spice_girls/15_Holler.html
spice_girls/16_Headlines.html
spice_girls/17_Voodoo.html
spice_girls/18_Goodbye.html
spice_girls/19_Tell_Me_Why.html
spice_girls/20_Can_I_Get_Wit'cha?.html
spice_girls/21_Please_Come_Home_For_Christmas.html
spice_girls/22_Christmas_Wrappping.html
spice_girls/23_Right_Back_At_Ya.html
spice_girls/24_Get_Down_With_Me.html
spice_girls/25_Wasting_My_Time.html
spice_girls/26_Weekend_Love.html
spice_girls/27_Time_Goes_By.html
spice_girls/28_If_You_Wanna_Have_Some_Fun.html
spic

### Read the saved lyrics and add them to the dataframe: df_all_songs

In [10]:
# Fill a 'lyrics' column row by row from the HTML files saved in the previous step.
# The path must follow the same <artist>/<row index>_<title>.html pattern used when writing.
for row_id, song in df_all_songs.iterrows():
    lyrics_path = f"{song['artist']}/{row_id}_{song['title']}.html"
    print(lyrics_path)
    extracted = gf.get_lyrics(lyrics_path)
    df_all_songs.loc[row_id,'lyrics'] = extracted

spice_girls/0_Wannabe.html
spice_girls/1_Mama.html
spice_girls/2_In_The_Summertime.html
spice_girls/3_Spice_Up_Your_Life.html
spice_girls/4_Stop.html
spice_girls/5_2_Become_1.html
spice_girls/6_Who_Do_You_Think_You_Are.html
spice_girls/7_2_Become_One.html
spice_girls/8_Christmas_Wrapping.html
spice_girls/9_Too_Much.html
spice_girls/10_Sleigh_Ride.html
spice_girls/11_Say_You'll_Be_There.html
spice_girls/12_Move_Over.html
spice_girls/13_Viva_Forever.html
spice_girls/14_Let_Love_Lead_The_Way.html
spice_girls/15_Holler.html
spice_girls/16_Headlines.html
spice_girls/17_Voodoo.html
spice_girls/18_Goodbye.html
spice_girls/19_Tell_Me_Why.html
spice_girls/20_Can_I_Get_Wit'cha?.html
spice_girls/21_Please_Come_Home_For_Christmas.html
spice_girls/22_Christmas_Wrappping.html
spice_girls/23_Right_Back_At_Ya.html
spice_girls/24_Get_Down_With_Me.html
spice_girls/25_Wasting_My_Time.html
spice_girls/26_Weekend_Love.html
spice_girls/27_Time_Goes_By.html
spice_girls/28_If_You_Wanna_Have_Some_Fun.html
spic

In [588]:
# Summarise columns and non-null counts to spot songs whose lyrics are missing.
df_all_songs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140 entries, 0 to 139
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   link    140 non-null    object
 1   title   140 non-null    object
 2   artist  140 non-null    object
 3   lyrics  117 non-null    object
dtypes: object(4)
memory usage: 4.5+ KB


**Drop rows without lyrics (some links exist, but their pages contain no lyrics)**

In [596]:
# Keep only the songs that actually have lyrics. dropna() already returns a new
# DataFrame, so no explicit copy is needed; restricting the check to 'lyrics'
# matches the stated intent and will not silently drop rows if another column
# ever contains a missing value.
df_clean = df_all_songs.dropna(subset=['lyrics'])

In [597]:
# Confirm that no missing values remain after the drop.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 117 entries, 0 to 139
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   link    117 non-null    object
 1   title   117 non-null    object
 2   artist  117 non-null    object
 3   lyrics  117 non-null    object
dtypes: object(4)
memory usage: 4.6+ KB


In [598]:
# Preview the first rows of the cleaned song table.
df_clean.head()

Unnamed: 0,link,title,artist,lyrics
0,https://www.lyrics.com/lyric/35092433/Spice+Gi...,Wannabe,spice_girls,"Ha ha ha ha ha\nYo, I'll tell you what I want,..."
1,https://www.lyrics.com/lyric/35074610/Spice+Gi...,Mama,spice_girls,She used to be my only enemy and never let me ...
2,https://www.lyrics.com/lyric/35134498/Spice+Gi...,In_The_Summertime,spice_girls,It's a summertime affair\nShaggy ha\nRavyon\nS...
3,https://www.lyrics.com/lyric/35134496/Spice+Gi...,Spice_Up_Your_Life,spice_girls,La la la la la la la la la \nLa la la la la la...
4,https://www.lyrics.com/lyric/30734178/Spice+Gi...,Stop,spice_girls,"You just walk in, I make you smile\nIt's cool ..."


**Clean dataframe**
- Split lyrics on "\n" and put each line in its own row
- Drop columns "link" and "title"
- Remove _ in artist column and capitalize names
- Set artist as index

In [599]:
# Reshape the song table with gf.clean_dataframe (steps as listed in the text above).
# A copy is passed in, so df_clean itself is not modified by the call.
df_clean_all = gf.clean_dataframe(df_clean.copy())

In [600]:
# Inspect the reshaped data used for modelling.
df_clean_all

Unnamed: 0_level_0,lyrics
artist,Unnamed: 1_level_1
Spice Girls,Ha ha ha ha ha
Spice Girls,"Yo, I'll tell you what I want, what I really, ..."
Spice Girls,"So tell me what you want, what you really, rea..."
Spice Girls,"I'll tell you what I want, what I really, real..."
Spice Girls,"So tell me what you want, what you really, rea..."
...,...
My Chemical Romance,"Way down, way down"
My Chemical Romance,"Way down, way down"
My Chemical Romance,"Way down, way down"
My Chemical Romance,"Way down, way down"


In [None]:
# Share of songs per artist. This is computed on df_all_songs, i.e. per song and
# before the rows without lyrics are dropped.
# Note: `y` is reassigned in the train-test split section further down.
y = df_all_songs['artist']
y.value_counts(normalize=True)

# 3. Bag of Words
<hr style="border:2px solid gray">

## Preprocessing

**Steps to do:**
1. Tokenisation
2. Clean the text (capitalization, punctuations)
3. Stemming - the reduction of the word to its (pseudo)stem by removing suffixes via some heuristic rules. Does not always result in a real word at the end
4. Lemmatisation - the conversion of a word to its dictionary form
5. Removing stopwords
6. Vectorization

#### CountVectorizer
- Remove list of stopwords
- Remove punctuation marks
- Remove the words that appear in more than X% of documents


In [601]:
import nltk
import string

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS as skl_stopwords

# Shared NLP helper instances; both serve as default arguments of tokenize_lemmatize below.
lemmatizer = WordNetLemmatizer()
tokenizer= TreebankWordTokenizer()

In [602]:
def tokenize_lemmatize(text, stopwords=skl_stopwords, tokenizer=tokenizer, lemmatizer=lemmatizer):
    """Strip punctuation, tokenize, drop stopwords and lemmatize ``text``.

    Written to be passed as the ``tokenizer`` callable of a scikit-learn
    vectorizer.

    Parameters
    ----------
    text : str
        Raw text to process.
    stopwords : collection of str
        Tokens to discard. The comparison is case-sensitive and happens on the
        tokens as produced by ``tokenizer`` (before lemmatization).
    tokenizer : object
        Anything with a ``tokenize(str)`` method returning a list of tokens.
    lemmatizer : object
        Anything with a ``lemmatize(str)`` method returning the lemma.

    Returns
    -------
    list of str
        Lemmatized tokens, in their original order, with stopwords removed.
    """
    # Remove every ASCII punctuation character in a single pass.
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Use the tokenizer that was passed in. Previously this argument was ignored
    # and nltk.word_tokenize was always called, so callers could not swap it.
    tokens = tokenizer.tokenize(text)

    # Stopwords are filtered here rather than in the vectorizer; per the original
    # author's note, some stopwords would otherwise be altered first and missed.
    return [lemmatizer.lemmatize(token) for token in tokens if token not in stopwords]

In [603]:
# CountVectorizer was used here without ever being imported in this notebook.
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words counts built with the custom tokenizer.
# - stop_words=None: stopwords are already removed inside tokenize_lemmatize.
# - token_pattern=None: the default pattern is unused once a tokenizer is given;
#   passing None avoids scikit-learn's "token_pattern will not be used" warning.
# - max_df=0.9: ignore terms that occur in more than 90% of the documents.
count_vectorizer = CountVectorizer(
    tokenizer=tokenize_lemmatize,
    stop_words=None,
    token_pattern=None,
    max_df=0.9,
)

# X stays a sparse count matrix; X_df is a dense, labelled view for inspection.
X = count_vectorizer.fit_transform(df_clean_all['lyrics'])
X_df = pd.DataFrame(X.todense(), columns=count_vectorizer.get_feature_names_out(), index=df_clean_all.index)
X_df



Unnamed: 0_level_0,10,102,22,5,8,81,9,90,98,aah,...,zaaaaahhhhh,zaaahhhh,zero,zigazig,ziggy,zing,zinga,zombie,zone,zoom
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Spice Girls,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Spice Girls,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Spice Girls,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Spice Girls,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Spice Girls,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
My Chemical Romance,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
My Chemical Romance,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
My Chemical Romance,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
My Chemical Romance,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Normalization using TfidfTransformer

In [605]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-weight the raw term counts in X with TF-IDF.
# Run this before the train-test split cell, which rebinds the name X.
tfid_transformer = TfidfTransformer()
X_norm = tfid_transformer.fit_transform(X)

# Dense, labelled view: one column per vocabulary term, rows labelled with the
# artist index of df_clean_all.
X_norm_df = pd.DataFrame(
    X_norm.todense(),
    index=df_clean_all.index,
    columns=count_vectorizer.get_feature_names_out(),
)
X_norm_df

Unnamed: 0_level_0,10,102,22,5,8,81,9,90,98,aah,...,zaaaaahhhhh,zaaahhhh,zero,zigazig,ziggy,zing,zinga,zombie,zone,zoom
artist,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Spice Girls,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Spice Girls,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Spice Girls,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Spice Girls,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Spice Girls,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
My Chemical Romance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
My Chemical Romance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
My Chemical Romance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
My Chemical Romance,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 4. Train-Test Split
<hr style="border:2px solid gray">

In [606]:
from sklearn.model_selection import train_test_split

# Turn the 'artist' index into an ordinary column so it can be used as the label.
X_reset_df = X_norm_df.reset_index()

# target: the artist label of every row
y = X_reset_df['artist']

# features: all TF-IDF columns, i.e. everything except the label
X = X_reset_df.drop(columns=['artist'])

# NOTE(review): the vocabulary and the IDF weights were fitted on all rows before
# this split, so the validation rows have already influenced the features.
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=42)

# shapes of the four pieces
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((5596, 2330), (1866, 2330), (5596,), (1866,))

In [607]:
# Absolute number of rows per artist in the target.
y.value_counts()

Spice Girls            3741
My Chemical Romance    3721
Name: artist, dtype: int64

In [608]:
# Relative class frequencies (class-balance check).
y.value_counts(normalize=True)

Spice Girls            0.50134
My Chemical Romance    0.49866
Name: artist, dtype: float64

# 5. Model Building

## a. Random Forest

In [609]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Small, shallow forest (15 trees, depth at most 5) with a fixed seed for
# repeatable results, fitted on the training split.
clf_RF = RandomForestClassifier(n_estimators=15, max_depth=5, random_state=42)
clf_RF.fit(X_train, y_train)

In [610]:
def print_evaluations(ytrue, ypred, model, pos_label='Spice Girls'):
    """Print accuracy, precision, recall and F1 for a binary classifier.

    Parameters
    ----------
    ytrue : array-like
        True labels.
    ypred : array-like
        Predicted labels, aligned with ``ytrue``.
    model : str
        Display name used in the first printed line.
    pos_label : str, default 'Spice Girls'
        Label treated as the positive class for precision, recall and F1.
        The default keeps the previous hard-coded behaviour; pass another
        label when evaluating a different artist pair.

    Returns
    -------
    None
        The four scores are printed, each rounded to three decimals.
    """
    print(f'How does model {model} score:')
    print(f'The accuracy of the model is: {round(accuracy_score(ytrue, ypred), 3)}')

    # The remaining metrics share one call signature, so they are reported in a loop.
    positive_class_metrics = (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1-score', f1_score),
    )
    for metric_name, scorer in positive_class_metrics:
        print(f'The {metric_name} of the model is: {round(scorer(ytrue, ypred, pos_label=pos_label), 3)}')

In [611]:
# Predict the artist for every row of the validation split.
rf_predictions = clf_RF.predict(X_val)

In [612]:
# Report the validation metrics of the random forest.
print_evaluations(y_val, rf_predictions, "Random Forest predictions")

How does model Random Forest predictions score:
The accuracy of the model is: 0.625
The precision of the model is: 0.852
The recall of the model is: 0.312
The f1-score of the model is: 0.457


In [613]:
# Accuracy on the training split (compare with the validation accuracy below).
clf_RF.score(X_train, y_train)

0.6300929235167977

In [614]:
# Accuracy on the held-out validation split (X_val, y_val).
clf_RF.score(X_val, y_val)

0.62486602357985

## b. Naive Bayes

In [615]:
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics

# Multinomial Naive Bayes with alpha=1 (additive smoothing), fitted on the training split.
naive_bayes = MultinomialNB(alpha=1)
naive_bayes.fit(X_train, y_train)

# y_predicteds = model.predict(X_train)

In [616]:
# Accuracy on the training split (compare with the validation accuracy below).
naive_bayes.score(X_train, y_train)

0.8293423874195854

In [617]:
# Accuracy on the held-out validation split (X_val, y_val).
naive_bayes.score(X_val, y_val)

0.7845659163987139

In [618]:
# Predict the artist for every row of the validation split.
pred = naive_bayes.predict(X_val)

# Report the validation metrics of the Naive Bayes model.
print_evaluations(y_val, pred, "Naive Bayes")

How does model Naive Bayes score:
The accuracy of the model is: 0.785
The precision of the model is: 0.838
The recall of the model is: 0.712
The f1-score of the model is: 0.769


In [619]:
# Rows are true labels and columns are predicted labels, with the classes in
# sorted label order (scikit-learn's default when `labels` is not given).
metrics.confusion_matrix(y_val, pred)

array([[793, 130],
       [272, 671]])

In [620]:
# Classify a single piece of text with the fitted pipeline.
test_string = ["wannabe"]
X_vectorizer_test = count_vectorizer.transform(test_string)

# Use transform(), not fit_transform(): refitting here would re-learn the IDF
# weights from this one string and overwrite the weights learned from the
# corpus, which also corrupts the transformer that is saved at the end.
X_norm_test = tfid_transformer.transform(X_vectorizer_test)

pred = naive_bayes.predict(X_norm_test)
pred[0]



'Spice Girls'

In [621]:
# `tf_vectorizer` is never defined in this notebook, so the original line fails
# on a fresh kernel. The fitted count_vectorizer followed by the fitted
# tfid_transformer produces the TF-IDF features the model was trained on.
X_vec_test = tfid_transformer.transform(count_vectorizer.transform(test_string))
pred = naive_bayes.predict(X_vec_test)
pred[0]

'Spice Girls'

# 6. Save Models

In [None]:
# Collect the fitted objects needed to classify new text: the vectorizer, the
# TF-IDF transformer and the classifier. The classifier variable is
# `naive_bayes`; `clf_NB` is not defined anywhere in this notebook.
models = {'count_vectorizer':count_vectorizer,
          'tfid_transformer':tfid_transformer,
          'naive_bayes':naive_bayes}

# Save each object under models/<key> using the project's save helper.
for filename, model in models.items():
    gf.save_model("models/"+filename,model)