**Imports**

In [4]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
from sklearn.metrics import precision_score, recall_score, f1_score
import random
import glob
import os

---

### Code Snippet 1.
- functions should definitely be *called* all together inside a greater, main function;
- but better not to *define* them all together inside of one function.
    - better to use Classes in this case.
- can also delete old Jupyter code before the pipeline that doesn't do anything anymore.

In [None]:
def train_model(LYRICS_SAMPLE, INTERPRET, NUMBER_OF_SONGS=20):
    """Sketch of a single function that defines its helpers and trains a model.

    The ``...`` placeholders stand for code that is not shown, so how the
    three parameters are used cannot be seen here. The visible tail fits a
    TF-IDF + multinomial naive Bayes model twice: once step by step and once
    through ``make_pipeline``. The visible code does not return anything.
    """
    
    ...
    
    # Helpers below are defined inside train_model, so they are local names
    # that cannot be called from outside this function.
    def collect_song_links(artist:str): 
        ...

    def get_songs_lyrics(links:list, artist_name:str, num:int):
        ...

    def main(artist, num):
        ...

    
    def clean_data(column:pd.Series) -> pd.Series:
        ...

    # CORPUS and LABELS are not assigned in the visible part of this function;
    # they have to come from an enclosing scope (or from the elided code).
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(CORPUS)
    y = LABELS
    clf = MultinomialNB()
    clf.fit(X, y)
    # Bare expression: builds a new classifier and discards it
    # (likely a leftover; it does not affect `clf`).
    MultinomialNB()
    # Same vectorizer + classifier combination again, this time as one pipeline
    # fitted directly on the raw documents.
    pipeline = make_pipeline(TfidfVectorizer(stop_words='english'),
                            MultinomialNB(class_prior=None))
    pipeline.fit(CORPUS, y)
    

It is cleaner to define the functions separately at module level and then call them together (or from one another) later on.
- Better for long-term maintainability.

If you want to define lots of functions within some greater "function" / unit, better to use Classes.

In [5]:
def train_model(LYRICS_SAMPLE, INTERPRET, NUMBER_OF_SONGS=20):
    """Stub: the real body is elided in this example."""
    ...


def collect_song_links(artist: str):
    """Stub: the real body is elided in this example."""
    ...


def get_songs_lyrics(links: list, artist_name: str, num: int):
    """Stub: the real body is elided in this example."""
    ...


def clean_data(column: pd.Series) -> pd.Series:
    """Stub: the real body is elided in this example."""
    ...


# Each function is defined on its own at module level and then called by the
# exact name it was defined with. The arguments are placeholders that only
# show the call shape.
a = train_model("some lyrics", "Eminem")
b = get_songs_lyrics([], "Eminem", 0)
# etc....

### Code Snippet 2.
- When you loop through a dataframe, by default it loops through the column labels.
- The loop variable (`col`, originally named `i`) is never used inside the loop body.


---

In [10]:
# Ten sample song-page URLs used as demo data.
links = [
    'http://www.metrolyrics.com/back-stabber-lyrics-eminem.html',
    'http://www.metrolyrics.com/beautiful-pain-lyrics-eminem.html',
    'http://www.metrolyrics.com/3-verses-lyrics-eminem.html',
    'http://www.metrolyrics.com/another-sentencing-lyrics-eminem.html',
    'http://www.metrolyrics.com/billion-bucks-lyrics-eminem.html',
    'http://www.metrolyrics.com/i-need-a-doctor-lyrics-dr-dre.html',
    'http://www.metrolyrics.com/a-drop-in-the-ocean-lyrics-eminem.html',
    'http://www.metrolyrics.com/bet-shady-20-cypher-lyrics-eminem.html',
    'http://www.metrolyrics.com/atlanta-on-fire-lyrics-eminem.html',
    'http://www.metrolyrics.com/amityville-lyrics-eminem.html',
]

In [19]:
# Single-column frame holding the URLs; head() previews the first rows.
df = pd.DataFrame({'url': links})
df.head()

Unnamed: 0,url
0,http://www.metrolyrics.com/back-stabber-lyrics...
1,http://www.metrolyrics.com/beautiful-pain-lyri...
2,http://www.metrolyrics.com/3-verses-lyrics-emi...
3,http://www.metrolyrics.com/another-sentencing-...
4,http://www.metrolyrics.com/billion-bucks-lyric...


In [22]:
# The loop variable `col` is never referenced in the body, so every pass just
# repeats the same whole-column assignments.
for col in df: #rename i to col
    # Split each URL on ".com/" and store the two parts as new columns.
    df[['url2','title']] = df['url'].str.split(".com/",expand=True)
    # Keep only the first five characters of each title.
    df['title_short'] = [x[:5] for x in df['title']]

**Better**: Remove loop:

In [20]:
# Split each URL at the literal ".com/". The dot is escaped because pandas
# treats multi-character split patterns as regular expressions, where a bare
# "." matches any character. n=1 splits at the first occurrence only, so a
# URL can never expand into more than the two target columns.
df[['url2', 'title']] = df['url'].str.split(r"\.com/", n=1, expand=True)
# Vectorised slice: first five characters of every title (missing values
# stay missing instead of raising).
df['title_short'] = df['title'].str[:5]

In [24]:
# df

When writing for-loops, make sure the loop variable (e.g. `i`) is actually used inside the loop body. Otherwise you are probably looping unnecessarily.

### 3. Code Snippet 3.
- Using 2 loops when you could do everything in the same, single loop. (saves time).

---

In [None]:
def get_lyrics(artist_name):
    """Download an artist's song pages and extract the lyrics from each one.

    The artist page on songlyrics.com is fetched and scanned for song links;
    the first match is skipped and at most the next 100 are kept. Every song
    page is saved as ``{artist_name}_songs/songs{index}.txt``; the saved files
    are then parsed for the ``<p id="songLyricsDiv">`` element.

    Args:
        artist_name: Artist name as it appears in the songlyrics.com URL.

    Returns:
        A list of ``{"lyrics": ..., "artist": artist_name}`` dicts, one per
        song, with newlines in the lyrics replaced by spaces.

    Raises:
        IndexError: If a saved page has no ``<p id="songLyricsDiv">`` element.
    """
    ret_list = []
    artist_name_request = requests.get(f'http://www.songlyrics.com/{artist_name}-lyrics/').text
    # Raw string: "\/" is not a valid escape sequence in a normal string
    # literal. The resulting pattern text is unchanged.
    pattern = r'href="(http://.+lyrics\/)"'
    songs_list = re.findall(pattern, artist_name_request)[1:101]

    songs_dir = f'{artist_name}_songs'
    os.makedirs(songs_dir, exist_ok=True)

    # First pass: download every song page and store the raw HTML on disk.
    for index, link in enumerate(songs_list):
        page_html = requests.get(link).text
        # Explicit encoding so writing and reading agree on every platform.
        with open(f'{songs_dir}/songs{index}.txt', 'w', encoding='utf-8') as songs_text:
            songs_text.write(page_html)

    # Second pass: read the stored pages back and pull out the lyrics.
    for index, _link in enumerate(songs_list):
        with open(f'{songs_dir}/songs{index}.txt', encoding='utf-8') as songs_text:
            song_html_content = songs_text.read()
        songs_soup = BeautifulSoup(song_html_content, 'html')
        lyrics = songs_soup.find_all('p', attrs={'id': "songLyricsDiv"})[0].text
        cleaned_lyrics = lyrics.replace("\n", " ")
        ret_list.append({"lyrics": cleaned_lyrics, "artist": artist_name})
    return ret_list

Better structure would be to do everything (downloading the data and cleaning it) in the same loop, before writing to a file.
- 2 separate for-loops could be combined into 1 (saves time, b/c we don't have to open and close files twice).

### 4. Code Snippet 4.
- you could do something like `os.getcwd()` to make the path more dynamic
- make the function also return the length of the list so that you don't have to hardcode the label amounts as well.

---

In [None]:
# Both lyric folders sit under the same root directory.
_LYRICS_ROOT = '/Users/pawlodkowski/lyrics/'

PATH = _LYRICS_ROOT + 'eminem/'
PATH2 = _LYRICS_ROOT + 'dualipa/'

def lyrics_to_list(path: str) -> list:
    """Read every ``*.text`` file located directly inside *path*.

    Args:
        path: Directory to search; sub-directories are not searched.

    Returns:
        A list with the full contents of each matching file, ordered by file
        path so the result is the same on every run. Empty if nothing matches.
    """
    lyrics = []
    # glob gives no ordering guarantee, so sort to keep the order reproducible.
    for file_path in sorted(glob.glob(os.path.join(path, '*.text'))):
        with open(file_path) as f_input:
            lyrics.append(f_input.read())
    return lyrics


LIST_EM = lyrics_to_list(PATH)
LIST_DL = lyrics_to_list(PATH2)

CORPUS = LIST_EM + LIST_DL

# One label per document. The counts come from the lists themselves rather
# than hard-coded numbers (e.g. 402 / 386), so LABELS always lines up with
# CORPUS even when the number of lyric files changes.
LABELS = ["EMINEM"] * len(LIST_EM) + ["DUA LIPA"] * len(LIST_DL)

Try not to hard-code anything!
- Functions and variables should be given generic names and calculated dynamically.

### 5. Code Snippet 5
- Don't Repeat Yourself (DRY)
- Benefit from a for-loop
    - even if you think you don't need a loop for now, it still makes the code cleaner / more maintainable in the long-run.

---

In [27]:

# Two class labels; y_true and y_pred are 30 random draws each, so the scores
# printed below differ from run to run.
labels = ['jazz', 'rock']
y_true = [random.choice(labels) for i in range(30)] 
y_pred = [random.choice(labels) for i in range(30)] 

# Each metric is printed twice: once with "jazz" and once with "rock" as the
# positive label.
# NOTE(review): the messages say "punk" / "indie" while pos_label is
# "jazz" / "rock" — confirm which wording is intended.
print(f'The precision using punk is: {round(precision_score(y_true, y_pred, average="binary", pos_label="jazz"), 3)}')
print(f'The precision using indie is: {round(precision_score(y_true, y_pred, average="binary", pos_label="rock"), 3)}')

print(f'The recall of using punk is: {round(recall_score(y_true, y_pred, average="binary", pos_label="jazz"), 3)}')
print(f'The recall of using indie is: {round(recall_score(y_true, y_pred, average="binary", pos_label="rock"), 3)}')

print(f'The f1-score using punk is: {round(f1_score(y_true, y_pred, average="binary", pos_label="jazz"), 3)}')
print(f'The f1-score using indie is: {round(f1_score(y_true, y_pred, average="binary", pos_label="rock"), 3)}')

# NOTE(review): accuracy_score is not imported in this notebook and may not
# accept average / pos_label — confirm before re-enabling the lines below.
# print(f'The accuracy score using punk is: {round(accuracy_score(y_true, y_pred, average="binary", pos_label="jazz"), 3)}')
# print(f'The accuracy score indie is: {round(accuracy_score(y_true, y_pred, average="binary", pos_label="rock"), 3)}')

The precision using punk is: 0.529
The precision using indie is: 0.308
The recall of using punk is: 0.5
The recall of using indie is: 0.333
The f1-score using punk is: 0.514
The f1-score using indie is: 0.32


---

**BETTER**:

In [39]:
# Report every metric once per candidate positive label.
for metric in [precision_score, recall_score, f1_score]:
    metric_name = metric.__name__
    for la in labels:
        score = round(metric(y_true, y_pred, average="binary", pos_label=la), 3)
        print(f'The {metric_name} using {la} is: {score}')
        
        

The precision_score using jazz is: 0.529
The precision_score using rock is: 0.308
The recall_score using jazz is: 0.5
The recall_score using rock is: 0.333
The f1_score using jazz is: 0.514
The f1_score using rock is: 0.32
