# Data Preparation

In [1]:
import re
from os import listdir

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
# nltk.download('stopwords')
# nltk.download('punkt')

In [2]:
def load_bios(path: str, limit: int = 10) -> list:
    """Read artist biographies from the ``*-bio.txt`` files in a directory.

    Args:
        path: Directory that contains the biography text files.
        limit: Maximum number of bios to load (non-negative). Defaults to 10,
            the sample size this notebook works with. Pass ``None`` to load
            every matching file.

    Returns:
        A list with the full text of each selected file, ordered by file name.

    Raises:
        OSError: If ``path`` cannot be listed or a file cannot be opened.
    """
    # listdir() returns entries in arbitrary order; sorting makes the selected
    # sample the same on every machine and every run.
    bio_files = sorted(f for f in listdir(path) if f.endswith('-bio.txt'))
    if limit is not None:
        # Trim before reading so only the files that are returned get opened.
        bio_files = bio_files[:limit]

    bios = []
    for f in bio_files:
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(f'{path}/{f}', 'r', encoding='utf-8') as txt:
            bios.append(txt.read())
    return bios


# Load the bios from the dataset directory (the path is relative to the
# notebook's working directory) and preview the first one.
bios = load_bios('../opendata-music-artists-with-bios/bios/')
bios[0]

'Rock Metal from northwestern Spain. The band is still unsigned and  with three works on their backs. Their last release it\'s an acoustic experiment with three electric bonus tracks and it will be release on June 2006.\n\nCheck it out\nhttp://www.orfos.tk\nhttp://www.myspace.com/orfos <a href="https://www.last.fm/music/Orfos">Read more on Last.fm</a>. User-contributed text is available under the Creative Commons By-SA License; additional terms may apply.'

In [3]:
def is_bio(text: str) -> bool:
    """Heuristically decide whether ``text`` is a real biography.

    A text is rejected when either of the following holds:

    * the exact, case-sensitive phrase ``"at least"`` occurs within its first
      200 characters, or
    * it starts (case-insensitively) with ``"there are"`` or ``"there is"``.

    NOTE(review): these patterns presumably target disambiguation notes
    rather than biographies -- confirm against the dataset.

    Args:
        text: Raw biography text.

    Returns:
        ``True`` if no rejection pattern matches, ``False`` otherwise.
    """
    # Only the head of the text is inspected, and the match is case-sensitive.
    head = text[:200]
    if "at least" in head:
        return False
    # str.startswith accepts a tuple, covering both prefixes in one call.
    return not text.lower().startswith(('there are', 'there is'))


# Split the loaded texts into accepted and rejected bios. Both lists are built
# with comprehensions (not a set difference) so that duplicate texts are
# counted individually and the original order is kept, which keeps the
# reported count accurate.
real_bios = [b for b in bios if is_bio(b)]
false_bios = [b for b in bios if not is_bio(b)]
print(f"{len(false_bios)}/{len(bios)} filtered out")

0/10 filtered out


In [4]:
# Display the rejected texts so the is_bio filter can be checked by eye.
false_bios

[]

In [5]:
def remove_html_tags(text: str) -> str:
    """Return ``text`` with every ``<...>`` span deleted.

    A span runs from a ``<`` to the nearest following ``>``. Because ``.``
    does not match a newline here, a tag that is broken across lines is left
    untouched. The text between tags is kept.

    Args:
        text: String that may contain HTML markup.

    Returns:
        The string without the matched tag spans.
    """
    tag_pattern = '<.*?>'  # non-greedy: stop at the first closing bracket
    return re.sub(tag_pattern, '', text)

# Strip HTML tags from every bio that passed the filter; preview the first.
bios_wo_html = [remove_html_tags(b) for b in real_bios]
bios_wo_html[0]

"Rock Metal from northwestern Spain. The band is still unsigned and  with three works on their backs. Their last release it's an acoustic experiment with three electric bonus tracks and it will be release on June 2006.\n\nCheck it out\nhttp://www.orfos.tk\nhttp://www.myspace.com/orfos Read more on Last.fm. User-contributed text is available under the Creative Commons By-SA License; additional terms may apply."

In [6]:
# Keep only the text before the first 'Read more on Last.fm' marker, which
# drops the footer (link text and licence notice) that follows the bio.
# A bio without the marker is kept unchanged.
bios_wo_lastfm_text = [b.split('Read more on Last.fm')[0] for b in bios_wo_html]
bios_wo_lastfm_text[0]

"Rock Metal from northwestern Spain. The band is still unsigned and  with three works on their backs. Their last release it's an acoustic experiment with three electric bonus tracks and it will be release on June 2006.\n\nCheck it out\nhttp://www.orfos.tk\nhttp://www.myspace.com/orfos "

In [7]:
def remove_control_chars(text: str) -> str:
    """Replace newline, tab and carriage-return characters with spaces.

    Each of the three characters becomes exactly one space, so the length of
    the string is unchanged and runs of them turn into runs of spaces. No
    other control characters are touched.

    Args:
        text: Input string.

    Returns:
        The string with ``\\n``, ``\\t`` and ``\\r`` replaced by ``' '``.
    """
    whitespace_to_space = str.maketrans({'\n': ' ', '\t': ' ', '\r': ' '})
    return text.translate(whitespace_to_space)


# Flatten each bio onto a single line by replacing newlines, tabs and carriage
# returns with spaces; preview the first result.
bios_wo_control_chars = [remove_control_chars(b) for b in bios_wo_lastfm_text]
bios_wo_control_chars[0]

"Rock Metal from northwestern Spain. The band is still unsigned and  with three works on their backs. Their last release it's an acoustic experiment with three electric bonus tracks and it will be release on June 2006.  Check it out http://www.orfos.tk http://www.myspace.com/orfos "

In [8]:
# Matches either an http(s) URL (scheme required, so ordinary words such as
# "httpd" are left alone) or a scheme-less link that starts with "www." at a
# word boundary. A match extends to the next whitespace character.
_URL_PATTERN = re.compile(r'https?://\S+|\bwww\.\S+', flags=re.IGNORECASE)


def remove_urls(text: str) -> str:
    """Delete URLs from ``text``.

    Removes ``http://`` / ``https://`` URLs (case-insensitive scheme) and bare
    ``www.`` links. Everything up to the next whitespace is removed, so
    punctuation attached to the end of a URL goes with it. Surrounding
    whitespace is left as is.

    Args:
        text: Input string.

    Returns:
        The string with every matched URL replaced by an empty string.
    """
    return _URL_PATTERN.sub('', text)

# Remove URLs from every single-line bio; preview the first result.
bios_wo_urls = [remove_urls(b) for b in bios_wo_control_chars]
bios_wo_urls[0]

"Rock Metal from northwestern Spain. The band is still unsigned and  with three works on their backs. Their last release it's an acoustic experiment with three electric bonus tracks and it will be release on June 2006.  Check it out   "

In [9]:
import spacy

In [10]:
# Install the 'en_core_web_sm' spaCy pipeline that is later loaded with
# spacy.load('en_core_web_sm').
# NOTE(review): this shell command runs on every execution of the notebook and
# uses whichever `spacy` executable is found on PATH -- confirm that it
# belongs to the kernel's environment.
!spacy download en_core_web_sm

Collecting en-core-web-sm==3.4.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.1/en_core_web_sm-3.4.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m22.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [11]:
# Loaded spaCy pipelines keyed by model name, so that a model is loaded once
# per kernel session instead of once per lemmatize() call.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name: str = 'en_core_web_sm'):
    """Return the cached spaCy pipeline for ``model_name``, loading it on first use."""
    if model_name not in _SPACY_PIPELINES:
        # Parser and NER stay disabled: only token.pos_ and token.lemma_ are read.
        _SPACY_PIPELINES[model_name] = spacy.load(model_name, disable=['parser', 'ner'])
    return _SPACY_PIPELINES[model_name]


def lemmatize(text: str, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV'), nlp=None) -> str:
    """Reduce ``text`` to the lemmas of tokens with an allowed part of speech.

    Args:
        text: Input string.
        allowed_postags: Collection of coarse POS tags (compared against
            ``token.pos_``) to keep. Tokens with any other tag are dropped.
            The default is an immutable tuple; lists are accepted as before.
        nlp: Optional callable that turns a string into an iterable of tokens
            exposing ``pos_`` and ``lemma_``. When omitted, the cached
            ``en_core_web_sm`` pipeline is used.

    Returns:
        The kept lemmas joined by single spaces, in document order. An empty
        string is returned when no token qualifies.
    """
    if nlp is None:
        # Loading a pipeline is expensive; reuse one instance across calls.
        nlp = _get_spacy_pipeline()
    return ' '.join(
        token.lemma_ for token in nlp(text) if token.pos_ in allowed_postags
    )


# Lemmatize every cleaned bio with the default POS filter (nouns, adjectives,
# verbs, adverbs); preview the first result.
bios_lemmatized = [lemmatize(b) for b in bios_wo_urls]
bios_lemmatized[0]



'northwestern band still unsigned work back last release acoustic experiment electric bonus track release check'

In [16]:
import gensim


def gen_words(text: str) -> list:
    """Tokenize ``text`` with gensim's ``simple_preprocess``.

    ``deacc=True`` is passed through to gensim so that accent marks are
    removed from the tokens.

    Args:
        text: Input string.

    Returns:
        The list of tokens produced by ``simple_preprocess``.
    """
    tokens = gensim.utils.simple_preprocess(text, deacc=True)
    return tokens


# Turn every lemmatized bio into a list of word tokens; preview the first.
data_words = [gen_words(b) for b in bios_lemmatized]
data_words[0]

['northwestern',
 'band',
 'still',
 'unsigned',
 'work',
 'back',
 'last',
 'release',
 'acoustic',
 'experiment',
 'electric',
 'bonus',
 'track',
 'release',
 'check']

# Visualization

In [13]:
import pyLDAvis
import pyLDAvis.gensim_models

  from imp import reload
