In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Data Cleaning

<img src="docu/pictures/data-cleaning.webp" style="width: 60%; align: center;" />

## Loading the Data

In [2]:
from os import listdir


def load_bios(path: str) -> list:
    bios = list()
    for f in listdir(path):
         if f.endswith('-bio.txt'):
                with open(f'{path}/{f}', 'r') as txt:
                    bio = txt.read()
                bios.append(bio)
    return bios


bios = load_bios('../opendata-music-artists-with-bios/bios/')[:100]
bios[0]

'Rock Metal from northwestern Spain. The band is still unsigned and  with three works on their backs. Their last release it\'s an acoustic experiment with three electric bonus tracks and it will be release on June 2006.\n\nCheck it out\nhttp://www.orfos.tk\nhttp://www.myspace.com/orfos <a href="https://www.last.fm/music/Orfos">Read more on Last.fm</a>. User-contributed text is available under the Creative Commons By-SA License; additional terms may apply.'

## Removing Ambiguous Data

In [3]:
def is_bio(text: str) -> bool:
    if "at least" in text[:200]:
        return False
    elif text.lower().startswith('there are'):
        return False
    elif text.lower().startswith('there is'):
        return False
    return True


real_bios = [b for b in bios if is_bio(b)]
false_bios = list(set(bios).difference(set(real_bios)))
print(f"{len(false_bios)}/{len(bios)} filtered out")

5/100 filtered out


## Removing HTML Code, Control Chars, URLs

In [4]:
import re


def remove_html_tags(text: str) -> str:
    cleaner = re.compile('<.*?>')
    return re.sub(cleaner, '', text)

bios_wo_html = [remove_html_tags(b) for b in real_bios]
bios_wo_html[0]

"Rock Metal from northwestern Spain. The band is still unsigned and  with three works on their backs. Their last release it's an acoustic experiment with three electric bonus tracks and it will be release on June 2006.\n\nCheck it out\nhttp://www.orfos.tk\nhttp://www.myspace.com/orfos Read more on Last.fm. User-contributed text is available under the Creative Commons By-SA License; additional terms may apply."

In [5]:
def remove_control_chars(text: str) -> str:
    text = text.replace('\n',' ')
    text = text.replace('\t',' ')
    text = text.replace('\r',' ')
    return text


bios_wo_control_chars = [remove_control_chars(b) for b in bios_wo_html]
bios_wo_control_chars[0]

"Rock Metal from northwestern Spain. The band is still unsigned and  with three works on their backs. Their last release it's an acoustic experiment with three electric bonus tracks and it will be release on June 2006.  Check it out http://www.orfos.tk http://www.myspace.com/orfos Read more on Last.fm. User-contributed text is available under the Creative Commons By-SA License; additional terms may apply."

In [6]:
def remove_urls(text: str) -> str:
    return re.sub(r'http\S+', '', text)

bios_wo_urls = [remove_urls(b) for b in bios_wo_control_chars]
bios_wo_urls[0]

"Rock Metal from northwestern Spain. The band is still unsigned and  with three works on their backs. Their last release it's an acoustic experiment with three electric bonus tracks and it will be release on June 2006.  Check it out   Read more on Last.fm. User-contributed text is available under the Creative Commons By-SA License; additional terms may apply."

In [7]:
bios_wo_lastfm_text = [b.split('Read more on Last.fm')[0] for b in bios_wo_urls]
bios_wo_lastfm_text[0]

"Rock Metal from northwestern Spain. The band is still unsigned and  with three works on their backs. Their last release it's an acoustic experiment with three electric bonus tracks and it will be release on June 2006.  Check it out   "

## Converting Words to Their Root Form

In [8]:
import spacy

In [9]:
# !spacy download en_core_web_sm

In [10]:
def lemmatize(text: str, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']) -> str:
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])
    doc = nlp(text)
    new_text = []
    for token in doc:
        if token.pos_ in allowed_postags:
            new_text.append(token.lemma_)
    return ' '.join(new_text)

from joblib import Parallel, delayed
bios_lemmatized = Parallel(n_jobs=8, verbose=5, backend="loky")(map(delayed(lemmatize), bios_wo_lastfm_text))


# bios_lemmatized = [lemmatize(b) for b in bios_wo_lastfm_text]
bios_lemmatized[0]

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done   2 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done  56 tasks      | elapsed:    2.9s
[Parallel(n_jobs=8)]: Done  95 out of  95 | elapsed:    4.4s finished


'northwestern band still unsigned work back last release acoustic experiment electric bonus track release check'

## Creating List of Words

In [11]:
import gensim


def gen_words(text: str) -> list:
    return gensim.utils.simple_preprocess(text, deacc=True)


data_words = [gen_words(b) for b in bios_lemmatized]
data_words[0]

['northwestern',
 'band',
 'still',
 'unsigned',
 'work',
 'back',
 'last',
 'release',
 'acoustic',
 'experiment',
 'electric',
 'bonus',
 'track',
 'release',
 'check']

# Create LDA Model

<img src="docu/pictures/creating-the-model.jpg" style="width: 60%; align: center;" />

In [12]:
#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import LdaModel


id2word = corpora.Dictionary(data_words)
corpus = [id2word.doc2bow(text) for text in data_words]
word = id2word[[0][:1][0]]
print (corpus[0][0:20])
print (word)

lda_model = LdaModel(corpus=corpus,
                     id2word=id2word,
                     num_topics=15,
                     random_state=100,
                     update_every=1,
                     chunksize=100,
                     passes=10,
                     alpha="auto")

[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 2), (10, 1), (11, 1), (12, 1), (13, 1)]
acoustic


# Visualization

<img src="docu/pictures/visualization.jpg" style="width: 60%; align: center;" />

In [13]:
import pyLDAvis
import pyLDAvis.gensim_models


pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus, id2word, mds="mmds", R=30)
vis

  from imp import reload
  by='saliency', ascending=False).head(R).drop('saliency', 1)
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
  from imp import reload
