# Topic Modelling

From someone who does not understand Machine Learning.

In [1]:
import gensim
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LsiModel
from gensim.similarities import MatrixSimilarity

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

from pprint import pprint

In [2]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/michael/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/michael/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

### Utils

#### Serialize

In [3]:
import pickle


def pickle_save(obj, path):
     with open(path, 'wb') as out:
        pickle.dump(obj, out, protocol=pickle.HIGHEST_PROTOCOL)
            

def pickle_load(path):
     with open(path, 'rb') as obj:
        return pickle.load(obj)

#### Text Processing

In [21]:
import re


def _remove_html_tags(text):
    cleaner = re.compile('<.*?>')
    return re.sub(cleaner, '', text)


def _remove_control_chars(text):
    text = text.replace('\n','')
    text = text.replace('\t','')
    return text


def _remove_stop_words(text):
    stop_words = list()
    stop_words.extend(stopwords.words('english'))

    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(text)
    result = []
    for word in words:
        if word not in stop_words:
            result.append(word)
    return " ".join(list(result))


def clean_text(text):
    text = _remove_html_tags(text)
    text = text.lower()
    text = _remove_control_chars(text)
    text = _remove_stop_words(text)
    return text

#### Gather Data

In [5]:
import pylast
import settings


client = pylast.LastFMNetwork(api_key=settings.API_KEY,
                              api_secret=settings.API_SECRET,
                              username=settings.USER,
                              password_hash=pylast.md5(settings.PASSWORD))


def get_artist_bio(artist_name: str):
    try:
        artist = client.get_artist(artist_name)
        bio = artist.get_bio_content()

        if bio and 'There are at least' not in bio:  # No perfect match found
            return bio
    except pylast.WSError:
        print(f"Bio for {artist_name} cannot be found.")
    except pylast.MalformedResponseError:
        print(f"Bio for {artist_name} cannot be found.")

# Buckle Up!

In [6]:
FROM_SCRATCH = True
INPUT_DATA_PATH = 'data/data.csv'
MODEL_PATH = 'model.pickle'

In [7]:
!tree -I venv # With ! you can run bash commands in Jupyter

[01;34m.[0m
├── [00mREADME.md[0m
├── [01;34m__pycache__[0m
│   └── [00msettings.cpython-310.pyc[0m
├── [00mcreate_csv.py[0m
├── [01;34mdata[0m
│   └── [00mdata.csv[0m
├── [00mgather_artist_bios.py[0m
├── [00mgather_artist_names.py[0m
├── [00mmapping.pickle[0m
├── [00mmodel.pickle[0m
├── [00mrequirements.txt[0m
├── [00msettings.py[0m
└── [00mtopic-modelling-workshop.ipynb[0m

2 directories, 11 files


## Data Cleansing and Preparation

We create so called documents that consist of the artist name and their biography in tokenized format.

The mapping looks something like this

```
{
    1: {'artist_name': '50 Cent', 'bio': ['a', 'text', 'about', '50', 'Cent'},
    2: {'artist_name': 'Eminem', 'bio': ['a', 'text', 'about', 'Eminem'},
    3: {'artist_name': 'Metallica', 'bio': ['a', 'text', 'about', 'Metallica'},
    ...
}
```

### Input Data

In [8]:
import csv


def load_csv(path: str) -> list:
    with open(path, 'r', newline='') as f:
        csv_reader = csv.reader(f, delimiter=',', quotechar='"')
        return [row for row in csv_reader]

    
rows = load_csv(INPUT_DATA_PATH)

In [9]:
pprint(rows[2727])

['Abhorration',
 'There is more than one artist under this name: 1) SBDM band from California, '
 'USA. Formed in 2011, they released their debut EP "Infatuation with the '
 'Accursed Enmity" in 2012.  2) Deathrash band from Oslo, Norway formed in '
 '2020 with their 2021 demo "After Winter Comes War" as their only release as '
 'of yet. Features members from  Condor and Obliteration <a '
 'href="https://www.last.fm/music/Abhorration">Read more on Last.fm</a>. '
 'User-contributed text is available under the Creative Commons By-SA License; '
 'additional terms may apply.']


In [10]:
mapping = {}
for doc_number, r in enumerate(rows, 1):
    mapping[doc_number] = {
        'artist_name': r[0], 
        'bio': clean_text(r[1])
    }

In [11]:
pprint(mapping[2728])

{'artist_name': 'Abhorration',
 'bio': ['one',
         'artist',
         'name',
         '1',
         'sbdm',
         'band',
         'california',
         'usa',
         'formed',
         '2011',
         'released',
         'debut',
         'ep',
         'infatuation',
         'accursed',
         'enmity',
         '2012',
         '2',
         'deathrash',
         'band',
         'oslo',
         'norway',
         'formed',
         '2020',
         '2021',
         'demo',
         'winter',
         'comes',
         'war',
         'release',
         'yet',
         'features',
         'members',
         'condor',
         'obliteration',
         'read',
         'last',
         'fm',
         'user',
         'contributed',
         'text',
         'available',
         'creative',
         'commons',
         'sa',
         'license',
         'additional',
         'terms',
         'may',
         'apply']}


## Create model

#### <Machine Learning Magic World 🧙‍♂️>

The following code is not there to be understood. Thanks. `¯\_(ツ)_/¯`

In [22]:
from dataclasses import dataclass


@dataclass
class MyModel:
    dictionary: Dictionary
    tfidf_model: TfidfModel
    lsi_model: LsiModel
    index: MatrixSimilarity
    

if FROM_SCRATCH:
    dataset = [mapping[i]['bio'] for i in mapping]
    
    dct = Dictionary(dataset)

    corpus = [dct.doc2bow(line) for line in dataset]
    tfidf_model = TfidfModel(corpus)
    tfidf_corpus = tfidf_model[corpus]

    lsi_model = LsiModel(tfidf_corpus, id2word=dct, num_topics=15, power_iters=4)

    index = MatrixSimilarity(lsi_model[tfidf_corpus])
    my_model = MyModel(dictionary=dct,
                       tfidf_model=tfidf_model,
                       lsi_model=lsi_model,
                       index=index)
    pickle_save(my_model, MODEL_PATH)
else:
    my_model = pickle_load(MODEL_PATH)




#### </Machine Learning Magic World 🧙‍♂️>

## Query the Model

Create the query entity data as we did when creating the model

In [13]:
query_bio = clean_text(get_artist_bio('Britney Spears'))

#### <Machine Learning Magic World 🧙‍♂️>

The following code is not there to be understood. Thanks. `¯\_(ツ)_/¯`

In [14]:
vec_bow = my_model.dictionary.doc2bow(query_bio)
vec_lsi = my_model.lsi_model[my_model.tfidf_model[vec_bow]]
similar_entities = my_model.index[vec_lsi]

#### </Machine Learning Magic World 🧙‍♂️>

## The Result

We got all the doc numbers (see `mapping`) and how similar they are to the queried artist (0.0 to 1.0). 

In [15]:
doc_to_similarity = dict(enumerate(similar_entities))
pprint(doc_to_similarity)

{0: -0.0113136275,
 1: 0.07409112,
 2: 0.17584795,
 3: 0.06266001,
 4: 0.12325067,
 5: 0.08256437,
 6: 0.09130259,
 7: 0.048226744,
 8: 0.25206625,
 9: 0.21663222,
 10: 0.08024966,
 11: 0.2015404,
 12: 0.06808707,
 13: 1.3808603e-06,
 14: 0.013986649,
 15: 0.14630723,
 16: 0.030662872,
 17: 0.064500265,
 18: 0.21920243,
 19: 0.086946376,
 20: 0.108708665,
 21: 0.32941443,
 22: 0.2001233,
 23: 0.13427219,
 24: 0.08554793,
 25: 0.10470651,
 26: 0.13204582,
 27: 0.11114198,
 28: 0.10063235,
 29: 0.055348158,
 30: 0.20797938,
 31: 0.09364907,
 32: 0.02613616,
 33: 0.030814148,
 34: 0.3932996,
 35: 0.15726888,
 36: 0.32313403,
 37: 0.16279078,
 38: 0.14467505,
 39: 0.028131373,
 40: 0.1144508,
 41: 0.0346746,
 42: 0.24085106,
 43: 0.06333703,
 44: 0.08889941,
 45: 0.172476,
 46: -0.00012283772,
 47: 0.45864564,
 48: 0.12608984,
 49: 0.016959915,
 50: 0.39596814,
 51: 0.15313189,
 52: 0.0135233905,
 53: 0.23621812,
 54: 0.15916708,
 55: 0.13304028,
 56: 0.023936536,
 57: -0.004681714,
 58: 0

Now let's find the top 10 similar artists and bands.

In [16]:
top_n_docs = sorted(doc_to_similarity, key=doc_to_similarity.get, reverse=True)[:10]

result = []
for doc_number in top_n_docs:
    result.append({
        'similarity': similar_entities[doc_number],
        'artist_name': mapping[doc_number]['artist_name']
    })
    
pprint(result)

[{'artist_name': 'Scrape', 'similarity': 1.0},
 {'artist_name': 'Thy Flesh Consumed', 'similarity': 0.8369814},
 {'artist_name': 'Esoteria', 'similarity': 0.8077111},
 {'artist_name': 'Namland', 'similarity': 0.79760706},
 {'artist_name': 'john vanderslice', 'similarity': 0.7870548},
 {'artist_name': 'devastations', 'similarity': 0.78149366},
 {'artist_name': 'The Prototype', 'similarity': 0.77928066},
 {'artist_name': 'Loud Crowd', 'similarity': 0.77628314},
 {'artist_name': 'Warthrone', 'similarity': 0.76615566},
 {'artist_name': 'Sexual Disfunction', 'similarity': 0.763816}]


## Demo

### 🤖 `I am a semi professional ML implementation AMA`

In [17]:
def find_similar_artists(artist_name):
    # <Machine Learning Magic World 🧙‍♂️>
    query_bio = clean_text(get_artist_bio(artist_name))
    vec_bow = my_model.dictionary.doc2bow(query_bio)
    vec_lsi = my_model.lsi_model[my_model.tfidf_model[vec_bow]]
    similar_entities = my_model.index[vec_lsi]
    # </Machine Learning Magic World 🧙‍♂️>
    doc_to_similarity = dict(enumerate(similar_entities))
    top_n_docs = sorted(doc_to_similarity, key=doc_to_similarity.get, reverse=True)[:10]

    result = []
    for doc_number in top_n_docs:
        result.append({
            'similarity': similar_entities[doc_number],
            'artist_name': mapping[doc_number]['artist_name']
        })
    sorted(result, key=lambda d: d['similarity'])
    pprint(result)

In [18]:
find_similar_artists("Eminem")

[{'artist_name': 'Dark Haven', 'similarity': 1.0},
 {'artist_name': 'darediablo', 'similarity': 0.6937217},
 {'artist_name': 'Lizzy McAlpine', 'similarity': 0.6332766},
 {'artist_name': 'Theatre of Tragedy', 'similarity': 0.5827793},
 {'artist_name': 'patrick phelan', 'similarity': 0.5790529},
 {'artist_name': 'Naisian', 'similarity': 0.57903266},
 {'artist_name': 'Wangelen', 'similarity': 0.5718372},
 {'artist_name': 'Sexual Disfunction', 'similarity': 0.5636923},
 {'artist_name': 'Dog Eats Flesh', 'similarity': 0.56205755},
 {'artist_name': 'Night Drive', 'similarity': 0.5615702}]


In [19]:
def find_similar_artists_by_prose(prose: str):
    clean_prose = clean_text(prose)
    # <Machine Learning Magic World 🧙‍♂️>
    vec_bow = my_model.dictionary.doc2bow(clean_prose)
    vec_lsi = my_model.lsi_model[my_model.tfidf_model[vec_bow]]
    similar_entities = my_model.index[vec_lsi]
    # </Machine Learning Magic World 🧙‍♂️>
    doc_to_similarity = dict(enumerate(similar_entities))
    top_n_docs = sorted(doc_to_similarity, key=doc_to_similarity.get, reverse=True)[:10]

    result = []
    for doc_number in top_n_docs:
        result.append({
            'similarity': similar_entities[doc_number],
            'artist_name': mapping[doc_number]['artist_name']
        })
    sorted(result, key=lambda d: d['similarity'])
    pprint(result)

In [20]:
prose = "old school rap music from new york"
find_similar_artists_by_prose(prose)

[{'artist_name': 'Jamaram', 'similarity': 0.58269525},
 {'artist_name': 'Fabrizio Paterlini', 'similarity': 0.5650015},
 {'artist_name': 'jack oblivian', 'similarity': 0.5218149},
 {'artist_name': 'Infernum', 'similarity': 0.50152487},
 {'artist_name': 'Big Audio Dynamite', 'similarity': 0.48674136},
 {'artist_name': 'Vulnerable', 'similarity': 0.485658},
 {'artist_name': 'In Peccatum', 'similarity': 0.4850601},
 {'artist_name': 'Day Fly', 'similarity': 0.47404283},
 {'artist_name': 'Prescription Pain', 'similarity': 0.46799773},
 {'artist_name': 'Skepsis', 'similarity': 0.4635574}]


## Open Questions

* How to update model without reading all data again?
* Do too many artists from one genre sabotage the result?


## Problems That I faced

* I did not save artists without bios. So everytime my data gathering broke, the rerunning script had to ask for all those artists again. 
* Artists that were duplicates but with uppercase/lowercase could not be added again to git index. Git does not seem to be the best blob storage.