# Topic Modelling

From someone who does not understand Machine Learning.

In [1]:
import gensim
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LsiModel
from gensim.similarities import MatrixSimilarity

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [2]:
# One-time setup: uncomment and run once to download the NLTK resources.
# nltk.download('stopwords')
# nltk.download('punkt')

### Utils

#### Serialize

In [3]:
import pickle


def pickle_save(obj, path):
    """Serialize ``obj`` into the file at ``path``.

    The file is opened in binary write mode (an existing file is replaced)
    and the object is written with the highest available pickle protocol.
    """
    with open(path, 'wb') as sink:
        pickle.dump(obj, sink, protocol=pickle.HIGHEST_PROTOCOL)
            

def pickle_load(path):
    """Return the object unpickled from the file at ``path``.

    NOTE: unpickling can execute arbitrary code, so only load files that
    come from a trusted source.
    """
    with open(path, 'rb') as source:
        loaded = pickle.load(source)
    return loaded

#### Text Processing

In [4]:
import re


def _remove_html_tags(text):
    cleaner = re.compile('<.*?>')
    return re.sub(cleaner, '', text)


def _remove_control_chars(text):
    text = text.replace('\n','')
    text = text.replace('\t','')
    return text


def _remove_stop_words(text):
    """Split ``text`` into word tokens and drop English stop words.

    Tokens are runs of word characters, so punctuation and whitespace are
    discarded. Stop words are matched case-insensitively; the tokens that
    are kept retain their original casing.

    Returns:
        list of str: the remaining tokens, in their original order.
    """
    # A set gives constant-time membership tests for every token.
    stop_words = set(stopwords.words('english'))

    tokenizer = RegexpTokenizer(r'\w+')
    return [
        word
        for word in tokenizer.tokenize(text)
        # The NLTK list is lower-case; compare in lower case so that
        # capitalised stop words ("The", "And") are removed as well.
        if word.lower() not in stop_words
    ]


def clean_text(text):
    """Run the complete cleaning pipeline over raw ``text``.

    The steps are applied in this order: HTML tags are stripped, newline
    and tab characters are cleaned up, and finally the text is tokenized
    with English stop words removed.

    Returns:
        list of str: the remaining word tokens (not a string).
    """
    without_tags = _remove_html_tags(text)
    without_control_chars = _remove_control_chars(without_tags)
    return _remove_stop_words(without_control_chars)

#### Gather Data

In [36]:
def load_artist_names(path):
    """Read artist names from a text file containing one name per line.

    Surrounding whitespace is stripped from every name and blank lines are
    skipped, so the result never contains empty names.

    Args:
        path: Location of the UTF-8 encoded text file.

    Returns:
        list of str: the artist names in file order.
    """
    # Explicit UTF-8: names with non-ASCII characters must not depend on
    # the platform's default encoding.
    with open(path, 'r', encoding='utf-8') as f:
        stripped = (line.strip() for line in f)
        return [artist_name for artist_name in stripped if artist_name]


def get_artist_bio(artist_name: str):
    """Return the cleaned, tokenized biography for ``artist_name``.

    Placeholder implementation: the "biography" is just the artist name
    followed by the word ``data``, passed through ``clean_text``.
    """
    raw_bio = artist_name + " data"
    return clean_text(raw_bio)

# Buckle Up!

In [37]:
# True: rebuild the mapping and the model and pickle them.
# False: load both from the pickle files named below.
FROM_SCRATCH = True
# Text file with the artist names to index.
ARTIST_NAMES_PATH = 'data/artist_names.txt'
# Pickle file holding the fitted model bundle.
MODEL_PATH = 'model.pickle'
# Pickle file holding the doc-number -> artist/bio mapping.
MAPPING_PATH = 'mapping.pickle'

In [38]:
!tree -I venv  # With ! you can run bash commands in Jupyter

[01;34m.[0m
├── [00mREADME.md[0m
├── [01;34mdata[0m
│   └── [00martist_names.txt[0m
├── [00mmapping.pickle[0m
├── [00mmodel.pickle[0m
├── [00mrequirements.txt[0m
└── [00mtopic-modelling-workshop.ipynb[0m

1 directory, 6 files


## Data Cleansing and Preparation

We create so-called documents, each consisting of an artist's name and their biography in tokenized form.

The mapping looks something like this

```
{
    0: {'artist_name': '50 Cent', 'bio': ['a', 'text', 'about', '50', 'Cent']},
    1: {'artist_name': 'Eminem', 'bio': ['a', 'text', 'about', 'Eminem']},
    2: {'artist_name': 'Metallica', 'bio': ['a', 'text', 'about', 'Metallica']},
    ...
}
```

### Input Data

In [44]:
# Load the artist names from the input file and print a slice of the list
# as a sanity check.
artist_names = load_artist_names(ARTIST_NAMES_PATH)
print(artist_names[234:567])

['Anthony Danza', 'Anthony Hamilton', 'Anti Lilly', 'Antologie', 'Antonín Dvořák', 'Antwon', 'Antônio Carlos Jobim', 'Aperture Science Psychoacoustic Laboratories', 'Aphex Twin', 'Apocalyspe', 'Apollo', 'Apollo Brown', 'Apparat', 'April George', 'Aquarius Heaven', 'Aquilo', 'Arca', 'Arcade Fire', 'Arcangel', 'Archie Daggers & Tender Slider', 'Archie Lee', 'Architechs', 'Archy Marshall', 'Arctic Lake', 'Ardian Bujupi', 'Area', 'Aretha Franklin', 'Ari Lennox', 'Ari Rasilainen', 'Ariana Grande', 'Arianoknows', 'Arin Ray', 'Armageddon', 'Armando', 'Armin Rohde', 'Art Nap', 'Art Of Trance', 'Artem Kacher', 'Artful Dodger', 'Arthur Conley', 'Arthur Ross', 'Arthur Rubinstein', 'Arthur Russell', 'Artik & Asti', 'Asata', 'Ascent', 'Ash-Rock', 'Ashanti', 'Asher Roth', 'Ashley All Day', 'Assassin', 'Astero', 'Astrid', 'Astrid S', 'Astronaut Husband', 'Astronomyy', 'Astrud Gilberto', 'Athena Cage', 'Atlas Bound', 'Atmos T', 'Atom', 'Au/Ra', 'Audio Push', 'Audio88', 'August Alsina', 'August Rigo', 

### Load Artists Bios and Create Mapping

In [9]:
# Build the doc-number -> {'artist_name', 'bio'} mapping, or load the
# previously pickled one.
if FROM_SCRATCH:
    mapping = {}
    for artist_name in artist_names:
        bio = get_artist_bio(artist_name)
        if not bio:
            # Nothing left after cleaning: leave this artist out.
            continue
        # Number only the documents that are kept, so the keys are always
        # the consecutive integers 0..n-1. The similarity index built from
        # this mapping reports results by position, and those positions are
        # later used as keys into `mapping`; gaps left by skipped artists
        # would make them point at the wrong artist or raise a KeyError.
        doc_number = len(mapping)
        mapping[doc_number] = {
            'artist_name': artist_name,
            'bio': bio,
        }
    pickle_save(mapping, MAPPING_PATH)
else:
    mapping = pickle_load(MAPPING_PATH)

In [10]:
# Show the whole mapping: doc number -> artist name and tokenized bio.
print(mapping)

{0: {'artist_name': '50 Cent', 'bio': ['50', 'Cent', 'data']}, 1: {'artist_name': 'Eminem', 'bio': ['Eminem', 'data']}, 2: {'artist_name': 'Metallica', 'bio': ['Metallica', 'data']}, 3: {'artist_name': 'Obie Trice', 'bio': ['Obie', 'Trice', 'data']}}


## Create model

#### <Machine Learning Magic World 🧙‍♂️>

The following code is not there to be understood. Thanks. `¯\_(ツ)_/¯`

In [11]:
from dataclasses import dataclass


@dataclass
class MyModel:
    """Bundle of the fitted gensim objects needed to answer similarity queries.

    Keeping them in one object lets the whole model be pickled and restored
    in a single step.
    """
    # Vocabulary built from the tokenized bios; used for `doc2bow`.
    dictionary: Dictionary
    # TF-IDF model fitted on the bag-of-words corpus.
    tfidf_model: TfidfModel
    # LSI model fitted on the TF-IDF-weighted corpus.
    lsi_model: LsiModel
    # Similarity index over the LSI vectors of all documents.
    index: MatrixSimilarity
    

# Fit the model on the bios, or load the previously pickled one.
if FROM_SCRATCH:
    # One token list per document, in the iteration order of `mapping`.
    dataset = [mapping[i]['bio'] for i in mapping]
    
    # Vocabulary: maps every distinct token to an integer id.
    dct = Dictionary(dataset)

    # Bag-of-words vector for each document.
    corpus = [dct.doc2bow(line) for line in dataset]
    # Fit TF-IDF on the raw counts and re-weight the corpus with it.
    tfidf_model = TfidfModel(corpus)
    tfidf_corpus = tfidf_model[corpus]

    # Fit LSI on the TF-IDF corpus, requesting 50 latent topics.
    lsi_model = LsiModel(tfidf_corpus, id2word=dct, num_topics=50, power_iters=4)

    # Index every document's LSI vector for similarity queries.
    # NOTE(review): query results are presumably reported by corpus
    # position, which later cells use as keys into `mapping` - this only
    # lines up if the mapping keys are 0..n-1 in iteration order; confirm.
    index = MatrixSimilarity(lsi_model[tfidf_corpus])
    my_model = MyModel(dictionary=dct,
                       tfidf_model=tfidf_model,
                       lsi_model=lsi_model,
                       index=index)
    # Persist the bundle so later runs can load it instead of refitting.
    pickle_save(my_model, MODEL_PATH)
else:
    my_model = pickle_load(MODEL_PATH)




#### </Machine Learning Magic World 🧙‍♂️>

## Query the Model

Create the query entity data as we did when creating the model

In [12]:
# Tokenized bio of the artist we want to find similar artists for.
query_bio = get_artist_bio('Obie Trice')

#### <Machine Learning Magic World 🧙‍♂️>

The following code is not there to be understood. Thanks. `¯\_(ツ)_/¯`

In [13]:
# Push the query through the same steps the corpus went through:
# tokens -> bag-of-words -> TF-IDF -> LSI.
vec_bow = my_model.dictionary.doc2bow(query_bio)
vec_lsi = my_model.lsi_model[my_model.tfidf_model[vec_bow]]
# Look the query vector up in the similarity index.
similar_entities = my_model.index[vec_lsi]

#### </Machine Learning Magic World 🧙‍♂️>

## The Result

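For every doc number (see `mapping`) we now have a cosine-similarity score against the queried artist: the higher the score, the more similar the artist, with 1.0 being a perfect match (cosine similarity can in principle go as low as -1.0).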

In [14]:
# Key each similarity score by its position in `similar_entities`; the
# cells below use that position as the doc number in `mapping`.
doc_to_similarity = dict(enumerate(similar_entities))

print(doc_to_similarity)

{0: 0.0, 1: 0.0, 2: 0.0, 3: 0.99999994}


Now let's find the 10 most similar artists and bands.

In [19]:
# Doc numbers of the ten highest-scoring documents, best match first.
top_n_docs = sorted(doc_to_similarity, key=doc_to_similarity.get, reverse=True)[:10]

# Pair each of those doc numbers with its score and its artist name.
result = [
    {
        'similarity': similar_entities[doc_number],
        'artist_name': mapping[doc_number]['artist_name'],
    }
    for doc_number in top_n_docs
]

print(result)

[{'similarity': 0.99999994, 'artist_name': 'Obie Trice'}, {'similarity': 0.0, 'artist_name': '50 Cent'}, {'similarity': 0.0, 'artist_name': 'Eminem'}, {'similarity': 0.0, 'artist_name': 'Metallica'}]


## Demo

### 🤖 `I am a semi professional ML implementation AMA`

In [45]:
from pprint import pprint


def find_similar_artists(artist_name, top_n=10):
    """Pretty-print the artists most similar to ``artist_name``.

    The artist's bio is pushed through the fitted model (bag-of-words,
    TF-IDF, LSI) and compared against every indexed document.

    Args:
        artist_name: Name of the artist to query with.
        top_n: Maximum number of results to print (default 10).

    Returns:
        None. The result is printed as a list of
        ``{'similarity': ..., 'artist_name': ...}`` dicts, ordered from the
        most to the least similar.
    """
    # <Machine Learning Magic World 🧙‍♂️>
    query_bio = get_artist_bio(artist_name)
    vec_bow = my_model.dictionary.doc2bow(query_bio)
    vec_lsi = my_model.lsi_model[my_model.tfidf_model[vec_bow]]
    similar_entities = my_model.index[vec_lsi]
    # </Machine Learning Magic World 🧙‍♂️>
    doc_to_similarity = dict(enumerate(similar_entities))
    # This sort already yields most-similar-first order, so the result list
    # built from it needs no further sorting.
    top_n_docs = sorted(doc_to_similarity, key=doc_to_similarity.get, reverse=True)[:top_n]

    result = [
        {
            'similarity': similar_entities[doc_number],
            'artist_name': mapping[doc_number]['artist_name'],
        }
        for doc_number in top_n_docs
    ]
    pprint(result)

In [46]:
# Demo: print the artists most similar to Obie Trice.
find_similar_artists("Obie Trice")

[{'artist_name': 'Obie Trice', 'similarity': 0.99999994},
 {'artist_name': '50 Cent', 'similarity': 0.0},
 {'artist_name': 'Eminem', 'similarity': 0.0},
 {'artist_name': 'Metallica', 'similarity': 0.0}]
