# Topic Modelling

From someone who does not understand Machine Learning.

In [34]:
import gensim
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LsiModel
from gensim.similarities import MatrixSimilarity

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [35]:
# nltk.download('stopwords')
# nltk.download('punkt')

### Utils

#### Serialize

In [39]:
import pickle


def pickle_save(obj, path):
    """Write ``obj`` to the file at ``path`` as a pickle.

    The file is opened in binary write mode (an existing file is overwritten)
    and the highest pickle protocol supported by the interpreter is used.
    """
    with open(path, mode='wb') as sink:
        pickler = pickle.Pickler(sink, protocol=pickle.HIGHEST_PROTOCOL)
        pickler.dump(obj)
            

def pickle_load(path):
    """Read the pickle stored at ``path`` and return the restored object.

    NOTE: unpickling can execute arbitrary code, so only load files that were
    produced by a trusted source (e.g. by ``pickle_save`` in this notebook).
    """
    with open(path, mode='rb') as source:
        restored = pickle.load(source)
    return restored

#### Text Processing

In [42]:
import re


def _remove_html_tags(text):
    cleaner = re.compile('<.*?>')
    return re.sub(cleaner, '', text)


def _remove_control_chars(text):
    text = text.replace('\n','')
    text = text.replace('\t','')
    return text


def _remove_stop_words(text):
    """Split ``text`` into word tokens and drop English stop words.

    Tokens are runs of word characters, so punctuation and whitespace are
    discarded. Stop words are matched case-insensitively (NLTK's English list
    is lower-case, so a capitalised "The" would otherwise slip through); the
    tokens that are kept retain their original casing.

    Returns:
        list of str: the remaining tokens, in their original order.
    """
    # A set gives O(1) membership tests instead of scanning a list per token.
    stop_words = set(stopwords.words('english'))

    tokenizer = RegexpTokenizer(r'\w+')
    return [
        word for word in tokenizer.tokenize(text)
        if word.lower() not in stop_words
    ]


def clean_text(text):
    """Run the full cleaning pipeline on a raw string.

    Steps, in order: strip HTML tags, remove control characters, then
    tokenize and drop stop words. Note that the result is whatever
    ``_remove_stop_words`` returns (a list of tokens), not a string.
    """
    without_tags = _remove_html_tags(text)
    without_controls = _remove_control_chars(without_tags)
    return _remove_stop_words(without_controls)

#### Gather Data

In [49]:
def get_artist_bio(artist_name: str):
    """Return the cleaned, tokenized "bio" for ``artist_name``.

    This is a stand-in for a real data source: the bio text is simply the
    artist name followed by the word "data", passed through ``clean_text``.
    """
    placeholder_bio = artist_name + " data"
    return clean_text(placeholder_bio)

# Buckle Up!

In [44]:
# When True, the mapping and the model are rebuilt and written to the paths
# below; when False, both are loaded from those pickle files instead.
FROM_SCRATCH = True
# Pickle file that stores the trained model bundle.
MODEL_PATH = 'model.pickle'
# Pickle file that stores the doc-number -> artist mapping.
MAPPING_PATH = 'mapping.pickle'

In [96]:
!ls  # With ! you can run bash commands in Jupyter

README.md                      requirements.txt
mapping.pickle                 topic-modelling-workshop.ipynb
model.pickle                   [1m[36mvenv[m[m


## Data Cleansing and Preparation

We create so-called documents, each consisting of an artist's name and their biography in tokenized form.

The mapping looks something like this

```
{
    0: {'artist_name': '50 Cent', 'bio': ['a', 'text', 'about', '50', 'Cent']},
    1: {'artist_name': 'Eminem', 'bio': ['a', 'text', 'about', 'Eminem']},
    2: {'artist_name': 'Metallica', 'bio': ['a', 'text', 'about', 'Metallica']},
    ...
}
```

### Input Data

In [98]:
# Artists that make up the corpus; the order of this list determines the
# order in which their documents are created.
artist_names = [
    '50 Cent',
    'Eminem',
    'Metallica',
    'Obie Trice'
]

# Preview at most the first 25 names.
print(artist_names[:25])

['50 Cent', 'Eminem', 'Metallica', 'Obie Trice']


### Load Artists Bios and Create Mapping

In [63]:
if FROM_SCRATCH:
    # Doc numbers must be contiguous (0, 1, 2, ...): the similarity index is
    # built from the bios in mapping order and is later addressed by position,
    # so a gap left by an artist with an empty bio would shift every following
    # lookup (or raise KeyError). Number only the artists that are kept.
    mapping = {}
    for artist_name in artist_names:
        bio = get_artist_bio(artist_name)
        if not bio:
            continue  # nothing to index for this artist
        mapping[len(mapping)] = {
            'artist_name': artist_name,
            'bio': bio,
        }
    pickle_save(mapping, MAPPING_PATH)
else:
    # Reuse the mapping written by an earlier from-scratch run.
    mapping = pickle_load(MAPPING_PATH)

In [64]:
# Show the doc-number -> {'artist_name', 'bio'} mapping.
print(mapping)

{0: {'artist_name': '50 Cent', 'bio': ['50', 'Cent', 'data']},
 1: {'artist_name': 'Eminem', 'bio': ['Eminem', 'data']},
 2: {'artist_name': 'Metallica', 'bio': ['Metallica', 'data']},
 3: {'artist_name': 'Obie Trice', 'bio': ['Obie', 'Trice', 'data']}}


## Create model

#### <Machine Learning Magic World 🧙‍♂️>

The following code is not there to be understood. Thanks. `¯\_(ツ)_/¯`

In [65]:
from dataclasses import dataclass


@dataclass
class MyModel:
    """Bundle of the gensim objects that a similarity query needs.

    Keeping them in one object lets the notebook pickle and restore the whole
    pipeline as a single unit.
    """
    # Vocabulary used to turn a token list into a bag-of-words vector.
    dictionary: Dictionary
    # TF-IDF weighting fitted on the bag-of-words corpus.
    tfidf_model: TfidfModel
    # LSI model fitted on the TF-IDF corpus.
    lsi_model: LsiModel
    # Similarity index over the LSI vectors of all documents.
    index: MatrixSimilarity
    

if FROM_SCRATCH:
    # One token list per document, in the mapping's iteration order.
    dataset = [mapping[i]['bio'] for i in mapping]
    
    # Vocabulary over the tokens of all bios.
    dct = Dictionary(dataset)

    # Bag-of-words vector per bio, then a TF-IDF model fitted on that corpus
    # and applied back to it.
    corpus = [dct.doc2bow(line) for line in dataset]
    tfidf_model = TfidfModel(corpus)
    tfidf_corpus = tfidf_model[corpus]

    # NOTE(review): num_topics=50 and power_iters=4 are unexplained magic
    # numbers; the sample corpus here has only four documents. Confirm the
    # intended values.
    lsi_model = LsiModel(tfidf_corpus, id2word=dct, num_topics=50, power_iters=4)

    # Index over the LSI vectors of every document. The query cells treat
    # position i in the index's result as doc number i of `mapping`.
    index = MatrixSimilarity(lsi_model[tfidf_corpus])
    my_model = MyModel(dictionary=dct,
                       tfidf_model=tfidf_model,
                       lsi_model=lsi_model,
                       index=index)
    # Persist the bundle so later runs can set FROM_SCRATCH = False.
    pickle_save(my_model, MODEL_PATH)
else:
    # Restore the bundle written by an earlier from-scratch run.
    my_model = pickle_load(MODEL_PATH)




#### </Machine Learning Magic World 🧙‍♂️>

## Query the Model

Create the data for the query artist the same way we did when creating the model.

In [87]:
# Build the query tokens with the same helper that produced the corpus bios,
# so the query goes through identical cleaning. (`get_entity_data` is not
# defined in this notebook and fails with NameError on a fresh kernel.)
query_bio = get_artist_bio('Obie Trice')

#### <Machine Learning Magic World 🧙‍♂️>

The following code is not there to be understood. Thanks. `¯\_(ツ)_/¯`

In [88]:
# Query tokens -> bag-of-words, using the dictionary stored in the model.
vec_bow = my_model.dictionary.doc2bow(query_bio)
# Bag-of-words -> TF-IDF -> LSI: the same chain of models that was applied to
# the corpus when the model was built.
vec_lsi = my_model.lsi_model[my_model.tfidf_model[vec_bow]]
# Look the query vector up in the index; the result holds one score per
# indexed document.
similar_entities = my_model.index[vec_lsi]

#### </Machine Learning Magic World 🧙‍♂️>

## The Result

We get a score for every doc number (see `mapping`) that tells us how well that document matches the queried artist (0.0 to 1.0).

In [89]:
# The position of a score in `similar_entities` is used as its doc number.
# NOTE(review): despite the variable name, these look like similarity scores
# rather than probabilities — confirm against gensim's MatrixSimilarity docs.
doc_number_to_propability = dict(enumerate(similar_entities))

print(doc_number_to_propability)

{0: 0.0, 1: 0.0, 2: 2.9802322e-08, 3: 1.0}


Now let's find the top 10 matches.

In [90]:
# Doc numbers ordered from best to worst score, capped at the top 10.
# (The undefined name `d` is replaced by the score dict built in the previous
# cell, `doc_number_to_propability`.)
top_n_docs = sorted(
    doc_number_to_propability,
    key=doc_number_to_propability.get,
    reverse=True,
)[:10]

# Pair each selected doc number with its score and the artist it stands for.
result = [
    {
        'propability': similar_entities[doc_number],
        'artist_name': mapping[doc_number]['artist_name'],
    }
    for doc_number in top_n_docs
]

print(result)

[{'artist_name': 'Obie Trice', 'propability': 1.0},
 {'artist_name': 'Metallica', 'propability': 2.9802322e-08},
 {'artist_name': '50 Cent', 'propability': 0.0},
 {'artist_name': 'Eminem', 'propability': 0.0}]


## Demo

### 🤖 `I am a semi professional ML implementation AMA`

In [99]:
def find_similar_artists(artist_name, top_n=10):
    """Print the indexed artists that best match ``artist_name``.

    Args:
        artist_name: Name of the artist to query for. It does not have to be
            one of the artists stored in ``mapping``.
        top_n: Maximum number of matches to print (default 10).

    Prints a list of ``{'propability': score, 'artist_name': name}`` dicts,
    best match first. Returns ``None``.
    """
    # Build the query with the same helper used for the corpus bios
    # (`get_entity_data` is not defined in this notebook).
    query_bio = get_artist_bio(artist_name)
    vec_bow = my_model.dictionary.doc2bow(query_bio)
    vec_lsi = my_model.lsi_model[my_model.tfidf_model[vec_bow]]
    similar_entities = my_model.index[vec_lsi]

    # Position in the score array is used as the doc number in `mapping`.
    doc_number_to_propability = dict(enumerate(similar_entities))
    # Sort on the score dict itself (the undefined name `d` was used before).
    top_n_docs = sorted(
        doc_number_to_propability,
        key=doc_number_to_propability.get,
        reverse=True,
    )[:top_n]

    result = [
        {
            'propability': similar_entities[doc_number],
            'artist_name': mapping[doc_number]['artist_name'],
        }
        for doc_number in top_n_docs
    ]
    print(result)

In [100]:
# Query with an artist that is not part of `artist_names`.
find_similar_artists("Rihanna")

[{'artist_name': 'Obie Trice', 'propability': 0.0},
 {'artist_name': 'Metallica', 'propability': 0.0},
 {'artist_name': '50 Cent', 'propability': 0.0},
 {'artist_name': 'Eminem', 'propability': 0.0}]
