# Topic Modelling

From someone who does not understand Machine Learning.

In [None]:
import gensim
from gensim.corpora import Dictionary
from gensim.models import TfidfModel, LsiModel
from gensim.similarities import MatrixSimilarity

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

In [None]:
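# One-time setup: uncomment and run once to download the NLTK data
# (the stop-word list and the "punkt" tokenizer models).
# nltk.download('stopwords')
# nltk.download('punkt')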

### Utils

#### Serialize

In [None]:
import pickle


def pickle_save(obj, path):
    """Serialize *obj* to the file at *path* using the highest pickle protocol.

    An existing file at *path* is overwritten (the file is opened in ``'wb'`` mode).
    """
    with open(path, 'wb') as handle:
        pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)
            

def pickle_load(path):
    """Return the object stored in the pickle file at *path*.

    Unpickling can execute arbitrary code, so only load files that this
    notebook (or another trusted source) has written.
    """
    with open(path, 'rb') as source:
        loaded = pickle.load(source)
    return loaded

#### Text Processing

In [None]:
import re


def _remove_html_tags(text):
    cleaner = re.compile('<.*?>')
    return re.sub(cleaner, '', text)


def _remove_control_chars(text):
    text = text.replace('\n','')
    text = text.replace('\t','')
    return text


def _remove_stop_words(text):
    """Tokenize *text* and return its tokens without English stop words.

    Tokens are maximal runs of word characters (letters, digits, underscore),
    so punctuation is discarded.  Returned tokens keep their original casing.
    """
    # A frozenset gives O(1) membership tests; the original list was scanned
    # in full for every token.
    stop_words = frozenset(stopwords.words('english'))

    words = RegexpTokenizer(r'\w+').tokenize(text)

    # The NLTK stop-word list is lower-case, so compare case-insensitively;
    # otherwise capitalised stop words such as "The" or "He" slip through.
    return [word for word in words if word.lower() not in stop_words]


def clean_text(text):
    """Turn a raw biography string into a list of tokens.

    The text is passed through the private helpers in a fixed order: HTML tag
    removal, control-character handling, then tokenization with stop-word
    filtering.  Note that the return value is the token list produced by
    ``_remove_stop_words``, not a string.
    """
    without_tags = _remove_html_tags(text)
    without_control_chars = _remove_control_chars(without_tags)
    return _remove_stop_words(without_control_chars)

#### Gather Data

In [None]:
# tbd.

# Buckle Up!

In [None]:
# True: build the model from the bios and pickle it to MODEL_PATH.
# False: load the previously pickled model from MODEL_PATH instead.
FROM_SCRATCH = True
# NOTE(review): not referenced in the visible cells - presumably the CSV read by the "Load CSV" step; confirm.
ARTIST_NAMES_PATH = 'data/artists-bio.csv'
# Pickle file the fitted MyModel is saved to / loaded from.
MODEL_PATH = 'model.pickle'
# NOTE(review): not referenced in the visible cells - presumably where `mapping` is meant to be pickled; confirm.
MAPPING_PATH = 'mapping.pickle'

In [None]:
!tree -I venv -I data/raw/bios # With ! you can run bash commands in Jupyter

## Data Cleansing and Preparation

We create so-called documents that consist of the artist name and their biography in tokenized format.

The mapping looks something like this:

```
{
    1: {'artist_name': '50 Cent', 'bio': ['a', 'text', 'about', '50', 'Cent']},
    2: {'artist_name': 'Eminem', 'bio': ['a', 'text', 'about', 'Eminem']},
    3: {'artist_name': 'Metallica', 'bio': ['a', 'text', 'about', 'Metallica']},
    ...
}
```

### Input Data

In [None]:
# Load CSV
# Clean bios

In [None]:
# Inspect the doc-number -> {'artist_name', 'bio'} mapping.
# NOTE(review): `mapping` is not defined in the visible cells - presumably built by the "Load CSV / Clean bios" step; confirm.
print(mapping)

## Create model

#### <Machine Learning Magic World 🧙‍♂️>

The following code is not there to be understood. Thanks. `¯\_(ツ)_/¯`

In [None]:
from dataclasses import dataclass


@dataclass
class MyModel:
    """Bundle of the fitted gensim objects needed to answer similarity queries.

    Keeping them in one object lets the whole pipeline be pickled and
    restored with a single call.
    """
    # Built from the tokenized bios; turns a token list into a bag-of-words vector via ``doc2bow``.
    dictionary: Dictionary
    # TF-IDF model fitted on the bag-of-words corpus.
    tfidf_model: TfidfModel
    # LSI model fitted on the TF-IDF corpus.
    lsi_model: LsiModel
    # Similarity index over the LSI representation of every document.
    index: MatrixSimilarity
    

# Build the model from the bios when FROM_SCRATCH is set; otherwise restore the pickled one.
if FROM_SCRATCH:
    # One tokenized bio per document, in the iteration order of `mapping`.
    # NOTE(review): later cells look up mapping[doc_number] using the position
    # returned by the index, which presumably requires the mapping keys to be
    # 0..n-1 in this same order - confirm.
    dataset = [mapping[i]['bio'] for i in mapping]
    
    # Dictionary over all tokens that occur in the bios.
    dct = Dictionary(dataset)

    # Bag-of-words vector per bio, then TF-IDF fitted on and applied to that corpus.
    corpus = [dct.doc2bow(line) for line in dataset]
    tfidf_model = TfidfModel(corpus)
    tfidf_corpus = tfidf_model[corpus]

    # LSI model with 50 topics fitted on the TF-IDF corpus.
    lsi_model = LsiModel(tfidf_corpus, id2word=dct, num_topics=50, power_iters=4)

    # Index the LSI-projected corpus so a query can be compared against every document.
    index = MatrixSimilarity(lsi_model[tfidf_corpus])
    my_model = MyModel(dictionary=dct,
                       tfidf_model=tfidf_model,
                       lsi_model=lsi_model,
                       index=index)
    # Persist everything so later runs can skip training (FROM_SCRATCH = False).
    pickle_save(my_model, MODEL_PATH)
else:
    # Reuse the previously pickled model instead of retraining.
    my_model = pickle_load(MODEL_PATH)




#### </Machine Learning Magic World 🧙‍♂️>

## Query the Model

Prepare the query data the same way as the documents used to create the model, i.e. as a tokenized biography.

In [None]:
# NOTE(review): get_artist_bio is not defined in the visible cells; since its result is fed
# to doc2bow below, it presumably returns the artist's bio as a list of tokens - confirm.
query_bio = get_artist_bio('Obie Trice')

#### <Machine Learning Magic World 🧙‍♂️>

The following code is not there to be understood. Thanks. `¯\_(ツ)_/¯`

In [None]:
# Convert the query tokens to a bag-of-words vector using the model's dictionary.
vec_bow = my_model.dictionary.doc2bow(query_bio)
# Apply the TF-IDF model and then the LSI model to the query vector.
vec_lsi = my_model.lsi_model[my_model.tfidf_model[vec_bow]]
# Query the similarity index; the result is indexed by doc number in the cells below.
similar_entities = my_model.index[vec_lsi]

#### </Machine Learning Magic World 🧙‍♂️>

## The Result

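We got a similarity score for every doc number (see `mapping`), telling us how similar that document is to the queried artist. The scores are cosine similarities: they range from -1.0 to 1.0, and higher means more similar.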

In [None]:
# Pair each similarity score with its position in the result (the doc number).
doc_to_similarity = dict(enumerate(similar_entities))

print(doc_to_similarity)

Now let's find the top 10 similar artists and bands.

In [None]:
# Doc numbers ordered from most to least similar, cut off after the first ten.
top_n_docs = sorted(
    doc_to_similarity,
    key=lambda doc: doc_to_similarity[doc],
    reverse=True,
)[:10]

# One entry per selected document: its score and the artist it belongs to.
result = [
    {
        'similarity': similar_entities[doc_number],
        'artist_name': mapping[doc_number]['artist_name'],
    }
    for doc_number in top_n_docs
]

print(result)

## Demo

### 🤖 `I am a semi professional ML implementation AMA`

In [None]:
from pprint import pprint


def find_similar_artists(artist_name, top_n=10):
    """Pretty-print the artists whose bios are most similar to *artist_name*'s.

    Args:
        artist_name: Name passed to ``get_artist_bio`` to obtain the query bio.
        top_n: Maximum number of matches to print (defaults to 10).

    The matches are printed as a list of ``{'similarity', 'artist_name'}``
    dicts ordered from most to least similar.  Nothing is returned.
    """
    # <Machine Learning Magic World 🧙‍♂️>
    query_bio = get_artist_bio(artist_name)
    vec_bow = my_model.dictionary.doc2bow(query_bio)
    vec_lsi = my_model.lsi_model[my_model.tfidf_model[vec_bow]]
    similar_entities = my_model.index[vec_lsi]
    # </Machine Learning Magic World 🧙‍♂️>

    # Doc numbers ranked by similarity, best match first.
    doc_to_similarity = dict(enumerate(similar_entities))
    top_n_docs = sorted(doc_to_similarity, key=doc_to_similarity.get, reverse=True)[:top_n]

    # Built in ranking order, so the list is already sorted by descending
    # similarity.  (The original called sorted(result, ...) here and discarded
    # the return value, which had no effect.)
    result = [
        {
            'similarity': similar_entities[doc_number],
            'artist_name': mapping[doc_number]['artist_name'],
        }
        for doc_number in top_n_docs
    ]
    pprint(result)

In [None]:
# Demo: print the artists most similar to Obie Trice.
find_similar_artists("Obie Trice")

## Open Questions

* How to update model without reading all data again?
* 