In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.mixture import GaussianMixture
import joblib

# read the data

In [2]:
data = pd.read_csv('book_list.csv',encoding = "ISO-8859-1", header=0)
data.dropna(inplace=True)
print (data.shape)
print (data.head())

(25353, 2)
                              title            author
0                       The Martian         Andy Weir
1  Career of Evil (Cormoran Strike)  Robert Galbraith
2              The Crossing (Bosch)  Michael Connelly
3           The Guilty (Will Robie)    David Baldacci
4         Ready Player One: A Novel      Ernest Cline


# load the stopwords

In [3]:
sw_file = 'stopwords.txt'
with open(sw_file, 'r') as fr:
    stopwords = set(map(lambda x: x.strip(), fr.readlines()))
print (len(stopwords))
print (stopwords)

319
{'whereafter', 'be', 'afterwards', 'do', 'four', 'within', 'hundred', 'became', 'beside', 'yourself', 'anything', 'whole', 'empty', 'through', 'how', 'two', 'same', 'third', 'formerly', 'yours', 'should', 'we', 'anyhow', 'whenever', 'ltd', 'why', 'he', 'before', 'amount', 'been', 'or', 'enough', 'into', 'whereas', 'behind', 'had', 'latterly', 'full', 'get', 'only', 'five', 'this', 'still', 'was', 'others', 'else', 'whom', 'thereby', 'being', 'fill', 'eg', 'interest', 'myse"', 'whoever', 'sometime', 'about', 'is', 'mostly', 'hence', 'hereby', 'fifteen', 'latter', 'under', 'up', 'anyway', 'along', 'many', 'them', 'thus', 'another', 'go', 'whither', 'since', 'where', 'becomes', 'hereupon', 'whether', 'thence', 'herse"', 'by', 'hereafter', 'himse"', 'but', 'the', 'against', 'it', 'towards', 'thereafter', 'she', 'ever', 'whatever', 'twenty', 'than', 'further', 'whereupon', 'must', 'to', 'upon', 'sincere', 'yourselves', 'am', 'seem', 'seeming', 'meanwhile', 'find', 'cannot', 'thereupon',

# data cleaning

### clean the titles

In [4]:
# is english character?
def is_valid(word):
    for uchar in word:
        if (uchar >= u'\u0041' and uchar<=u'\u005a') or (uchar >= u'\u0061' and uchar<=u'\u007a'):
            continue
        else:
            return False
    return True

In [5]:
# funtion to clean the title column
punc = '[~`\!\#\$\%^\&\*\'\(\)_\+\-\=\|\[\]\\/\:;\.,\?\>\<\@\"\{\}]'
def clean_title(line):
    result = []
    # remove the punctuation
    line = re.sub(re.compile(punc), '', line)
    words = line.strip().split()
    for word in words:
        word = word.lower()
        # exclude the stopwords or pure numbers
        if word in stopwords or word.isdigit():
            continue
        # if not a valid english text, return a special text for future removal
        if not is_valid(word):
            return 'this is invalid'
        result.append(word)
    return ' '.join(result)

In [6]:
col_title = data['title'].map(clean_title)

### clean the authors

turn to lower case

In [7]:
# function to clean the author column
def clean_author(line):
    result = []
    words = line.strip().split()
    for word in words:
        word = word.lower()
        # if not a valid english text, return a special text for future removal
        if not is_valid(word):
            return 'this is invalid'
        result.append(word)
    return ' '.join(result)

In [8]:
col_author = data['author'].map(clean_author)

### drop the lines with invalid text and duplicate lines

In [9]:
clean_df = pd.DataFrame({'title': col_title, 'author': col_author})
clean_df = clean_df[clean_df['title'] != 'this is invalid']
clean_df = clean_df[clean_df['author'] != 'this is invalid']
clean_df = clean_df.drop_duplicates()

In [10]:
print (clean_df.shape)
print (clean_df.head())

(18416, 2)
                         title            author
0                      martian         andy weir
1  career evil cormoran strike  robert galbraith
2               crossing bosch  michael connelly
3                 guilty robie    david baldacci
4           ready player novel      ernest cline


In [11]:
clean_df[clean_df['author'] == 'lewis carroll']

Unnamed: 0,title,author
4788,alice wonderland edition norton critical editions,lewis carroll
22454,alices adventures wonderland popup adaptation,lewis carroll


### concatenate the title and author together

In [12]:
one_df = pd.DataFrame(clean_df['title'] + ' ' + clean_df['author'], columns=['all'])

In [13]:
one_df.head()

Unnamed: 0,all
0,martian andy weir
1,career evil cormoran strike robert galbraith
2,crossing bosch michael connelly
3,guilty robie david baldacci
4,ready player novel ernest cline


# save the clean data

In [14]:
one_df.to_csv('clean.csv', index=False, encoding='utf-8')

# vectorize the books

In [15]:
vectorizer = TfidfVectorizer(min_df=3, max_features=5000, analyzer='word', token_pattern=r'\w+', use_idf=True, smooth_idf=True, dtype='float32')
vectors = vectorizer.fit_transform(one_df['all'])
vectors.shape



(18416, 5000)

In [16]:
joblib.dump(vectorizer, '../model/vectorizer.pkl')
print ("TFIDF vectorizer saved")

TFIDF vectorizer saved


# topic modelling

In [17]:
lda = LatentDirichletAllocation(n_components=10, learning_method='online')
vectors_topic = lda.fit_transform(vectors)
vectors_topic.shape

(18416, 10)

In [18]:
joblib.dump(lda, '../model/lda.pkl')
print ("LDA model saved")

LDA model saved


# books clustering

In [19]:
gmm = GaussianMixture(n_components=8, max_iter=100, random_state=2020)
gmm.fit(vectors_topic)
joblib.dump(gmm, '../model/gmm.pkl')
print ("Gaussian Mixture Model saved")

Gaussian Mixture Model saved


In [20]:
dist = gmm.predict_proba(vectors_topic)

In [21]:
dist[45,:]

array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 9.96475978e-01,
       6.79285763e-08, 0.00000000e+00, 3.52395430e-03, 0.00000000e+00])