In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import NearestNeighbors
import joblib

# read the data

In [2]:
# Read the book list (first row is the header) and discard every row that has a
# missing value in any column.
data = pd.read_csv('book_list.csv', encoding="ISO-8859-1", header=0).dropna()
print(data.shape, data.head(), sep='\n')

(25353, 2)
                              title            author
0                       The Martian         Andy Weir
1  Career of Evil (Cormoran Strike)  Robert Galbraith
2              The Crossing (Bosch)  Michael Connelly
3           The Guilty (Will Robie)    David Baldacci
4         Ready Player One: A Novel      Ernest Cline


# load the stopwords

In [3]:
sw_file = 'stopwords.txt'
# Each line of the file becomes one entry; surrounding whitespace (including the
# trailing newline) is stripped before it is added to the set.
with open(sw_file, 'r') as fr:
    stopwords = {entry.strip() for entry in fr}
print(len(stopwords), stopwords, sep='\n')

319
{'name', 'nine', 'she', 'each', 'and', 'whose', 'seems', 'who', 'hereupon', 'via', 'have', 'therefore', 'too', 'please', 'an', 'his', 'whence', 'serious', 'were', 'also', 'four', 'eight', 'between', 'whereupon', 'same', 'so', 'into', 'alone', 'though', 'else', 'nowhere', 'one', 'already', 'thereafter', 'yourself', 'without', 'side', 'any', 'anything', 'on', 'which', 'couldnt', 'etc', 'yourselves', 'such', 'although', 'become', 'your', 'nobody', 'sometimes', 'at', 'indeed', 'across', 'us', 'fill', 'get', 'they', 'being', 'whereafter', 'may', 'back', 'amoungst', 'these', 'off', 'mill', 'many', 'this', 'two', 'was', 'its', 'around', 'move', 'there', 'of', 'either', 'we', 'below', 'enough', 'however', 'anyone', 'rather', 'nor', 'sixty', 'describe', 'besides', 'hereafter', 'here', 'hereby', 'ourselves', 'namely', 'further', 'ltd', 'least', 'found', 'ie', 'meanwhile', 'beside', 'most', 'than', 'whither', 'eleven', 'ten', 'anywhere', 'them', 'my', 'now', 'very', 'can', 'our', 'with', 'cal

# data cleaning

### clean the titles

In [4]:
# does the word consist of English (ASCII) letters only?
def is_valid(word):
    """Return True when every character of ``word`` is an ASCII letter.

    Only ``A``-``Z`` and ``a``-``z`` are accepted; digits, punctuation, whitespace
    and accented or non-Latin characters make the word invalid. An empty string
    has no offending character and is therefore reported as valid.
    """
    return all('A' <= ch <= 'Z' or 'a' <= ch <= 'z' for ch in word)

In [5]:
# function to clean the title column
# Character class of the punctuation stripped from titles. It is a raw string so
# that the regex escapes are not (invalid) Python string escapes, and so that the
# doubled backslash really places a backslash in the class next to the slash;
# the previous non-raw literal collapsed that pair into an escaped slash and
# left backslashes in the text.
punc = r'''[~`!#$%^&*'()_+\-=|\[\]\\/:;.,?><@"{}]'''
# Compiled once instead of on every call.
_PUNC_RE = re.compile(punc)


def clean_title(line):
    """Normalise a book title into space-separated lower-case keywords.

    Punctuation is deleted (not replaced by a space, so "Pop-up" becomes
    "popup"), the text is split on whitespace and lower-cased, and words that
    are stopwords or consist only of digits are dropped.

    Parameters
    ----------
    line : str
        The raw title.

    Returns
    -------
    str
        The cleaned title, or the sentinel ``'this is invalid'`` as soon as a
        remaining word fails ``is_valid`` (i.e. is not made of English letters
        only). Rows carrying the sentinel are removed later in the notebook.
    """
    result = []
    for word in _PUNC_RE.sub('', line).strip().split():
        word = word.lower()
        # exclude the stopwords or pure numbers
        if word in stopwords or word.isdigit():
            continue
        # if not a valid english text, return a special text for future removal
        if not is_valid(word):
            return 'this is invalid'
        result.append(word)
    return ' '.join(result)

In [6]:
# Cleaned title for every row; invalid titles carry the 'this is invalid' sentinel.
col_title = data['title'].apply(clean_title)

### clean the authors

Convert the author names to lower case and flag names that are not plain English text for removal.

In [7]:
# function to clean the author column
def clean_author(line):
    """Lower-case an author name and collapse its whitespace.

    The name is split on whitespace and every part is lower-cased. If any part
    fails ``is_valid`` the sentinel ``'this is invalid'`` is returned so the row
    can be dropped later; otherwise the parts are joined with single spaces.

    NOTE(review): unlike ``clean_title``, no punctuation is stripped here, so a
    name containing '.', '-' or an apostrophe (e.g. initials) is flagged as
    invalid -- confirm this is intended.
    """
    tokens = [token.lower() for token in line.strip().split()]
    if not all(is_valid(token) for token in tokens):
        return 'this is invalid'
    return ' '.join(tokens)

In [8]:
# Cleaned author for every row; invalid names carry the 'this is invalid' sentinel.
col_author = data['author'].apply(clean_author)

### drop the lines with invalid text and duplicate lines

In [9]:
# Cleaned and original columns side by side, so the original text can still be
# exported after filtering.
clean_df = pd.DataFrame({
    'clean_title': col_title,
    'clean_author': col_author,
    'title': data['title'],
    'author': data['author'],
})
# Keep only the rows where neither cleaner returned its sentinel, then keep the
# first occurrence of every (clean_title, clean_author) pair.
is_usable = (clean_df['clean_title'] != 'this is invalid') & (clean_df['clean_author'] != 'this is invalid')
clean_df = clean_df[is_usable].drop_duplicates(subset=['clean_title', 'clean_author'])

In [10]:
# Size of the cleaned table followed by its first rows.
print(clean_df.shape, clean_df.head(), sep='\n')

(18416, 4)
                   clean_title      clean_author  \
0                      martian         andy weir   
1  career evil cormoran strike  robert galbraith   
2               crossing bosch  michael connelly   
3                 guilty robie    david baldacci   
4           ready player novel      ernest cline   

                              title            author  
0                       The Martian         Andy Weir  
1  Career of Evil (Cormoran Strike)  Robert Galbraith  
2              The Crossing (Bosch)  Michael Connelly  
3           The Guilty (Will Robie)    David Baldacci  
4         Ready Player One: A Novel      Ernest Cline  


In [11]:
# Spot check: the rows whose cleaned author is 'lewis carroll'.
clean_df.loc[clean_df['clean_author'].eq('lewis carroll')]

Unnamed: 0,clean_title,clean_author,title,author
4788,alice wonderland edition norton critical editions,lewis carroll,Alice in Wonderland (Third Edition) (Norton C...,Lewis Carroll
22454,alices adventures wonderland popup adaptation,lewis carroll,Alice's Adventures in Wonderland: A Pop-up Ada...,Lewis Carroll


### concatenate the title and author together

In [12]:
# One text per book: cleaned title and cleaned author separated by a space,
# stored in a single column named 'all'.
one_df = (clean_df['clean_title'] + ' ' + clean_df['clean_author']).to_frame('all')

In [13]:
# First five combined texts.
one_df.iloc[:5]

Unnamed: 0,all
0,martian andy weir
1,career evil cormoran strike robert galbraith
2,crossing bosch michael connelly
3,guilty robie david baldacci
4,ready player novel ernest cline


# vectorize the books

In [14]:
vectorizer = TfidfVectorizer(
    min_df=2,               # ignore terms that occur in fewer than two texts
    analyzer='word',
    token_pattern=r'\w+',   # any run of word characters, single letters included
    use_idf=True,
    smooth_idf=True,
    dtype='float32',
)
# Learn the vocabulary and build the TF-IDF matrix in one pass.
vectors = vectorizer.fit_transform(one_df['all'])
vectors.shape



(18416, 12357)

In [15]:
# Persist the fitted vectorizer.
joblib.dump(value=vectorizer, filename='../model/vectorizer.pkl')
print("TFIDF vectorizer saved")

TFIDF vectorizer saved


# topic modelling

In [29]:
# Fixed seed: online variational Bayes starts from a random initialisation, and
# the KNN / GMM models persisted further down are fitted on this topic space, so
# it must be reproducible across kernel restarts. 2020 matches the seed already
# used for the Gaussian mixture.
# NOTE(review): LDA is normally fitted on raw term counts; here it receives
# TF-IDF weights -- confirm this is intended.
lda = LatentDirichletAllocation(n_components=30, learning_method='online', random_state=2020)
# One row per book, one column per topic.
vectors_topic = lda.fit_transform(vectors)
vectors_topic.shape

(18416, 30)

In [30]:
# Persist the fitted topic model.
joblib.dump(value=lda, filename='../model/lda.pkl')
print("LDA model saved")

LDA model saved


# nearest neighbour model

In [31]:
# Index the topic vectors with the library defaults; `fit` returns the estimator
# itself, so `knn` is the fitted model.
knn = NearestNeighbors().fit(vectors_topic)
knn

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

In [32]:
# Persist the fitted neighbour index.
joblib.dump(value=knn, filename='../model/knn.pkl')
print("KNN model saved")

KNN model saved


# books clustering

In [39]:
# Eight-component mixture over the topic vectors; three restarts with a fixed seed.
gmm = GaussianMixture(
    n_components=8,
    n_init=3,
    max_iter=100,
    random_state=2020,
).fit(vectors_topic)
joblib.dump(value=gmm, filename='../model/gmm.pkl')
print("Gaussian Mixture Model saved")

Gaussian Mixture Model saved


In [40]:
# Membership probabilities: one row per book, one column per mixture component.
dist = gmm.predict_proba(X=vectors_topic)

# get the max groups

In [41]:
# how to decide groups according to probability distribution?
def into_groups(distribution, dominant_p=0.9, unassigned_p=0.4, member_p=0.1):
    """Turn one row of group membership probabilities into a label string.

    Parameters
    ----------
    distribution : array-like of float, 1-D
        Membership probability for each group.
    dominant_p : float, default 0.9
        If the largest probability exceeds this, only that group is returned.
    unassigned_p : float, default 0.4
        If the largest probability is below this, the item belongs to no group
        and the extra index ``len(distribution)`` is returned instead.
    member_p : float, default 0.1
        In the remaining case, every group whose probability exceeds this is
        returned.

    Returns
    -------
    str
        Comma-separated group indices, most probable first. With the default
        thresholds the result is identical to the previous hard-coded rule.
    """
    # Accept lists/tuples as well as arrays (negation below needs an ndarray).
    distribution = np.asarray(distribution)
    max_p = np.max(distribution)
    if max_p > dominant_p:
        # one group clearly dominates
        groups = [int(np.argmax(distribution))]
    elif max_p < unassigned_p:
        # no group is convincing: use the extra "no group" index
        groups = [len(distribution)]
    else:
        # no dominant group: keep every sufficiently likely one, best first
        ranked = np.argsort(-distribution)
        # Never return an empty label, even with an unusually high member_p.
        n_groups = max(1, int(np.count_nonzero(distribution > member_p)))
        groups = ranked[:n_groups].tolist()
    return ','.join(map(str, groups))

In [42]:
# Label string for every book, plus its first (most probable) entry on its own.
groups = [into_groups(row) for row in dist]
maxgroup = [label.split(',')[0] for label in groups]

# save the original titles and authors with their group assignments for the application

In [43]:
# Original (uncleaned) title and author of every kept book, with its group labels.
final_df = clean_df[['title', 'author']].assign(groups=groups, maxgroup=maxgroup)

In [44]:
# Write the archive without the index column.
final_df.to_csv(path_or_buf='archive.csv', encoding='utf-8', index=False)