<a href="https://colab.research.google.com/github/mabela1/Similarity-Classifier/blob/main/Similarity_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Import dependencies

In [None]:
import pandas as pd
import numpy as np
import math
import string
import PyPDF2
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from nltk import sent_tokenize
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk import tokenize

In [None]:
# Open the source PDF in binary mode; the handle is passed to text_extraction below.
# NOTE(review): nothing in the visible code closes this handle - confirm whether it should be.
pdf = open('Commercial_Security_System_Market.pdf','rb')

In [None]:
def text_extraction(pdf):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf: The PDF source handed straight to the PyPDF2 reader (the
            notebook passes a file object opened in binary mode).

    Returns:
        A single string with the extracted text of all pages, in page order.
    """
    # PyPDF2 3.x removed the camelCase API (PdfFileReader, numPages, getPage,
    # extractText), so prefer the current names and only fall back to the
    # legacy ones on old installs that lack PdfReader.
    if hasattr(PyPDF2, 'PdfReader'):
        reader = PyPDF2.PdfReader(pdf)
        return ''.join(page.extract_text() for page in reader.pages)

    reader = PyPDF2.PdfFileReader(pdf)
    return ''.join(
        reader.getPage(index).extractText() for index in range(reader.numPages)
    )

In [None]:
# Extract the text of all pages from the PDF handle opened above.
text = text_extraction(pdf)

Inputs

In [None]:
# Candidate category labels; the document is scored against each of them
# when this list is passed to similarity_classifier.
categories = ['work','technology','nature','health','smart','security','market','economy']

In [None]:
def clean_sentences(sentences):
    """Turn each sentence into a list of normalized content words.

    Every token is lower-cased, lemmatized and then stripped of punctuation
    (in that order). A token is discarded when, after stripping, it is not
    purely alphabetic, is an English stop word, or has fewer than four
    characters.

    Args:
        sentences: Iterable of sentence strings.

    Returns:
        A list with one (possibly empty) list of tokens per input sentence.
    """
    english_stop_words = set(stopwords.words("english"))
    wordnet = WordNetLemmatizer()
    strip_punctuation = str.maketrans('', '', string.punctuation)

    def normalize(token):
        # Same order as before: lower-case, lemmatize, then drop punctuation.
        return wordnet.lemmatize(token.lower()).translate(strip_punctuation)

    def keep(token):
        return (
            token.isalpha()
            and token not in english_stop_words
            and len(token) >= 4
        )

    cleaned = []
    for sentence in sentences:
        candidates = (normalize(token) for token in word_tokenize(sentence))
        cleaned.append([token for token in candidates if keep(token)])
    return cleaned

In [None]:
def tf_score(doc_word_clean):
    """Compute the relative frequency of every word in a token list.

    Args:
        doc_word_clean: List of (already cleaned) word tokens.

    Returns:
        A dict mapping each distinct word to its occurrence count divided by
        the total number of tokens, keyed in order of first appearance. An
        empty input yields an empty dict.
    """
    occurrences = {}
    for token in doc_word_clean:
        occurrences[token] = occurrences.get(token, 0) + 1

    token_total = int(len(doc_word_clean))
    return {token: count / token_total for token, count in occurrences.items()}

In [None]:
def check_sent(word, sentences):
    """Count the sentences that contain ``word`` as a whole token.

    The previous implementation iterated over ``word`` itself, i.e. over its
    characters, and therefore counted every sentence that merely contained
    all of the word's letters (so "tac" or "concatenate" matched "cat").

    Args:
        word: A single word, or an iterable of words that must all be
            present in a sentence for it to count.
        sentences: Iterable of sentences, each either a whitespace-separated
            string or an iterable of tokens.

    Returns:
        The number of matching sentences as an int.
    """
    # A plain string is one word, not a sequence of characters to match.
    required = [word] if isinstance(word, str) else list(word)

    matches = 0
    for sentence in sentences:
        tokens = set(sentence.split()) if isinstance(sentence, str) else set(sentence)
        if all(term in tokens for term in required):
            matches += 1
    return matches

In [None]:
def idf_calc_score(doc_word_clean,transform_sentences):
    """Compute the inverse document frequency of every word, per sentence.

    Each sentence is treated as a "document": the score of a word is
    ``log(number_of_sentences / number_of_sentences_containing_the_word)``.

    The document frequencies are gathered in one pass over the sentences and
    matched on whole tokens. The earlier version re-scanned all sentences for
    every repeated occurrence of a word and matched on the word's individual
    characters, which inflated the document frequency.

    Args:
        doc_word_clean: List of cleaned word tokens of the whole document.
        transform_sentences: List of sentences, each a string of cleaned
            tokens joined by single spaces.

    Returns:
        A dict mapping each distinct word (with any '.' removed) to its idf
        score, keyed in order of first appearance.
    """
    total_sent_len = len(transform_sentences)

    # Number of sentences in which each token appears at least once.
    sentence_frequency = {}
    for sentence in transform_sentences:
        for token in set(sentence.split()):
            sentence_frequency[token] = sentence_frequency.get(token, 0) + 1

    idf_score = {}
    for each_word in doc_word_clean:
        each_word = each_word.replace('.','')
        if each_word in idf_score:
            continue
        # A word found in no sentence is treated as occurring in one, which
        # keeps the division below well-defined.
        containing = max(sentence_frequency.get(each_word, 0), 1)
        idf_score[each_word] = math.log(total_sent_len / containing)
    return idf_score

In [None]:
# select top key words
from operator import itemgetter
def get_top_n(dict_elem, n):
    """Return the ``n`` entries of ``dict_elem`` with the largest values.

    The result is a new dict ordered from highest to lowest value; entries
    with equal values keep their original relative order.
    """
    ranked = sorted(dict_elem.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked[:n])

In [None]:
from sentence_transformers import SentenceTransformer
# Sentence-embedding model, created once at cell level and reused by
# vector_pdf_embedding for both the key words and the category labels.
model_bert = SentenceTransformer('paraphrase-mpnet-base-v2')
def vector_pdf_embedding(key_words_new,categories):
    """Embed a document (via its key words) and the candidate categories.

    The document vector is the weighted sum of the key-word embeddings, with
    each weight being that key word's share of the total score.

    Args:
        key_words_new: Dict mapping key word -> score (tf-idf in this notebook).
        categories: List of category label strings.

    Returns:
        A tuple ``(vector_pdf, embeddings_categories)``: the weighted
        document vector and the embeddings of ``categories`` as returned by
        ``model_bert.encode``.
    """
    scores = np.array(list(key_words_new.values()), dtype=float)
    score_total = np.sum(scores)
    if scores.size and score_total == 0:
        # All scores are zero (e.g. a single-sentence document, where every
        # idf is log(1) = 0). Dividing would give 0/0 = NaN weights and a NaN
        # document vector, so weight every key word equally instead.
        key_words_weights = np.full(scores.size, 1.0 / scores.size)
    else:
        key_words_weights = scores / score_total

    # generate embeddings for each key word from top n list
    embeddings_key_words = model_bert.encode(list(key_words_new.keys()))

    # generate embeddings for each category
    embeddings_categories = model_bert.encode(categories)

    # collapse the key-word embeddings into one vector for the whole pdf
    vector_pdf = np.dot(key_words_weights,embeddings_key_words)
    return vector_pdf,embeddings_categories

Create the Model

In [None]:
def similarity_classifier(text,nr_key_words,categories):
    """Rank ``categories`` by their similarity to ``text``.

    Pipeline: split the text into sentences and clean them, score every word
    with tf-idf (sentences acting as documents), keep the ``nr_key_words``
    best words, embed them into a single document vector and compare that
    vector with the embedding of each category.

    Args:
        text: Raw document text.
        nr_key_words: Number of top tf-idf words used to represent the text.
        categories: List of category label strings.

    Returns:
        A DataFrame indexed by category with a single column ``'text'``
        holding ``(cosine_similarity + 1) / 2``, sorted in descending order.
    """
    # Sentence-split the raw text, then normalize each sentence into tokens.
    sentence_tokens = clean_sentences(tokenize.sent_tokenize(text))
    all_tokens = []
    for tokens in sentence_tokens:
        all_tokens.extend(tokens)

    # Term frequency over the whole document.
    term_freq = tf_score(all_tokens)

    # The idf step expects each sentence as one space-joined string.
    joined_sentences = [' '.join(tokens) for tokens in sentence_tokens]
    inverse_doc_freq = idf_calc_score(all_tokens, joined_sentences)

    # tf-idf per word; a word without an idf entry scores 0.
    tf_idf = {
        term: freq * inverse_doc_freq.get(term, 0)
        for term, freq in term_freq.items()
    }

    # Most important words of the text.
    top_key_words = get_top_n(tf_idf, nr_key_words)

    # Document vector plus one embedding per category.
    vector_pdf, vector_category = vector_pdf_embedding(top_key_words, categories)

    # Category labels side by side with their embedding columns.
    category_labels = pd.DataFrame(categories, columns=['category'])
    categories_df = pd.concat([category_labels, pd.DataFrame(vector_category)], axis=1)

    # Cosine similarity of the document row against every category,
    # rescaled as (cos + 1) / 2.
    document_row = pd.DataFrame(vector_pdf).T
    cosine = cosine_similarity(document_row, categories_df.iloc[:, 1:])
    rescaled = (cosine + 1) / 2
    similarity_df = pd.DataFrame(rescaled, index=['text'],
                                 columns=categories_df.iloc[:, 0])

    # Order the categories from most to least similar, one per row.
    ranked = similarity_df.sort_values(axis=1, by='text', ascending=False)
    return ranked.transpose()


In [None]:
# Rank the categories for the extracted text using its 10 highest-scoring
# key words; the returned DataFrame is not assigned to a name here.
similarity_classifier(text,10,categories)