# Simple plagiarism check 
### Note:  The app is able to check similarity only

This is the sketch for the idea of my back-end thesis product (in progress for front-end and back-end).

The use of for-loop has been minimalised to the maximum (since iteration requires a lot of memory space). 

### Agenda:
* Open the document
* Extract Hyperlinks (references)
* Store the HyperLinks in the Local Database
* Scrape data from each HyperLink
* Compare the similarity between documents with:
    * Jaccard Similarity
    * Cosine Similarity (Tf-idf)
    * Semantic Similarity (word2vec) with pretrained Google model
    * Semantic Similarity (doc2vec): in progress

### Future:
* Improve the speed with big data
* Return copied text

#### Pretrained model: 
[GoogleNews-vectors-negative300.bin](https://code.google.com/archive/p/word2vec/)

# Preparation

In [1]:
import numpy as np
import PyPDF2
from docx.opc.constants import RELATIONSHIP_TYPE as RT
from docx import Document
import pymysql
from bs4 import BeautifulSoup  
import requests, io, re
import string
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.corpus import stopwords, wordnet
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import KeyedVectors, Word2Vec
from scipy import spatial



In [2]:
def readFile(doc):
    '''
    Read a text file and return a string
    '''
    fullText = []
    translator = str.maketrans('', '', '\t\n')
    for para in doc.paragraphs:
        fullText.append(para.text)
    
    return '\n'.join(fullText).translate(translator)    

In [3]:
def onlRefs(rels):
    '''
    Extract all the hyperlinks (references) in the document
    '''
    links = []
    for rel in rels:
        if rels[rel].reltype == RT.HYPERLINK:
            links.append(rels[rel]._target)
            
    return links

In [4]:
def readContent(link):    
    '''
    Depend on the website (pdf or regular html)
    Open the file and scrape the data of 01 site
    Libraries used: 
    bs4 -> BeautifulSoup 
    requests, io, re
    '''
    
    string = []
    translator = str.maketrans('', '', '\n\t')
   
    # if the link is a pdf
    if (r'.pdf' in link.split('/')[-1]):
        title = link.split('/')[-1]
        response = requests.get(link)
        raw_data = response.content
        pdf_content = io.BytesIO(raw_data)
        pdf_reader = PyPDF2.PdfFileReader(pdf_content)
        for page in range(pdf_reader.numPages):
            string.append(pdf_reader.getPage(page).extractText())
        return link, title, ' '.join(string).translate(translator)
    
    # if not
    else:
        def scrape_data():
            '''
            Return title + content of a webpage 
            '''
            page = requests.get(link)
            title = BeautifulSoup(page.text, 'html.parser').head.title.contents
            text = BeautifulSoup(page.text, 'html.parser').find_all('p')
            for p in text:
                string.append(p.get_text())
            return link, title, ' '.join(string).replace(u'\xa0', ' ').translate(translator)
        
        try:
            return scrape_data()
        
        #some links need authentication
        except:
            
            headers = {'User-Agent':'Mozilla/5.0'}             
            #class AppURLopener(urllib.request.FancyURLopener):
                #version = "Mozilla/5.0"
            #opener = AppURLopener()
            return scrape_data()

In [5]:
def push_ref(link):
    '''
    Push (link, title, content) scraped from 01 website to local db
    Library used: pymysql
    '''
    #Open database
    db = pymysql.connect(host = '127.0.0.1',
                          user = 'root',
                           db = 'references')
    
    cursor = db.cursor()   
    
    #check distinct reference to add to the database
    sql = "INSERT INTO onlref (link, title, content) VALUES (%s, %s, %s)"
    
    try:
        cursor.execute(sql, readContent(link))
    except:
        pass

    db.commit()
    db.close()

In [6]:
def get_data():
    db = pymysql.connect(host = '127.0.0.1',
                          user = 'root',
                           db = 'references')
    cursor = db.cursor()
    sql = "SELECT link, content FROM onlref"
    cursor.execute(sql)
    
    records = cursor.fetchall() #((link_1, content_1), (link_2, content_2), etc.)
    
    
    #db.commit()
    db.close()
    
    links = set([rec[0] for rec in records])
    corpus = [rec[1] for rec in records]
    
    
    return links, corpus

In [7]:
def getToken(text):
    '''
    Tokenise + Omit punctuation
    Libraries: 
        nltk.tokenize -> word_tokenize, 
        nltk.corpus -> stopwords,
        string
    '''
    translator = str.maketrans('', '', string.punctuation)
    stopWords = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    tokens = [token.lower() for token in tokens]
    filtered = [w for w in tokens if not w in stopWords]
    
    return ' '.join(filtered).translate(translator).split()

In [8]:
def lemmatize_append(set_tokens):
    '''
    Return a set of all lemmatized words
    '''
    lmt = WordNetLemmatizer()
    lemmatize_set = []
    for wd in set_tokens:
        lemmatize_set.append(lmt.lemmatize(wd))
    return set(lemmatize_set)

In [9]:
class Methods(object):
    def __init__(self, document):
        self.document = document
        
    def jaccard_similarity(self):
        '''
        Perform Jaccard Similarity
        Disadvantage: lemmatize document everytime
        '''
        
        doc = Document(self.document)
        rels = doc.part.rels
        tokens_a = set(getToken(readFile(doc)))
        features = []
        features.append(lemmatize_append(tokens_a))
        
        percentage = []
        
        db_links, db_contents = get_data()
        
        def jaccard(a, b, j):
            return float(len(j)/(len(a) + len(b) - len(j)))

        
        #push new refs
        for link in onlRefs(rels):
            if not link in db_links:
                push_ref(link)
        
        #first run when db is empty
        if db_contents == []:
            db_links, db_contents = get_data()
        
        #vectorize db_contents
        for corpora in db_contents: 
            features.append(lemmatize_append(getToken(corpora)))
        
        #unite sets of tokens used for comparision
        while len(features) > 2:
            features[1] = features[1].union(features[-1])
            features.pop()
            
        jac = features[0].intersection(features[-1])

        return jaccard(features[0], features[-1], jac)*100
        #return features
    
    def cosine_sim(self):
        '''
        Perform Cosine Similarity
        '''

        tokenizer = TreebankWordTokenizer()
        vect = CountVectorizer()
        vect.set_params(tokenizer=tokenizer.tokenize, stop_words='english')
        doc = Document(self.document)
        rels = doc.part.rels
        db_links, db_contents = get_data()
        
        corpus = []
        corpus.append(readFile(doc))

        for link in onlRefs(rels):
            if not link in db_links:
                push_ref(link)
            
        for corpora in db_contents:
            corpus.append(corpora)

        tfidf = vect.fit_transform(corpus)
        return (1 - cosine_similarity(tfidf)[0][1])*100
    
    def word_to_vec(self):
        '''
        Comparing the semantic similarity between documents.
        Converting tokens to numeric vector using Google pretrained document.
        Perform cosine similarity based on that.
        '''
        
        #take only first 100k most frequent tokens
        model = KeyedVectors.load_word2vec_format('D:/TUAS/GoogleNews-vectors-negative300.bin', binary=True, limit = 100000)
        
        doc = Document(self.document)
        rels = doc.part.rels
        result = []
        db_links, db_contents = get_data()
        
        for link in onlRefs(rels):
            if not link in db_links:
                push_ref(link)
        
        def convert(corpora):
            return np.mean([model[wd] for wd in getToken(corpora) if wd in model], axis=0)
            
        base_corpora = convert(readFile(doc))
        
        for corpora in db_contents:
            cal = 1 - spatial.distance.cosine(base_corpora, convert(corpora))
            result.append(cal)
            
        return np.mean(result)*100

# Running test

In [10]:
document = 'OrganisationalCourse.docx'

In [11]:
#Instantiate an object
method = Methods(document)

### Calculate The similarity + executing time

In [12]:
import time
import pandas as pd

### First run when db is empty

In [13]:
#First run when db is empty
run_time = []
result = []
tactics = ['jaccard_similarity', 'cosine_sim', 'word_to_vec (semantic)']

def counting(method):
    result.append(method)
    end = time.time()
    run_time.append(end-start)

start = time.time()
counting(method.jaccard_similarity())

start = time.time()
counting(method.cosine_sim())

start = time.time()
counting(method.word_to_vec())

pd.DataFrame({"Method": tactics,
             "Run_time": run_time,
             "Similarity (%)": result})

  result = self._query(query)
  result = self._query(query)


Unnamed: 0,Method,Run_time,Similarity (%)
0,jaccard_similarity,26.865635,4.881603
1,cosine_sim,1.174329,11.223046
2,word_to_vec (semantic),4.505424,92.569805


### Second run when db is filled

In [14]:
#Second run when db is filled
run_time = []
result = []
tactics = ['jaccard_similarity', 'cosine_sim', 'word_to_vec (semantic)']

start = time.time()
counting(method.jaccard_similarity())

start = time.time()
counting(method.cosine_sim())

start = time.time()
counting(method.word_to_vec())

pd.DataFrame({"Method": tactics,
             "Run_time": run_time,
             "Similarity (%)": result})

Unnamed: 0,Method,Run_time,Similarity (%)
0,jaccard_similarity,2.636494,4.881603
1,cosine_sim,1.117357,11.223046
2,word_to_vec (semantic),4.458456,92.569805


In [15]:
j_time = []
cs_time = []
w2v_time = []
for i in range(10):
    start = time.time()
    method.jaccard_similarity()
    end = time.time()
    j_time.append(end-start)

    start = time.time()
    method.cosine_sim()
    end = time.time()
    cs_time.append(end-start)

    start = time.time()
    method.word_to_vec()
    end = time.time()
    w2v_time.append(end-start)

In [17]:
print("When db is added, average executing time of\
      \nJaccard Similarity: %.4f sec\
      \nCosine Similarity: %.4f sec\
      \nWord_to_vec (sematic): %.4f sec" % (np.mean(j_time), np.mean(cs_time), np.mean(w2v_time)))

When db is added, average executing time of      
Jaccard Similarity: 2.6192 sec      
Cosine Similarity: 1.1446 sec      
Word_to_vec (sematic): 4.1252 sec
