# Simple plagiarism check 
### Note:  The app is yet able to check similarity only

This is the sketch for the idea of my back-end thesis product (in progress for front-end and back-end).

The idea of the project is to:
* Open the document
* Extract Hyperlinks (references)
* Store the HyperLinks in the Local Database
* Scrape data from each HyperLink
* Compare the similarity of the self-writing document with data scraped from HyperLinks

**Future**:
* (in progress) Word2Vec, gensim
* Improve the speed with big data
* Return copied text

# Preparation

In [1]:
import numpy as np
import PyPDF2
from docx.opc.constants import RELATIONSHIP_TYPE as RT
from docx import Document
import urllib
import pymysql
from bs4 import BeautifulSoup  
import requests, io, re
import string
from nltk.tokenize import word_tokenize, TreebankWordTokenizer
from nltk.corpus import stopwords, wordnet
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.stem import WordNetLemmatizer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
def readFile(doc):
    '''
    Read a text file and return to a elements of a list
    '''
    fullText = []
    for para in doc.paragraphs:
        fullText.append(para.text)
        
    return '\n'.join(fullText)    

In [3]:
def onlRefs(rels):
    '''
    Extract all the hyperlinks (references) in the document
    '''
    links = []
    for rel in rels:
        if rels[rel].reltype == RT.HYPERLINK:
            links.append(rels[rel]._target)
            
    return links

In [4]:
def readContent(link):    
    '''
    Depend on the website (pdf or regular html)
    Open the file and scrape the data of 01 site
    Libraries used: 
    bs4 -> BeautifulSoup 
    requests, io, re
    '''
    
    string = []
   
    # if the link is a pdf
    if (r'.pdf' in link.split('/')[-1]):
        title = link.split('/')[-1]
        response = requests.get(link)
        raw_data = response.content
        pdf_content = io.BytesIO(raw_data)
        pdf_reader = PyPDF2.PdfFileReader(pdf_content)
        for page in range(pdf_reader.numPages):
            string.append(pdf_reader.getPage(page).extractText())
        return link, title, (' '.join(string))
    
    # if not
    else:
        def scrape_data():
            '''
            Return title + content of a webpage 
            '''
            page = requests.get(link)
            title = BeautifulSoup(page.text, 'html.parser').head.title.contents
            text = BeautifulSoup(page.text, 'html.parser').find_all('p')
            for p in text:
                string.append(p.get_text())
            return link, title, (' '.join(string).replace(u'\xa0', ' ').replace(u'\n', ' '))
        
        try:
            return scrape_data()
        
        #some links need authentication
        except:
            
            headers = {'User-Agent':'Mozilla/5.0'}             
            #class AppURLopener(urllib.request.FancyURLopener):
                #version = "Mozilla/5.0"
            #opener = AppURLopener()
            return scrape_data()

In [5]:
def working_with_mySQL(readContent_result):
    '''
    Store (link, title, content) scraped from 01 website to local db
    Return content
    Library used: pymysql
    '''
    #Open database
    db = pymysql.connect(host = '127.0.0.1',
                          user = 'root',
                           db = 'references')
    
    cursor = db.cursor()   
    
    #check distinct reference to add to the database
    sql = "INSERT INTO onlref (link, title, content) VALUES (%s, %s, %s)"

    try:
        cursor.execute(sql, readContent_result)
    except:
        pass

    #fetch all the links
    cursor.execute('SELECT content FROM onlref')
    data = cursor.fetchall()

    db.commit()
    db.close()
        
    return str(data[0])

In [6]:
def getToken(text):
    '''
    Tokenise + Omit punctuation
    Libraries: 
        nltk.tokenize -> word_tokenize, 
        nltk.corpus -> stopwords,
        string
    '''
    translator = str.maketrans('', '', string.punctuation)
    stopWords = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    tokens = [token.lower() for token in tokens]
    filtered = [w for w in tokens if not w in stopWords]
    
    return ' '.join(filtered).translate(translator).split()

In [7]:
#in class
def jaccard_similarity(document):
    '''
    Checking similarity using Jaccard Similarity
    '''
    lemmatize_a= []
    lmt = WordNetLemmatizer()
    tokens_a = set(getToken(readFile(document)))
    percentage = []
    rels = document.part.rels
    
    def lemmatize_append(set_tokens):
        lemmatize_set = []
        for wd in set_tokens:
            lemmatize_set.append(lmt.lemmatize(wd))
        return set(lemmatize_set)
    
    def jaccard(a, b, j):
        return float(len(j)/(len(a) + len(b) - len(j)))
    
    lem_a = lemmatize_append(tokens_a)
    
    for link in onlRefs(rels):
        lemmmatize_b = []
        lem_b = lemmatize_append(getToken(working_with_mySQL(readContent(link))))
        jac = lem_a.intersection(lem_b)
        percentage.append(jaccard(lem_a, lem_b, jac))
    
    return np.mean(percentage)

In [8]:
def cosine(document):
    '''
    Perform Cosine Similarity
    '''
    
    tokenizer = TreebankWordTokenizer()
    vect = CountVectorizer()
    vect.set_params(tokenizer=tokenizer.tokenize, stop_words='english')
    rels = document.part.rels
    document = readFile(document)
    doc = []
    doc.append(document)
    
    for link in onlRefs(rels):
        doc.append(working_with_mySQL(readContent(link)))
        
    tfidf = vect.fit_transform(doc)
    return tfidf

# Running test

In [9]:
path = r'dir\OrganisationalCourse.docx'
_file = path.replace('\\', '/')

In [10]:
#open the docx
document = Document(_file) 

### Calculate The similarity + the executing time

In [11]:
import time
import pandas as pd

In [12]:
run_time = []
result = []

start = time.time()
result.append(jaccard_similarity(document))
end = time.time()
run_time.append(end-start)

start = time.time()
result.append(1 - cosine_similarity(cosine(document))[0][1])
end = time.time()
run_time.append(end-start)

In [13]:
pd.DataFrame({"Method": ["Jaccard Sim", "Cosine Sim"],
             "Run_time": run_time,
             "Similarity": result})

Unnamed: 0,Method,Run_time,Similarity
0,Jaccard Sim,38.275535,0.044603
1,Cosine Sim,24.831723,0.112675
