# Simple plagiarism check 
### Note:  The app is yet able to check similarity only

This is the sketch for the idea of my back-end thesis product (in progress for front-end and back-end).

The idea of the project is to:
* Open the document
* Extract Hyperlinks (references)
* Store the HyperLinks in the Local Database
* Scrape data from each HyperLink
* Compare the similarity of the self-writing document with data scraped from HyperLinks
* Take the mean -> Percentage of plagiarism

**Future**:
* Improve the speed with big data
* Return copied text

# Preparation

In [1]:
import numpy as np
import PyPDF2
from docx.opc.constants import RELATIONSHIP_TYPE as RT
from docx import Document
import urllib
import pymysql
from bs4 import BeautifulSoup  
import requests, io, re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
def readFile(doc):
    '''
    Read a text file and return to a elements of a list
    '''
    fullText = []
    for para in doc.paragraphs:
        fullText.append(para.text)
        
    return '\n'.join(fullText)    

In [3]:
def onlRefs(rels):
    '''
    Extract all the hyperlinks (references) in the document
    '''
    links = []
    for rel in rels:
        if rels[rel].reltype == RT.HYPERLINK:
            links.append(rels[rel]._target)
            
    return links

In [4]:
def working_with_mySQL(onl_ref_results):
    '''
    Store all the links to local database
    Once stored, return all the links for further work
    library: pymysql
    
    my local databse is of only one table: onlref -> column: link (primary)
    '''
    #Open database
    db = pymysql.connect(host = '127.0.0.1',
                          user = 'root',
                           db = 'references')
    
    cursor = db.cursor()   
    
    #check distinct reference to add the database
    sql = "INSERT INTO onlref (link) VALUES (%s)"
    for link in onl_ref_results:
        try:
            cursor.execute(sql, link)
        except:
            pass
    
    #fetch all the links
    cursor.execute('SELECT link FROM onlref')
    data = cursor.fetchall()

    db.commit()
    db.close()
    
    #append each link into a list
    links = []    
    for link in data:
        links.append(link[0])
    
    return links

In [5]:
def readContent(link):    
    '''
    Depend on the content of the website (pdf or regular html)
    Open the file and scrape the data
    libraries: bs4 -> BeautifulSoup with requests, io, re
    '''
    
    string = []
   
    # if the link is a pdf
    if (re.search(link.split('/')[-1], r'.pdf') is True):
        response = requests.get(link)
        raw_data = response.content
        pdf_content = io.BytesIO(raw_data)
        pdf_reader = PyPDF2.PdfFileReader(pdf_content)
        for page in range(pdf_reader.numPages + 1):
            string.append(pdf_reader.getPage(page).extractText())
        return(' '.join(string))
    
    # if not
    else:
        try:
            page = requests.get(link)
            text = BeautifulSoup(page.text, 'html.parser').find_all('p')
            for p in text:
                string.append(p.get_text())
            return(' '.join(string).replace(u'\xa0', ' ').replace(u'\n', ' '))
        
        #some links need authentication
        except:
            
            headers = {'User-Agent':'Mozilla/5.0'}             
            #class AppURLopener(urllib.request.FancyURLopener):
                #version = "Mozilla/5.0"
            #opener = AppURLopener()

            #pullData
            page = requests.get(link)
            text = BeautifulSoup(page.text, 'html.parser').find_all('p')
            for p in text:
                string.append(p.get_text())
            return(' '.join(string).replace(u'\xa0', ' ').replace(u'\n', ' '))

In [6]:
def getToken(text):
    '''
    Tokenise + Omit punctuation
    libraries: 
        nltk.tokenize -> word_tokenize, 
        nltk.corpus -> stopwords,
        string
    '''
    translator = str.maketrans('', '', string.punctuation)
    stopWords = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    tokens = [token.lower() for token in tokens]
    filtered = [w for w in tokens if not w in stopWords]
    
    return ' '.join(filtered).translate(translator)

In [7]:
def get_content(links):
    '''
    Return all the contents into a list to check similarity
    '''
    base_refs = []
    for link in links:
        base_refs.append(readContent(link))
        
    return base_refs

In [8]:
def check_similarity(document):
    '''
    Checking similarity using cosine difference
    Library: sklearn.feature_extraction.text -> TfidVectorizer
    '''
    vect = TfidfVectorizer(min_df=1)
    similarity = []
    links = working_with_mySQL(onlRefs(rels))
    
    for content in get_content(links):
        tfidf = vect.fit_transform([getToken(readFile(document)), content])
        similarity.append((tfidf * tfidf.T).A[0,1])
        
    return np.mean(similarity)

# Running test

In [9]:
path = r'dir\Organisational Course.docx'
_file = path.replace('\\', '/')

In [10]:
#open the docx
document = Document(_file)
rels = document.part.rels

In [11]:
#Checking if there onlRefs works
#Checking if the document has any references as required
if (len(onlRefs(rels)) == 0):
    print("No ref to check.")
    #raise Error('The document is missing references')
else:
    print("Great work!")

Great work!


In [12]:
check_similarity(document)

0.06365056223859772