# Plagarism Checker - 17PW24

## Question 1

### Add dataset from the problem

In [48]:
data = [ 
    'Information requirement: query considers the user feedback as information requirement to search',
    'Information retrieval: query depends on the model of information retrieval used',
    'Prediction problem: Many problems in information retrieval can be viewed as prediction problems',
    'Search: A search engine is one of applications of information retrieval models',
]

### Split title and content from the dataset

In [49]:
titles = [] 
contents = []
for doc in data:
    split_data = doc.strip().split(':')
    titles.append(split_data[0].rstrip().lower())
    contents.append(split_data[1].lstrip())

### Get all stop words

In [50]:
from nltk.corpus import stopwords
stopwords_all = stopwords.words("english")

### Tokenize and lemmatize the data

In [51]:
# import nltk
# nltk.download('wordnet')

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer, word_tokenize

englishLemmatizer = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')

# For title
title_set = set([])
for title in titles:
    lematized_word_list = [englishLemmatizer.lemmatize(word.lower()) for word in tokenizer.tokenize(title) if word not in stopwords_all]
    title_set.update(lematized_word_list)
print(title_set)

# For content
content_set = set([])
for content in contents:
    lematized_word_list = [englishLemmatizer.lemmatize(word.lower()) for word in tokenizer.tokenize(content) if word not in stopwords_all]
    content_set.update(lematized_word_list)
print(content_set)



{'requirement', 'prediction', 'information', 'search', 'retrieval', 'problem'}
{'user', 'search', 'query', 'information', 'requirement', 'prediction', 'retrieval', 'viewed', 'many', 'used', 'application', 'problem', 'feedback', 'depends', 'one', 'model', 'considers', 'a', 'engine'}


### Setting up a pipeline of several functions as prescribed in the problem

In [52]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

def get_preprocessed_content(contents):
    content_set = set([])
    for content in contents:
        lematized_word_list = [englishLemmatizer.lemmatize(word.lower()) for word in tokenizer.tokenize(content) if word not in stopwords_all]
        content_set.update(lematized_word_list)
    return list(content_set)

def fit_count_frequency(vocabulary, contents, binary=False):
    term_freq = CountVectorizer(vocabulary=vocabulary, binary=binary).transform(contents).toarray()
    return term_freq

def get_tfidf(vocabulary, contents):
    term_freq = fit_count_frequency(vocabulary, contents, binary=False)
    weights_term_per_doc = TfidfTransformer().fit_transform(term_freq).toarray()
    return term_freq, weights_term_per_doc

def fit_tfidf(vocabulary, contents):
    pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)),('tfid', TfidfTransformer())]).fit(contents)
    term_freq = pipe['count'].transform(contents).toarray()
    weights_term_per_doc = pipe['tfid'].fit_transform(term_freq).toarray()
    return term_freq, weights_term_per_doc

### Computing the values from the defined functions

In [53]:
title_array = fit_count_frequency(list(title_set), titles, True)
print(title_array)

term_freq, weights_term_per_doc = fit_tfidf(list(content_set), contents)
print(weights_term_per_doc)

[[1 0 1 0 0 0]
 [0 0 1 0 1 0]
 [0 1 0 0 0 1]
 [0 0 0 1 0 0]]
[[0.42580171 0.33570696 0.33570696 0.222201   0.42580171 0.
  0.         0.         0.         0.         0.         0.
  0.42580171 0.         0.         0.         0.42580171 0.
  0.        ]
 [0.         0.         0.38014737 0.25161565 0.         0.
  0.30776206 0.         0.         0.48216873 0.         0.
  0.         0.48216873 0.         0.48216873 0.         0.
  0.        ]
 [0.         0.         0.         0.2720387  0.         0.52130524
  0.33274238 0.52130524 0.52130524 0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.        ]
 [0.         0.43391936 0.         0.28720678 0.         0.
  0.35129512 0.         0.         0.         0.         0.
  0.         0.         0.55037169 0.         0.         0.
  0.55037169]]


### Define function to compute cosine similarity


In [54]:
import pandas as pd
import numpy as np

def cosine_similarity_fn(td_matrix, query_w):
    numerator = np.matmul(td_matrix,np.transpose(query_w)).reshape(1,-1)
    square_sum_td = np.sum(np.square(td_matrix),axis=1)
    square_sum_query = np.repeat(np.sum(np.square(query_w)),td_matrix.shape[0])
    denom = np.multiply(square_sum_td, square_sum_query)
    return numerator/(denom)

### Add new documents and check for plagarism

In [55]:
queries = [
    'Feedback: feedback is typically used by the system to modify the query and improve prediction',
    'information retrieval: ranking in information retrieval algorithms depends on user query'
]

for query in queries:
    title, body = query.strip().split(":")
    body = body.lstrip().lower()
    header_freq = fit_count_frequency(list(title_set),[title.strip().lower()],True).ravel()
    diff = np.sum(np.abs(title_array - header_freq), axis=1)
    if not np.all(diff):
        print("Duplicate Heading Detected: ", title)
    else:
        query_term,query_idf = get_tfidf(list(content_set), [body])
        cosine_similarities = cosine_similarity_fn(weights_term_per_doc, query_idf).ravel()
        if max(cosine_similarities) < 0.85:
            rankedlist = sorted(range(len(cosine_similarities)), key=lambda i: cosine_similarities[i], reverse=True)[:10]
            print("Top related documents for query", query)
            for index in rankedlist:
                print("Document "+ str(index+1),"\t", cosine_similarities[index])
                titles.append(title.strip().lower())
                contents.append(body.strip().lower())
                title_list = get_preprocessed_content(titles)
                content_list = get_preprocessed_content(contents)
                header_array = fit_count_frequency(title_list, titles, True)
                weights_term_per_doc = fit_tfidf(content_list, contents)
                print("=============================")
        else:
            print("Plagirism detected with existing document")
            print("Document related closer to ", sorted(range(len(cosine_similarities)), key=lambda i: cosine_similarities[i], reverse=True)[0])


Top related documents for query Feedback: feedback is typically used by the system to modify the query and improve prediction
Document 2 	 0.4311580502744009
Document 1 	 0.38075433594185293
Document 3 	 0.26065261909906917
Document 4 	 0.0
Duplicate Heading Detected:  information retrieval


### Code for generating K shingles and binary jaccard similariy

In [59]:
def generate_kshingles(k,contents):
    c_vectorizer = CountVectorizer(analyzer='word', ngram_range=(k,k))
    return c_vectorizer, c_vectorizer.fit_transform(contents).toarray()

def jaccard_binary(x,y):
    intersection = np.logical_and(x, y)
    union = np.logical_or(x, y)
    similarity = intersection.sum() / float(union.sum())
    return similarity


### Generating 3-shingles

In [64]:
vectorizer, count_array = generate_kshingles(3 ,contents)
for i in range(count_array.shape[0]):
    for j in range(count_array.shape[0]):
        if i != j and i < j:
            jaccard_similarity = jaccard_binary(count_array[i], count_array[j])
            print("Similarity between ", i+1 , " and ", j+1 , " is ", jaccard_similarity)

Similarity between  1  and  2  is  0.0
Similarity between  1  and  3  is  0.0
Similarity between  1  and  4  is  0.0
Similarity between  1  and  5  is  0.0
Similarity between  1  and  6  is  0.0
Similarity between  1  and  7  is  0.0
Similarity between  1  and  8  is  0.0
Similarity between  2  and  3  is  0.0
Similarity between  2  and  4  is  0.07142857142857142
Similarity between  2  and  5  is  0.0
Similarity between  2  and  6  is  0.0
Similarity between  2  and  7  is  0.0
Similarity between  2  and  8  is  0.0
Similarity between  3  and  4  is  0.0
Similarity between  3  and  5  is  0.0
Similarity between  3  and  6  is  0.0
Similarity between  3  and  7  is  0.0
Similarity between  3  and  8  is  0.0
Similarity between  4  and  5  is  0.0
Similarity between  4  and  6  is  0.0
Similarity between  4  and  7  is  0.0
Similarity between  4  and  8  is  0.0
Similarity between  5  and  6  is  1.0
Similarity between  5  and  7  is  1.0
Similarity between  5  and  8  is  1.0
Similarit

## Question 2

### Source Code Plagarism Detection

### Function to read code from the specified directory

In [91]:
def read_code(dir_name):
    import os
    data = []
    for filename in os.listdir(dir_name):
        with open(os.path.join(dir_name, filename), 'r') as f:
            doc_content = ""
            for line in f.readlines():
                doc_content += line.strip() + " "
            data.append(doc_content)
    return data

### Read original code

In [92]:
data = read_code('code')
print(data)

['for i in range(0, 5): print("Hello Python") ']


### Preproccess the python code

In [93]:
processed_data = get_preprocessed_content(data)
print(processed_data)

term_freq_cs, weights_term_per_doc_cs = fit_tfidf(processed_data, data)
print(weights_term_per_doc_cs)

['5', 'range', '0', 'python', 'print', 'hello']
[[0.  0.5 0.  0.5 0.5 0.5]]


### Read duplicate code for queries

In [94]:
queries = read_code('dup_code')
print(queries)

['print("Hello Python") print("Hello Python") print("Hello Python") print("Hello Python") print("Hello Python") ']


### Compare docs with cosine similarity

In [96]:
for i,query in enumerate(queries):
    body = query.lower()
    query_term,query_idf = get_tfidf(processed_data, [body])
    cosine_similarities = cosine_similarity_fn(weights_term_per_doc_cs, query_idf).ravel()
    print("Similarity of p{0} with Documents corpus".format(i+1))
    for cs_i,cs in enumerate(cosine_similarities):
        print("Code-{0}".format(cs_i)," ",cs)
    print("=======================")


Similarity of p1 with Documents corpus
Code-0   0.8660254037844386
