In [2]:
import pandas as pd
import csv
import nltk
import numpy as np

In [3]:
import matplotlib.pyplot as plt

In [4]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
from scipy.spatial import distance

In [5]:
text="Hello there, General Kenobi"
tokens = word_tokenize(text)

In [6]:
bigrams_list = [grams for grams in ngrams(tokens, 2)]
print(bigrams_list)

[('Hello', 'there'), ('there', ','), (',', 'General'), ('General', 'Kenobi')]


In [7]:
tfidf_vectorizer = TfidfVectorizer(ngram_range=(2, 2))
bigrams_list = [" ".join(bigram) for bigram in bigrams_list]
tfidf_matrix = tfidf_vectorizer.fit_transform(bigrams_list)

In [8]:
# compute the TF-IDF vectors
tfidf_vectors = tfidf_vectorizer.fit_transform(bigrams_list)
# calculate the cosine similarity between the vectors
similarity_matrix = cosine_similarity(tfidf_vectors)

In [9]:
# calculate the average similarity score
average_similarity = np.mean(similarity_matrix)

# print the average similarity score
print("Average similarity score:", average_similarity)

Average similarity score: 0.125


In [10]:
text1="object-oriented programming inheritance way form new class instance called object using class already defined inheritance concept invented 1967 simula new class known derived class take inherit attribute behavior pre-existing class referred base class ancestor class intended help reuse existing code little modification inheritance provides support representation categorization computer language categorization powerful mechanism number information processing crucial human learning mean generalization known specific entity applied wider group given belongs relation established cognitive economy le information need stored specific entity particularity inheritance also sometimes called generalization is-a relationship represent hierarchy class object instance `` fruit '' generalization `` apple '' `` orange '' `` mango '' many others one consider fruit abstraction apple orange etc conversely since apple fruit i.e. apple is-a fruit apple may naturally inherit property common fruit fleshy container seed plant advantage inheritance module sufficiently similar interface share lot code reducing complexity program inheritance therefore another view dual called polymorphism describes many piece code controlled shared control code inheritance typically accomplished either overriding replacing one method exposed ancestor adding new method exposed ancestor complex inheritance inheritance used within design sufficiently mature may lead yo-yo problem "
text2="inheritance ability subclass inherit default protected public attribute method superclass object except java.lang.object cast object one superclass however object cast class relative example inheritance class living thing attribute like weight age class animal plant virus fungi subclass class living thing animal unique attribute organ hair etc method walking mating etc. also inherit attribute method superclass animal treated cast living thing however animal treated fungi object oriented programming inheritance also dependant access level modifier example private attribute method inherited virtual attribute method shadowed/overridden java attribute method implicitly virtual object variable store reference class subclass i.e specialised version however object variable store reference superclass i.e le specialised version original class "

In [74]:
text1 = "Original texts are rarely found but their copies are readily available on the internet"
text2 = "Additional texts are rarely seen but some can be available on the internet"

In [75]:
def tokenization(text1,text2,n):
    tokens1 = word_tokenize(text1)
    tokens2 = word_tokenize(text2)
    ngrams_list1 = [grams for grams in ngrams(tokens1, n)]
    ngrams_list2 = [grams for grams in ngrams(tokens2, n)]
    ngrams_list1 = [" ".join(ngram) for ngram in ngrams_list1]
    ngrams_list2 = [" ".join(ngram) for ngram in ngrams_list2]
    return ngrams_list1,ngrams_list2

In [76]:
def calculate_similarity(text1,text2,n):
    # initialize the TfidfVectorizer
    tfidf_vectorizer = TfidfVectorizer()
    #tokenize lists first
    ngrams_list1,ngrams_list2=tokenization(text1,text2,n)
    #print(ngrams_list1)
    #print("-----")
    #print(ngrams_list2)
    # compute the TF-IDF vectors
    tfidf_vectors1 = tfidf_vectorizer.fit_transform(ngrams_list1)
    tfidf_vectors2 = tfidf_vectorizer.transform(ngrams_list2)
    # calculate the cosine similarity between the vectors
    similarity_score = cosine_similarity(tfidf_vectors1, tfidf_vectors2)[0][0]
    return similarity_score

In [77]:
similarity_score=calculate_similarity(text1,text2,3)
print(similarity_score)

0.7165721057262565


In [14]:
df=pd.read_csv("combined_data.csv")
df["bigram_similarity"] = None

In [15]:
# Iterate through the first 10 rows of the dataframe
for index, row in df.iterrows():
        #text1="person horse jump broken airplane ."
        #text2="person ."
        text1 = row["Original"]
        text2 = row["Additional"]
        #print(text1)
        #print(text2)
        similarity = calculate_similarity(text1, text2,2)
        df.at[index, "bigram_similarity"] = similarity
# save the dataframe to a new csv file
df.to_csv("added.csv", index=False)
            

In [16]:
df=pd.read_csv("added2.csv")
df["bigram_similarity"] = None

In [17]:
# Iterate through the first 10 rows of the dataframe
for index, row in df.iterrows():
        #text1="person horse jump broken airplane ."
        #text2="person ."
        text1 = row["Original"]
        text2 = row["Additional"]
        #print(text1)
        #print(text2)
        similarity = calculate_similarity(text1, text2,2)
        df.at[index, "biigram_similarity"] = similarity
# save the dataframe to a new csv file
df.to_csv("added3.csv", index=False)
            

In [18]:
df.head()

Unnamed: 0,Original,Additional,label,bigram_similarity,trigram_similarity,biigram_similarity
0,object-oriented programming inheritance way fo...,inheritance basic concept object-oriented prog...,0,,0.170391,0.0
1,object-oriented programming inheritance way fo...,inheritance basic concept object oriented prog...,0,,0.170391,0.0
2,object-oriented programming inheritance way fo...,inheritance object oriented programming new cl...,1,,0.826369,0.404976
3,object-oriented programming inheritance way fo...,inheritance object oriented programming way fo...,1,,0.826369,0.404976
4,object-oriented programming inheritance way fo...,object-oriented programming inheritance way fo...,1,,1.0,1.0


In [19]:
df.tail()

Unnamed: 0,Original,Additional,label,bigram_similarity,trigram_similarity,biigram_similarity
90,mathematics computer science dynamic programmi...,computer science mathematics dynamic programmi...,1,,1.0,0.434788
91,mathematics computer science dynamic programmi...,mathematics computer science dynamic programmi...,1,,1.0,1.0
92,mathematics computer science dynamic programmi...,mathematics computer science dynamic programmi...,1,,1.0,1.0
93,mathematics computer science dynamic programmi...,dynamic programming method providing solution ...,1,,0.0,0.0
94,mathematics computer science dynamic programmi...,dynamic programming method efficiently solving...,0,,0.0,0.0


In [37]:
df=pd.read_csv("final.csv")

In [38]:
df.head()

Unnamed: 0,Original,Additional,label,trigram_similarity,bigram_similarity
0,object-oriented programming inheritance way fo...,inheritance basic concept object-oriented prog...,0,0.170391,0.0
1,object-oriented programming inheritance way fo...,inheritance basic concept object oriented prog...,0,0.170391,0.0
2,object-oriented programming inheritance way fo...,inheritance object oriented programming new cl...,1,0.826369,0.404976
3,object-oriented programming inheritance way fo...,inheritance object oriented programming way fo...,1,0.826369,0.404976
4,object-oriented programming inheritance way fo...,object-oriented programming inheritance way fo...,1,1.0,1.0


In [34]:
from collections import Counter

#text1 = "The quick brown fox jumps over the lazy dog."
#text2 = "A quick brown fox was spotted playing with the lazy dog."


# Tokenize the text into bigrams
bigrams1 = [" ".join(bigram) for bigram in ngrams(word_tokenize(text1), 2)]
bigrams2 = [" ".join(bigram) for bigram in ngrams(word_tokenize(text2), 2)]

# Convert the bigrams lists to sets
set1 = set(bigrams1)
set2 = set(bigrams2)

# Calculate the Jaccard similarity
jaccard_similarity = len(set1.intersection(set2)) / len(set1.union(set2))

print("Jaccard similarity:", jaccard_similarity)


Jaccard similarity: 0.416988416988417


In [43]:
from collections import Counter

In [63]:
df=pd.read_csv("final.csv")
df["bigram_jaccard_similarity"] = None

In [64]:
def tokenization(text1,text2,n):
    tokens1 = word_tokenize(text1)
    tokens2 = word_tokenize(text2)
    ngrams_list1 = [grams for grams in ngrams(tokens1, n)]
    ngrams_list2 = [grams for grams in ngrams(tokens2, n)]
    ngrams_list1 = [" ".join(ngram) for ngram in ngrams_list1]
    ngrams_list2 = [" ".join(ngram) for ngram in ngrams_list2]
    return ngrams_list1,ngrams_list2

In [80]:
def calculate_similarity(text1,text2,n):
    # Convert the bigrams lists to sets
    set1 = set(bigrams1)
    set2 = set(bigrams2)

    # Calculate the Jaccard similarity
    jaccard_similarity = len(set1.intersection(set2)) / len(set1.union(set2))
    return jaccard_similarity

In [66]:
# Iterate through the first 10 rows of the dataframe
for index, row in df.iterrows():
        text1 = row["Original"]
        text2 = row["Additional"]
        similarity = calculate_similarity(text1, text2,2)[0][0]
        df.at[index, "bigram_jaccard_similarity"] = similarity
# save the dataframe to a new csv file
df.to_csv("final2.csv", index=False)

In [82]:
df=pd.read_csv("final2.csv")
df["trigram_jaccard_similarity"] = None
# Iterate through the first 10 rows of the dataframe
for index, row in df.iterrows():
        text1 = row["Original"]
        text2 = row["Additional"]
        bigrams1 = [" ".join(bigram) for bigram in ngrams(word_tokenize(text1), 3)]
        bigrams2 = [" ".join(bigram) for bigram in ngrams(word_tokenize(text2), 3)]
        similarity = calculate_similarity(bigrams1, bigrams2,3)
        df.at[index, "trigram_jaccard_similarity"] = similarity
# save the dataframe to a new csv file
df.to_csv("final2.csv", index=False)

In [73]:
text1 = "Original texts are rarely found but their copies are readily available on the internet"
text2 = "Additional texts are rarely seen but some can be available on the internet"
bigrams1 = [" ".join(bigram) for bigram in ngrams(word_tokenize(text1), 2)]
bigrams2 = [" ".join(bigram) for bigram in ngrams(word_tokenize(text2), 2)]
similarity = calculate_similarity(bigrams1, bigrams2,2)
print(similarity);

0.25
