In [2]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.tokenize import sent_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [3]:
stopWords = stopwords.words('english')
print(stopWords)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [4]:
text1 = "Fifty people were killed and 50 others wounded in a terror attack on two mosques in Christchurch, New Zealand, police announced Sunday.The unprecedented mass shootings were carefully planned, and have shocked the usually peaceful nation. New Zealand Prime Minister Jacinda Ardern described the attack on Friday as one of her countrys darkest days. She said that the suspects held extremist views that have no place in New Zealand or anywhere else in the world."

In [5]:
text2 = "The number of people killed in Friday’s massacre in Christchurch rose to 50 when another body was discovered at the Al Noor mosque, where most of the victims were killed. A total of 34 patients injured in the attack remain in Christchurch Hospital, including 12 people in intensive care."

In [6]:
wordnet_lemmatizer = WordNetLemmatizer()
def normalize(sentence):
    l1=list()
    sentence_words = nltk.word_tokenize(sentence)
    #print(sentence_words)
    for word in sentence_words:
        #print(word)
        #print(wordnet_lemmatizer.lemmatize(word))
        l1.append(wordnet_lemmatizer.lemmatize(word, pos="v"))
        print(l1)
    return l1

In [7]:
vectorizer = TfidfVectorizer(tokenizer=normalize)
def cosine_sim(text1, text2):
    tfidf = vectorizer.fit_transform([text1, text2])
    return ((tfidf * tfidf.T).A)[0,1]

In [8]:
cosine_sim(text1,text2)

['fifty']
['fifty', 'people']
['fifty', 'people', 'be']
['fifty', 'people', 'be', 'kill']
['fifty', 'people', 'be', 'kill', 'and']
['fifty', 'people', 'be', 'kill', 'and', '50']
['fifty', 'people', 'be', 'kill', 'and', '50', 'others']
['fifty', 'people', 'be', 'kill', 'and', '50', 'others', 'wound']
['fifty', 'people', 'be', 'kill', 'and', '50', 'others', 'wound', 'in']
['fifty', 'people', 'be', 'kill', 'and', '50', 'others', 'wound', 'in', 'a']
['fifty', 'people', 'be', 'kill', 'and', '50', 'others', 'wound', 'in', 'a', 'terror']
['fifty', 'people', 'be', 'kill', 'and', '50', 'others', 'wound', 'in', 'a', 'terror', 'attack']
['fifty', 'people', 'be', 'kill', 'and', '50', 'others', 'wound', 'in', 'a', 'terror', 'attack', 'on']
['fifty', 'people', 'be', 'kill', 'and', '50', 'others', 'wound', 'in', 'a', 'terror', 'attack', 'on', 'two']
['fifty', 'people', 'be', 'kill', 'and', '50', 'others', 'wound', 'in', 'a', 'terror', 'attack', 'on', 'two', 'mosques']
['fifty', 'people', 'be', 'kill'

0.39446894813716843

In [9]:
textt1 = "This Agreement is governed by the laws of the State of Missouri without reference to its conflict of law principles."

In [10]:
textt2 = "The laws of the State of Missouri shall apply to this Agreement."

In [11]:
cosine_sim(textt1,textt2)

['this']
['this', 'agreement']
['this', 'agreement', 'be']
['this', 'agreement', 'be', 'govern']
['this', 'agreement', 'be', 'govern', 'by']
['this', 'agreement', 'be', 'govern', 'by', 'the']
['this', 'agreement', 'be', 'govern', 'by', 'the', 'laws']
['this', 'agreement', 'be', 'govern', 'by', 'the', 'laws', 'of']
['this', 'agreement', 'be', 'govern', 'by', 'the', 'laws', 'of', 'the']
['this', 'agreement', 'be', 'govern', 'by', 'the', 'laws', 'of', 'the', 'state']
['this', 'agreement', 'be', 'govern', 'by', 'the', 'laws', 'of', 'the', 'state', 'of']
['this', 'agreement', 'be', 'govern', 'by', 'the', 'laws', 'of', 'the', 'state', 'of', 'missouri']
['this', 'agreement', 'be', 'govern', 'by', 'the', 'laws', 'of', 'the', 'state', 'of', 'missouri', 'without']
['this', 'agreement', 'be', 'govern', 'by', 'the', 'laws', 'of', 'the', 'state', 'of', 'missouri', 'without', 'reference']
['this', 'agreement', 'be', 'govern', 'by', 'the', 'laws', 'of', 'the', 'state', 'of', 'missouri', 'without', 'r

0.6353562124318741

In [12]:
testttt1 = "I ate an apple yesterday."
testttt2 = "I am eating an apple right now."

In [15]:
cosine_sim(testttt1,testttt2)

['i']
['i', 'eat']
['i', 'eat', 'an']
['i', 'eat', 'an', 'apple']
['i', 'eat', 'an', 'apple', 'yesterday']
['i', 'eat', 'an', 'apple', 'yesterday', '.']
['i']
['i', 'be']
['i', 'be', 'eat']
['i', 'be', 'eat', 'an']
['i', 'be', 'eat', 'an', 'apple']
['i', 'be', 'eat', 'an', 'apple', 'right']
['i', 'be', 'eat', 'an', 'apple', 'right', 'now']
['i', 'be', 'eat', 'an', 'apple', 'right', 'now', '.']


0.5727393584196199