In [54]:
#Text Similarity
import re
from collections import Counter
import pandas as pd
import numpy as np
from numpy.linalg import norm
import plotly.express as px

In [2]:
# Toy corpus: ten short sentences about spring and summer, stored under the
# 'sentences' key. Every later cell reads the text from data['sentences'].
data = {
    'sentences': [
        "As spring unfolds, the warmth of the season encourages the first blossoms to open, signaling longer days ahead.",
        "Spring brings not only blooming flowers but also the anticipation of sunny days and outdoor activities.",
        "With the arrival of spring, people begin planning their summer vacations, eager to enjoy the seasonal warmth.",
        "The mild spring weather marks the transition from the cold winter to the inviting warmth of summer.",
        "During spring, families often start spending more time outdoors, enjoying the season's pleasant temperatures and the promise of summer fun.",
        "Summer continues the season's trend of growth and warmth, with gardens full of life and days filled with sunlight.",
        "The summer season is synonymous with outdoor adventures and enjoying the extended daylight hours that began in spring.",
        "As summer arrives, the warm weather invites a continuation of the outdoor activities that people began enjoying in spring.",
        "The transition into summer brings even warmer temperatures, allowing for beach visits and swimming, much awaited since the spring.",
        "Summer vacations are often planned as the days grow longer, a pattern that starts in the spring, culminating in peak summer leisure.",
    ],
}

In [3]:
def tokenize(sentence):
    """Return a bag-of-words ``Counter`` for ``sentence``.

    Every character that is neither a word character nor whitespace is
    deleted (so an apostrophe vanishes and "season's" becomes "seasons"),
    the remaining text is lowercased, and the result is split on whitespace.

    Parameters
    ----------
    sentence : str
        Raw text to tokenize.

    Returns
    -------
    collections.Counter
        Mapping of each lowercase token to its number of occurrences.
    """
    # Strip punctuation before lowercasing; the order is kept as-is on purpose.
    stripped = re.sub(r'[^\w\s]', '', sentence)
    words = stripped.lower().split()
    return Counter(words)

In [4]:
# Corpus-wide vocabulary: total number of occurrences of every token,
# accumulated over all sentences in the corpus.
vocab = Counter()
for sentence in data['sentences']:
    vocab.update(tokenize(sentence))

In [14]:
# Show the distinct vocabulary words in alphabetical order.
print(sorted(vocab))

['a', 'activities', 'adventures', 'ahead', 'allowing', 'also', 'and', 'anticipation', 'are', 'arrival', 'arrives', 'as', 'awaited', 'beach', 'began', 'begin', 'blooming', 'blossoms', 'brings', 'but', 'cold', 'continuation', 'continues', 'culminating', 'daylight', 'days', 'during', 'eager', 'encourages', 'enjoy', 'enjoying', 'even', 'extended', 'families', 'filled', 'first', 'flowers', 'for', 'from', 'full', 'fun', 'gardens', 'grow', 'growth', 'hours', 'in', 'into', 'invites', 'inviting', 'is', 'leisure', 'life', 'longer', 'marks', 'mild', 'more', 'much', 'not', 'of', 'often', 'only', 'open', 'outdoor', 'outdoors', 'pattern', 'peak', 'people', 'planned', 'planning', 'pleasant', 'promise', 'season', 'seasonal', 'seasons', 'signaling', 'since', 'spending', 'spring', 'start', 'starts', 'summer', 'sunlight', 'sunny', 'swimming', 'synonymous', 'temperatures', 'that', 'the', 'their', 'time', 'to', 'transition', 'trend', 'unfolds', 'vacations', 'visits', 'warm', 'warmer', 'warmth', 'weather', 

In [106]:
# Build one term-count vector per sentence, keyed by the 1-based sentence
# number, with entries ordered alphabetically by word.
all_dta = {}
for row_id, sentence in enumerate(data['sentences'], start=1):
    counts = tokenize(sentence)
    # Union of this sentence's tokens and the corpus vocabulary. A Counter
    # reports 0 for a word it has never seen, which zero-fills absent words.
    ordered_words = sorted(set(counts) | set(vocab))
    vector = {word: counts[word] for word in ordered_words}
    print(vector)
    all_dta[row_id] = list(vector.values())

{'a': 0, 'activities': 0, 'adventures': 0, 'ahead': 1, 'allowing': 0, 'also': 0, 'and': 0, 'anticipation': 0, 'are': 0, 'arrival': 0, 'arrives': 0, 'as': 1, 'awaited': 0, 'beach': 0, 'began': 0, 'begin': 0, 'blooming': 0, 'blossoms': 1, 'brings': 0, 'but': 0, 'cold': 0, 'continuation': 0, 'continues': 0, 'culminating': 0, 'daylight': 0, 'days': 1, 'during': 0, 'eager': 0, 'encourages': 1, 'enjoy': 0, 'enjoying': 0, 'even': 0, 'extended': 0, 'families': 0, 'filled': 0, 'first': 1, 'flowers': 0, 'for': 0, 'from': 0, 'full': 0, 'fun': 0, 'gardens': 0, 'grow': 0, 'growth': 0, 'hours': 0, 'in': 0, 'into': 0, 'invites': 0, 'inviting': 0, 'is': 0, 'leisure': 0, 'life': 0, 'longer': 1, 'marks': 0, 'mild': 0, 'more': 0, 'much': 0, 'not': 0, 'of': 1, 'often': 0, 'only': 0, 'open': 1, 'outdoor': 0, 'outdoors': 0, 'pattern': 0, 'peak': 0, 'people': 0, 'planned': 0, 'planning': 0, 'pleasant': 0, 'promise': 0, 'season': 1, 'seasonal': 0, 'seasons': 0, 'signaling': 1, 'since': 0, 'spending': 0, 'spri

In [107]:
# Document-term matrix: one row per sentence (indexed by the keys of
# all_dta) and one column per vocabulary word, in alphabetical order.
vocab_columns = sorted(vocab)
df = pd.DataFrame.from_dict(data=all_dta, orient='index', columns=vocab_columns)
df

Unnamed: 0,a,activities,adventures,ahead,allowing,also,and,anticipation,are,arrival,...,trend,unfolds,vacations,visits,warm,warmer,warmth,weather,winter,with
1,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
2,0,1,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,1,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,1,0
5,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,2,0,0,0,...,1,0,0,0,0,0,1,0,0,2
7,0,0,1,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
8,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,0,0
9,0,0,0,0,1,0,1,0,0,0,...,0,0,0,1,0,1,0,0,0,0
10,1,0,0,0,0,0,0,0,1,0,...,0,0,1,0,0,0,0,0,0,0


In [81]:
def cosineSim(v1, v2):
    """Cosine similarity between two vectors.

    Both arguments are converted with ``np.array``. The second one is
    transposed so that two ``1 x N`` row inputs (for example single-row
    DataFrame slices) multiply to a ``1 x 1`` result; for 1-D inputs the
    transpose is a no-op and the result is a scalar.

    Parameters
    ----------
    v1, v2 : array-like
        Two 1-D vectors of equal length, or two ``1 x N`` rows.

    Returns
    -------
    numpy scalar or numpy.ndarray
        ``v1 . v2 / (|v1| * |v2|)`` in the shape produced by ``v1 @ v2.T``.
        When either input has zero norm the similarity is undefined; 0.0 is
        returned (in that same shape) instead of NaN.
    """
    v1 = np.array(v1)
    v2 = np.array(v2).T
    dot = v1 @ v2
    # norm() without an axis returns one scalar for the whole array, so the
    # denominator is a single number for both 1-D and 1 x N inputs.
    denom = norm(v1) * norm(v2)
    if denom == 0:
        # A zero vector (e.g. an empty sentence) would otherwise divide 0 by 0
        # and yield NaN with a RuntimeWarning.
        return np.zeros_like(dot, dtype=float)
    return dot / denom

In [103]:
cosines = dict()
def get_cosine_matrix():
    """Fill the module-level ``cosines`` dict with pairwise sentence similarities.

    For every row ``i`` of the global ``df`` the entry ``"sentence {i+1}"`` is
    set to the list of ``cosineSim`` values between row ``i`` and each row of
    ``df``, in row order.

    Returns
    -------
    dict
        The same ``cosines`` dict that was updated in place.
    """
    # Start from a clean slate so re-running after df changes leaves no stale rows.
    cosines.clear()
    # Size the matrix from df instead of assuming a fixed number of sentences.
    n_sentences = len(df)
    for i in range(n_sentences):
        row_i = df.iloc[[i]]
        # cosineSim returns a 1 x 1 array for two single-row frames; [0][0]
        # extracts the scalar.
        cosines[f"sentence {i + 1}"] = [
            cosineSim(row_i, df.iloc[[j]])[0][0] for j in range(n_sentences)
        ]
    return cosines
get_cosine_matrix()
print(cosines)

{'sentence 1': [1.0000000000000002, 0.3061862178478973, 0.468292905790847, 0.6064784348631228, 0.3481553119113957, 0.2857738033247041, 0.3651483716701107, 0.40089186286863665, 0.3118047822311618, 0.38575837490522974], 'sentence 2': [0.3061862178478973, 1.0, 0.22941573387056174, 0.2785430072655778, 0.26650089544451305, 0.3, 0.2795084971874737, 0.3273268353539886, 0.2727723627949905, 0.1889822365046136], 'sentence 3': [0.468292905790847, 0.22941573387056174, 0.9999999999999998, 0.5538186196949965, 0.3423811916311629, 0.3670651741928988, 0.3590924232298039, 0.40050093945740706, 0.3003757045930553, 0.3468439878096479], 'sentence 4': [0.6064784348631228, 0.2785430072655778, 0.5538186196949965, 1.0000000000000002, 0.43549417035569277, 0.297112541083283, 0.41522739926869984, 0.4862645390838647, 0.44574249416020933, 0.386024323488978], 'sentence 5': [0.3481553119113957, 0.26650089544451305, 0.3423811916311629, 0.43549417035569277, 1.0, 0.34112114616897665, 0.38138503569823684, 0.37219368415938

In [109]:
# Square similarity table: rows and columns are both labelled with the keys of
# `cosines`. Deriving the column labels from the dict (instead of a hard-coded
# range) keeps the table valid when the number of sentences changes.
sentence_labels = list(cosines)
df_cos = pd.DataFrame.from_dict(cosines, orient='index', columns=sentence_labels)
df_cos

Unnamed: 0,sentence 1,sentence 2,sentence 3,sentence 4,sentence 5,sentence 6,sentence 7,sentence 8,sentence 9,sentence 10
sentence 1,1.0,0.306186,0.468293,0.606478,0.348155,0.285774,0.365148,0.400892,0.311805,0.385758
sentence 2,0.306186,1.0,0.229416,0.278543,0.266501,0.3,0.279508,0.327327,0.272772,0.188982
sentence 3,0.468293,0.229416,1.0,0.553819,0.342381,0.367065,0.359092,0.400501,0.300376,0.346844
sentence 4,0.606478,0.278543,0.553819,1.0,0.435494,0.297113,0.415227,0.486265,0.445742,0.386024
sentence 5,0.348155,0.266501,0.342381,0.435494,1.0,0.341121,0.381385,0.372194,0.372194,0.322329
sentence 6,0.285774,0.3,0.367065,0.297113,0.341121,1.0,0.31305,0.218218,0.218218,0.188982
sentence 7,0.365148,0.279508,0.359092,0.415227,0.381385,0.31305,1.0,0.536745,0.341565,0.422577
sentence 8,0.400892,0.327327,0.400501,0.486265,0.372194,0.218218,0.536745,1.0,0.285714,0.494872
sentence 9,0.311805,0.272772,0.300376,0.445742,0.372194,0.218218,0.341565,0.285714,1.0,0.288675
sentence 10,0.385758,0.188982,0.346844,0.386024,0.322329,0.188982,0.422577,0.494872,0.288675,1.0


In [111]:
# Heatmap of the pairwise similarity table; the title and colour-bar label
# let the figure be read on its own.
fig = px.imshow(
    df_cos,
    title="Pairwise cosine similarity between sentences",
    labels=dict(color="cosine similarity"),
)
fig.show()