# Project: Sentences similarity

## Team members:

* Khanh Duong TRAN.
* Brandon NGUELEWI TOUTSAP.

## Goal: Compare two sentences similarity by calculating Cosine similarity
### Main ideas:

#### 1. Vectorize the sentences.
#### 2. Calculate Cosine similarity.
#### 3. Compare the similarity.
#### 4. Visualization between the 2 vectors.

### We will try to compare these two sentences:

 * "The weather is hot today!"
 * "It is a hot day."

#### Import libraries

In [52]:
import numpy as np

### Vectorize sentences

#### Step includes:
##### 1. Process the sentences.
##### 2. Tokenize the sentences.
##### 3. Convert into array.

##### Process the sentences:
###### Step includes:

###### 1. Remove non-word character such as punctuation, numbers, emojis, etc.
###### 2. Remove redundant spaces.
###### 3. Convert all to lower case.

In [53]:
test_string = "I haVe 20 a nAme10. :)"

In [54]:
#Remove non-word
only_w_string = ''.join(char for char in test_string if char.isalpha() or char.isspace())
print(only_w_string)

I haVe  a nAme 


In [55]:
#Remove redundant whitespace
no_space_string = ' '.join(only_w_string.split())
print(no_space_string)

I haVe a nAme


In [56]:
#Convert to lowercase
to_lower_string = no_space_string.lower()
print(to_lower_string)

i have a name


In [57]:
def process_sentence(sentence):
    """ Preprocess the sentence before converting into array.

    Argument:
    sentence: The sentence we want to preprocess.

    Output: Clean sentence.
    """
    cleaned_sentence: str = ""
    only_w: str = ''.join(char for char in sentence if char.isalpha() or char.isspace())
    no_space: str = ' '.join(only_w.split())
    to_lwer: str = no_space.lower()
    cleaned_sentence = to_lwer

    return cleaned_sentence

In [58]:
#Test the function
assert process_sentence(test_string) == "i have a name"

##### Tokenize the sentences:
###### Step includes:

###### 1. Split the sentences into each word individually.
###### 2. Create a vocabulary to vectorize.

In [59]:
test_string_2 = "my mathematics for data science project is so awesome that everybody in my class loves it"

In [60]:
#Split individually to create a vocab
words = test_string_2.split()

In [61]:
sets = set(sorted(words))

dict = {word: index for index, word in enumerate(sets)}
print(dict)

{'that': 0, 'so': 1, 'awesome': 2, 'is': 3, 'it': 4, 'science': 5, 'loves': 6, 'class': 7, 'everybody': 8, 'my': 9, 'mathematics': 10, 'project': 11, 'for': 12, 'in': 13, 'data': 14}


In [62]:
def create_vocab(sentence):
    """ Return a dictionary of vocabulary for a list of words

    Argument:
    sentence: List of words that we want to create a dictionary of vocabulary.
    """
    unique_word_set: set = set(sorted(sentence))
    vocab: dict = {word: index for index, word in enumerate(unique_word_set)}

    return vocab

In [63]:
#assert create_vocab(words) == {'project': 0, 'is': 1, 'in': 2, 'it': 3, 'class': 4, 'loves': 5, 'mathematics': 6, 'science': 7, 'so': 8, 'for': 9, 'my': 10, 'data': 11, 'that': 12, 'awesome': 13, 'everybody': 14}

##### Convert into array:
###### Base on the created vocabulary to convert a sentence into an array

In [64]:
import numpy as np

vocabs = create_vocab(words)
arr_size_vocab = np.zeros(len(vocabs))
print(vocabs)
print(arr_size_vocab)

{'that': 0, 'so': 1, 'awesome': 2, 'is': 3, 'it': 4, 'science': 5, 'loves': 6, 'class': 7, 'everybody': 8, 'my': 9, 'mathematics': 10, 'project': 11, 'for': 12, 'in': 13, 'data': 14}
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [65]:
test_string_3 = "My project is awesome"

cleaned_2 = process_sentence(test_string_3)
words_2 = cleaned_2.split()

for word in words_2:
    if word in vocabs:
        arr_size_vocab[vocabs[word]] += 1

print(arr_size_vocab)

[0. 0. 1. 1. 0. 0. 0. 0. 0. 1. 0. 1. 0. 0. 0.]


In [66]:
def vectorize(words_list, vocabulary):
    """ Return the vector of the sentence.

    Argument:
    words_list: list of word to make a vector.
    vocabulary: a dictionary of vocabulary from the sentences.
    """
    sentence_vector: np.ndarray = np.zeros(len(vocabulary))

    for word in words_list:
        if word in vocabulary:
            sentence_vector[vocabulary[word]] += 1

    return sentence_vector

In [67]:
assert all(x == 0 or x == 1 for x in vectorize(words_2, vocabs))

#### Putting all together for vectorizing the sentences
##### This section summarizes all the previous steps before proceeding to the Cosine similarity calculation.

In [68]:
def process_sentence(sentence):
    """ Preprocess the sentence and convert into an array of words.

    Argument:
    sentence: The sentence we want to preprocess.

    Output: Array of words.
    """

    #Remove non-word
    cleaned_sentence: str = ''.join(char for char in sentence if char.isalpha() or char.isspace())

    #Remove redundant whitespaces
    cleaned_sentence = ' '.join(cleaned_sentence.split())

    #Convert to lowercase
    cleaned_sentence = cleaned_sentence.lower()

    return cleaned_sentence.split()

def create_vocab(sentences):
    """ Return a dictionary of vocabulary from two sentences

    Argument:
    sentences: List of sentences that we want to create a dictionary of vocabulary.
    """

    words = []
    for sentence in sentences:
        words_arr: np.ndarray = process_sentence(sentence)
        words += words_arr

    unique_word_set: set = set(sorted(words))
    vocab: dict = {word: index for index, word in enumerate(unique_word_set)}

    return vocab

def vectorize(sentence, vocabulary):
    """ Return the vector of the sentence.

    Argument:
    sentence: The sentence we want to vectorize.
    vocabulary: A dictionary of vocabulary from the predefined sentences.
    """
    sentence_vector: np.ndarray = np.zeros(len(vocabulary))

    words_list: np.ndarray = process_sentence(sentence)
    for word in words_list:
        if word in vocabulary:
            sentence_vector[vocabulary[word]] += 1

    return sentence_vector

In [69]:
sentence_1 = "The weather is hot today!"
sentence_2 = "It is a hot day."
sentences = np.array([sentence_1, sentence_2])

vocabulary = create_vocab(sentences)
print(vocabulary)

print(vectorize(sentence_1, vocabulary))
print(vectorize(sentence_2, vocabulary))

{'a': 0, 'the': 1, 'is': 2, 'it': 3, 'weather': 4, 'day': 5, 'hot': 6, 'today': 7}
[0. 1. 1. 0. 1. 0. 1. 1.]
[1. 0. 1. 1. 0. 1. 1. 0.]


In [70]:
vector_1 = vectorize(sentence_1, vocabulary)
vector_2 = vectorize(sentence_2, vocabulary)

## Calculate Cosine similarity

### The formula is:

$$
\cos(θ) =  \frac{A \cdot B}{\|A\| \times \|B\|}
$$

In [77]:
product: float = 0
product = sum(vector_1[i] * vector_2[i] for i in range(len(vector_1)))

print(product)

2.0


In [78]:
def calculate_dot_product(vector1, vector2):
    """ Return the dot product of two vectors.

    Argument:
    vector1 and vector2: two vectors.
    """
    dot_product: float = 0

    #Formula: dot = a1*b1 + a2*b2 ... + an*bn, with n is the size of the array.
    dot_product = sum(vector_1[i] * vector_2[i] for i in range(len(vector_1)))

    return dot_product

In [79]:
calculate_dot_product(vector_1, vector_2)

2.0

In [80]:
assert calculate_dot_product(vector_1, vector_2) == np.dot(vector_1, vector_2)

norm_1: float = 0
norm_2: float = 0

for i in range(len(vector_1)):
    norm_1 = np.sqrt()


## Compare

### We define the threshold for the similarity is 0.5.

In [None]:
#from numpy import dot
#from numpy.linalg import norm
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import string

def preprocess_sentence(sentence):
    sentence = sentence.lower()
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))
    words = sentence.split()
    return words

def create_vocabulary(sentences):
    vocabulary = set()
    for sentence in sentences:
        words = preprocess_sentence(sentence)
        vocabulary.update(words)
    return {word: index for index, word in enumerate(vocabulary)}

def vectorize_sentence(sentence, vocabulary):
    vector = [0] * len(vocabulary)
    words = preprocess_sentence(sentence)
    for word in words:
        if word in vocabulary:
            vector[vocabulary[word]] += 1
    return vector

sentences = ["This is the first sentence.", "This is the second sentence."]
vocabulary = create_vocabulary(sentences)

vector1 = vectorize_sentence(sentences[0], vocabulary)
vector2 = vectorize_sentence(sentences[1], vocabulary)

print(vector1)
print(vector2)
#print(vocabulary)

In [None]:
similitude = dot(vector1, vector2)/(norm(vector1)*norm(vector2))
similitude_percentage = similitude * 100
print('Degree of similarity: {:.2f}%'.format(similitude_percentage))

### Visualization

In [None]:
angle_rad = np.arccos(similitude)
angle_deg = np.degrees(angle_rad)

plt.figure(figsize=(6, 6))

# Plot Vector 1 as an arrow
plt.arrow(0, 0, vector1[0], vector1[1], color='blue', width=0.05, label='Vector 1', alpha=0.5)
# Plot Vector 2 as an arrow
plt.arrow(0, 0, vector2[0], vector2[1], color='orange', width=0.05, label='Vector 2', alpha=0.5)

plt.xlim(-0.5, 1.5)
plt.ylim(-0.5, 1.5)

# Add the angle between vectors as text annotation
plt.text(0.1, 1.2, f'Angle: {angle_deg:.2f}°', fontsize=12)

plt.xlabel('Dimension 1')
plt.ylabel('Dimension 2')
plt.title('Vector Representation and Angle')
plt.legend()
plt.grid(True)
plt.tight_layout()

plt.show()
