## How does it work?

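- **Dot product**: multiply the two vectors element by element and sum the results.
- **Magnitude**: the length of each vector, i.e. the square root of the sum of its squared elements.
- **Normalization**: divide the dot product by the product of the two magnitudes; the result is the cosine similarity.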

## From scratch

In [None]:
import math

In [None]:
def cosine_similarity(v1, v2):
    """Return the cosine similarity of two equal-length numeric vectors.

    The value is ``dot(v1, v2) / (|v1| * |v2|)``.

    Args:
        v1: Sequence of numbers (must support ``len``).
        v2: Sequence of numbers with the same length as ``v1``.

    Returns:
        A float in ``[-1.0, 1.0]``. When either vector has zero magnitude
        (including two empty vectors) the cosine is undefined and ``0.0``
        is returned.

    Raises:
        ValueError: If the two vectors differ in length.
    """
    if len(v1) != len(v2):
        raise ValueError("Vectors must have the same dimension")

    dot_product = sum(x * y for x, y in zip(v1, v2))

    magnitude1 = math.sqrt(sum(x * x for x in v1))
    magnitude2 = math.sqrt(sum(x * x for x in v2))

    # A zero vector has no direction; return a float so the return type
    # is the same on every path.
    if magnitude1 == 0 or magnitude2 == 0:
        return 0.0

    similarity = dot_product / (magnitude1 * magnitude2)

    # Rounding in the square roots can push the ratio just outside [-1, 1]
    # (e.g. identical vectors yielding 1.0000000000000002). Clamp with plain
    # comparisons so a NaN result is passed through instead of being masked.
    if similarity > 1.0:
        return 1.0
    if similarity < -1.0:
        return -1.0
    return similarity

In [None]:
# Two small 3-dimensional example vectors
vector1, vector2 = [2, 5, 1], [1, 4, 2]

# Score the pair with the from-scratch implementation, shown to 4 decimals
similarity = cosine_similarity(vector1, vector2)
print(f"Cosine similarity: {similarity:.4f}")

Cosine similarity: 0.9562


In [None]:
import numpy as np

# Numerator of the cosine formula for the example vectors:
# 2*1 + 5*4 + 1*2 = 24
dot_product = sum(x * y for x, y in zip(vector1, vector2))

In [None]:
# Denominator of the cosine formula, written out by hand:
# |vector1| = sqrt(2^2 + 5^2 + 1^2) = sqrt(30), |vector2| = sqrt(1^2 + 4^2 + 2^2) = sqrt(21)
math.sqrt(30)* math.sqrt(21)

25.099800796022265

In [None]:
# Manual check: dot product (24) divided by the magnitude product truncated to 25.09.
# The truncation is why this shows 0.95656 instead of the 0.9562 printed by cosine_similarity.
24/25.09

0.9565563969709048

## Text data example - scikit-learn

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity as sklearn_cosine_similarity
import numpy as np

In [None]:
# Toy corpus: five short sentences, each treated as one document.
# NOTE(review): "differnt" in the last sentence is a typo, but it is runtime data
# and shows up as a token in the recorded outputs, so it is left untouched here.
docs = [
    "Python is a popular programming language for data science and machine learning.",
    "Data scientists use Python libraries like NumPy, Pandas, and Scikit-learn.",
    "JavaScript is essential for web development and creating interactive websites.",
    "Natural language processing helps computers understand human language.",
    "Deep learning models can process differnt data i.e., text, images, and audio data effectively."
]

In [None]:
# Bag-of-words with default settings: learn the vocabulary from the corpus and
# build the document-term count matrix (one row per document, one column per token).
# With the defaults, tokens are lowercased and single-character words such as "a" are dropped.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs)

In [None]:
# Convert the count matrix to a dense array so every entry can be inspected;
# each value is how many times a token occurs in a document.
X.toarray()

array([[1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
        0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1,
        1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0],
       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [1, 1, 1, 0, 0, 2, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]])

In [None]:
# (number of documents, vocabulary size)
X.toarray().shape

(5, 41)

In [None]:
# Tokens of the learned vocabulary, in the same order as the columns of X
vectorizer.get_feature_names_out()

array(['and', 'audio', 'can', 'computers', 'creating', 'data', 'deep',
       'development', 'differnt', 'effectively', 'essential', 'for',
       'helps', 'human', 'images', 'interactive', 'is', 'javascript',
       'language', 'learn', 'learning', 'libraries', 'like', 'machine',
       'models', 'natural', 'numpy', 'pandas', 'popular', 'process',
       'processing', 'programming', 'python', 'science', 'scientists',
       'scikit', 'text', 'understand', 'use', 'web', 'websites'],
      dtype=object)

In [None]:
import pandas as pd

In [None]:
# Label the count matrix with its tokens so each column is readable by name
df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

In [None]:
# Show the labeled document-term table (rows are documents, columns are tokens)
df

In [None]:
# Called with a single matrix, this computes the cosine similarity between every pair of its rows
similarity_matrix = sklearn_cosine_similarity(X)

In [None]:
# Entry [i][j] is the similarity between document i and document j:
# the matrix is symmetric and the diagonal is 1 (each document compared with itself).
similarity_matrix

array([[1.        , 0.27272727, 0.28603878, 0.19069252, 0.31139958],
       [0.27272727, 1.        , 0.09534626, 0.        , 0.23354968],
       [0.28603878, 0.09534626, 1.        , 0.        , 0.08164966],
       [0.19069252, 0.        , 0.        , 1.        , 0.        ],
       [0.31139958, 0.23354968, 0.08164966, 0.        , 1.        ]])

In [None]:
# One row and one column per document
similarity_matrix.shape

(5, 5)

In [None]:
# Report every unordered pair of documents exactly once: j starts at i+1, so only
# the entries above the diagonal are read (no self-comparisons, no mirrored duplicates).
for i in range(len(docs)):
    for j in range(i+1, len(docs)):
        # Indices are zero-based; add 1 so the printed labels start at "Doc 1"
        print(f"Similarity between Doc {i+1} and Doc {j+1}: {similarity_matrix[i][j]:.4f}")

Similarity between Doc 1 and Doc 2: 0.2727
Similarity between Doc 1 and Doc 3: 0.2860
Similarity between Doc 1 and Doc 4: 0.1907
Similarity between Doc 1 and Doc 5: 0.3114
Similarity between Doc 2 and Doc 3: 0.0953
Similarity between Doc 2 and Doc 4: 0.0000
Similarity between Doc 2 and Doc 5: 0.2335
Similarity between Doc 3 and Doc 4: 0.0000
Similarity between Doc 3 and Doc 5: 0.0816
Similarity between Doc 4 and Doc 5: 0.0000
