# **TF-IDF (without built-in library functions)**

In [None]:
import math
import string
import pandas as pd

def preprocess_document(doc):
    """Normalize a raw document string into a list of terms.

    The text is lower-cased, every character in ``string.punctuation`` is
    removed, and the remainder is split on whitespace.

    Args:
        doc: The raw document text.

    Returns:
        The list of terms in their original order.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    lowered = doc.lower()
    return lowered.translate(punctuation_table).split()

def calculate_tf(doc):
    """Return the relative frequency of each term in a tokenized document.

    Args:
        doc: Sequence of terms, one entry per occurrence.

    Returns:
        Dict mapping each distinct term to ``occurrences / len(doc)``, keyed
        in first-occurrence order. An empty document yields an empty dict.
    """
    counts = {}
    for term in doc:
        counts[term] = counts.get(term, 0) + 1
    total_terms = len(doc)
    # Divide once per term: adding 1 / total_terms for every occurrence
    # accumulates floating-point rounding error (e.g. 0.1 + 0.1 + 0.1 != 0.3).
    return {term: count / total_terms for term, count in counts.items()}

def calculate_idf(documents):
    """Compute the inverse document frequency of every term in a corpus.

    Args:
        documents: Sequence of tokenized documents (each a sequence of terms).

    Returns:
        Dict mapping each term to ``log(N / df)``, where ``N`` is the number
        of documents and ``df`` is the number of documents containing the
        term. A term present in every document therefore scores 0.0.
    """
    doc_total = len(documents)
    doc_frequency = {}
    for tokens in documents:
        # Deduplicate so a term counts at most once per document.
        for term in set(tokens):
            if term in doc_frequency:
                doc_frequency[term] += 1
            else:
                doc_frequency[term] = 1
    return {
        term: math.log(doc_total / seen_in)
        for term, seen_in in doc_frequency.items()
    }

def calculate_tfidf(documents):
    """Compute TF-IDF scores for every document in a tokenized corpus.

    Args:
        documents: Sequence of tokenized documents (each a sequence of terms).

    Returns:
        A list with one dict per document, in input order, mapping each term
        of that document to its term frequency multiplied by the term's
        corpus-wide inverse document frequency.
    """
    idf = calculate_idf(documents)
    tfidf = []
    for doc in documents:
        tf = calculate_tf(doc)
        # Repeated terms simply overwrite their own entry with the same value.
        tfidf.append({
            term: tf[term] * idf[term]
            for term in doc
            if term in tf and term in idf
        })
    return tfidf

# Sample corpus for the TF-IDF demo.
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

preprocessed_docs = [preprocess_document(doc) for doc in documents]
tfidf_scores = calculate_tfidf(preprocessed_docs)

# Document-term matrix with alphabetically ordered columns. The frame built
# from the score dicts has its columns in first-seen term order, so the columns
# are reordered by label with reindex; assigning sorted names to `.columns`
# would only relabel them and attach each score to the wrong term.
tfidf_terms = sorted({term for doc in preprocessed_docs for term in doc})
df_tfidf = pd.DataFrame(tfidf_scores).reindex(columns=tfidf_terms)

# Corpus-wide IDF map and one TF map per document, kept separately so the
# table below can show each factor next to their product.
idf_scores = calculate_idf(preprocessed_docs)
tf_scores = [calculate_tf(doc) for doc in preprocessed_docs]

# One row per term occurrence, in document order. The rows are collected first
# and the frame is built once: concatenating inside the loop would re-copy all
# previously accumulated rows on every iteration.
score_rows = []
for tf_doc, doc in zip(tf_scores, preprocessed_docs):
    for term in doc:
        tf_value = tf_doc.get(term, 0)
        idf_value = idf_scores.get(term, 0)
        score_rows.append({
            'Term': term,
            'TF': tf_value,
            'IDF': idf_value,
            'TF-IDF': tf_value * idf_value,
        })

df_scores = pd.DataFrame(score_rows, columns=['Term', 'TF', 'IDF', 'TF-IDF'])

print(df_scores)

        Term        TF       IDF    TF-IDF
0       this  0.200000  0.000000  0.000000
1         is  0.200000  0.000000  0.000000
2        the  0.200000  0.000000  0.000000
3      first  0.200000  0.693147  0.138629
4   document  0.200000  0.287682  0.057536
5       this  0.166667  0.000000  0.000000
6   document  0.333333  0.287682  0.095894
7         is  0.166667  0.000000  0.000000
8        the  0.166667  0.000000  0.000000
9     second  0.166667  1.386294  0.231049
10  document  0.333333  0.287682  0.095894
11       and  0.166667  1.386294  0.231049
12      this  0.166667  0.000000  0.000000
13        is  0.166667  0.000000  0.000000
14       the  0.166667  0.000000  0.000000
15     third  0.166667  1.386294  0.231049
16       one  0.166667  1.386294  0.231049
17        is  0.200000  0.000000  0.000000
18      this  0.200000  0.000000  0.000000
19       the  0.200000  0.000000  0.000000
20     first  0.200000  0.693147  0.138629
21  document  0.200000  0.287682  0.057536


# **BAG OF WORDS**

In [None]:
# Sample sentences used as the corpus for the bag-of-words demo.
sentences = [
    "I love natural language processing.",
    "Natural language understanding is fascinating.",
    "Processing and understanding are key NLP tasks.",
    "this is my asssignment.",
]

def tokenize(sentence):
    """Split a sentence into lower-cased word tokens.

    Characters in ``string.punctuation`` are removed before splitting so that
    a word followed by punctuation (e.g. ``"processing."``) yields the same
    token as the bare word (``"processing"``).

    Args:
        sentence: The raw sentence text.

    Returns:
        The list of tokens in their original order.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    return sentence.lower().translate(punctuation_table).split()

# Collect every distinct token across the corpus.
vocabulary = set()
for sentence in sentences:
    words = tokenize(sentence)
    vocabulary.update(words)

# Set iteration order is arbitrary, so fix the column order once by sorting
# and map each word to its column. This keeps the vectors reproducible and
# replaces a list rebuild plus linear search per word with a dict lookup.
vocabulary_list = sorted(vocabulary)
word_to_index = {word: index for index, word in enumerate(vocabulary_list)}

# One count vector per sentence, with columns following vocabulary_list.
bow_matrix = []
for sentence in sentences:
    words = tokenize(sentence)
    bow_vector = [0] * len(vocabulary_list)
    for word in words:
        bow_vector[word_to_index[word]] += 1
    bow_matrix.append(bow_vector)

for i, vector in enumerate(bow_matrix):
    print(f"Sentence {i+1} BoW Vector:", vector)

print("Vocabulary:", vocabulary_list)

Sentence 1 BoW Vector: [0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1]
Sentence 2 BoW Vector: [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1]
Sentence 3 BoW Vector: [0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 0, 0]
Sentence 4 BoW Vector: [1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]
Vocabulary: ['this', 'my', 'and', 'processing.', 'tasks.', 'love', 'asssignment.', 'i', 'key', 'is', 'are', 'understanding', 'processing', 'fascinating.', 'nlp', 'natural', 'language']


# **N-GRAMS**

In [None]:
def generate_ngrams(text, n):
    """Return all contiguous word n-grams of a text.

    Args:
        text: Input text; words are separated by whitespace.
        n: Number of words per n-gram.

    Returns:
        A list of n-grams in order of appearance, each joined by single
        spaces. The list is empty when ``n`` is less than 1 or greater than
        the number of words.
    """
    words = text.split()
    # A non-positive n has no meaningful n-grams; without this guard the
    # slicing below would produce empty or ragged strings.
    if n < 1:
        return []
    # When n exceeds the word count the range is empty, so the result is [].
    return [" ".join(words[i:i + n]) for i in range(len(words) - n + 1)]

# Demo: build the word trigrams (n = 3) of a sample sentence and print one per line.
text = "This is an example sentence for generating n-grams without built-in functions in Python"
n = 3
result = generate_ngrams(text, n)
for ngram in result:
    print(ngram)

This is an
is an example
an example sentence
example sentence for
sentence for generating
for generating n-grams
generating n-grams without
n-grams without built-in
without built-in functions
built-in functions in
functions in Python


# **COSINE SIMILARITY**

In [None]:
import numpy as np

# Two sample sentences to compare.
sentence1 = "I enjoy playing football with my friends."
sentence2 = "He enjoy to play basketball."

# Whitespace tokens, kept exactly as written (case and punctuation included).
words1, words2 = sentence1.split(), sentence2.split()

# Shared vocabulary; both vectors below follow this set's iteration order.
unique_words = set(words1 + words2)

# Term-count vector of each sentence over the shared vocabulary.
vector1 = list(map(words1.count, unique_words))
vector2 = list(map(words2.count, unique_words))

dot_product = sum(a * b for a, b in zip(vector1, vector2))

norm1, norm2 = np.linalg.norm(vector1), np.linalg.norm(vector2)

# Cosine of the angle between the two count vectors.
cosine_similarity = dot_product / (norm1 * norm2)

print("Cosine Similarity:", cosine_similarity)

Cosine Similarity: 0.16903085094570328
