# TFIDF

In [3]:
import math
from collections import Counter

# Step 1: Define a small corpus (set of documents)
corpus = [
    "The cat sat on the mat.",
    "The dog sat on the log.",
    "Dogs and cats are great pets.",
    "I love my pet cat and dog."
]

# Step 2: Preprocess the text (tokenization and lowercase)
def tokenize(text):
    return text.lower().replace('.', '').split()

tokenized_corpus = [tokenize(doc) for doc in corpus]

# Step 3: Calculate Term Frequency (TF)
def compute_tf(document):
    tf_scores = {}
    total_terms = len(document)
    term_counts = Counter(document)
    for term, count in term_counts.items():
        tf_scores[term] = count / total_terms
    return tf_scores, term_counts, total_terms

# Step 4: Calculate Inverse Document Frequency (IDF)
def compute_idf(corpus):
    idf_scores = {}
    total_documents = len(corpus)
    all_terms = set(term for document in corpus for term in document)
    for term in all_terms:
        containing_docs = sum(1 for document in corpus if term in document)
        idf_scores[term] = math.log(total_documents / (1 + containing_docs)) + 1  # Smoothing
    return idf_scores

# Step 5: Calculate TF-IDF for each document
def compute_tf_idf(corpus):
    idf_scores = compute_idf(corpus)
    tf_idf_corpus = []
    for document in corpus:
        tf_scores, term_counts, total_terms = compute_tf(document)
        tf_idf_scores = {term: tf_scores.get(term, 0) * idf_scores[term] for term in idf_scores}
        tf_idf_corpus.append((tf_idf_scores, term_counts, total_terms))
    return tf_idf_corpus, idf_scores

# Calculate TF-IDF and IDF
tf_idf_corpus, idf_scores = compute_tf_idf(tokenized_corpus)

# Step 6: Display calculations for a specific word
word = "cat"  # You can change this to any word of interest

print(f"Calculations for the word: '{word}'\n")
for i, (doc_scores, term_counts, total_terms) in enumerate(tf_idf_corpus):
    tf = term_counts.get(word, 0) / total_terms if word in term_counts else 0
    idf = idf_scores.get(word, 0)
    tf_idf = tf * idf
    doc_occurrences = term_counts.get(word, 0)
    total_docs_with_word = sum(1 for document in tokenized_corpus if word in document)
    print(f"Document {i+1}:")
    print(f"Occurrences of '{word}' in document: {doc_occurrences} out of {total_terms} total words")
    print(f"TF (Term Frequency): {doc_occurrences}/{total_terms} = {tf:.4f}")
    print(f"Occurrences of '{word}' in corpus: {total_docs_with_word} out of {len(corpus)} documents")
    print(f"IDF (Inverse Document Frequency): log({len(corpus)} / (1 + {total_docs_with_word})) + 1 = {idf:.4f}")
    print(f"TF-IDF: {tf:.4f} * {idf:.4f} = {tf_idf:.4f}\n")

# Explanation:
# - TF (Term Frequency): How often a word appears in a document divided by total terms.
# - IDF (Inverse Document Frequency): Measures how unique a word is across all documents.
# - TF-IDF: Higher scores for words that are frequent in one document but rare across the corpus.


Calculations for the word: 'cat'

Document 1:
Occurrences of 'cat' in document: 1 out of 6 total words
TF (Term Frequency): 1/6 = 0.1667
Occurrences of 'cat' in corpus: 2 out of 4 documents
IDF (Inverse Document Frequency): log(4 / (1 + 2)) + 1 = 1.2877
TF-IDF: 0.1667 * 1.2877 = 0.2146

Document 2:
Occurrences of 'cat' in document: 0 out of 6 total words
TF (Term Frequency): 0/6 = 0.0000
Occurrences of 'cat' in corpus: 2 out of 4 documents
IDF (Inverse Document Frequency): log(4 / (1 + 2)) + 1 = 1.2877
TF-IDF: 0.0000 * 1.2877 = 0.0000

Document 3:
Occurrences of 'cat' in document: 0 out of 6 total words
TF (Term Frequency): 0/6 = 0.0000
Occurrences of 'cat' in corpus: 2 out of 4 documents
IDF (Inverse Document Frequency): log(4 / (1 + 2)) + 1 = 1.2877
TF-IDF: 0.0000 * 1.2877 = 0.0000

Document 4:
Occurrences of 'cat' in document: 1 out of 7 total words
TF (Term Frequency): 1/7 = 0.1429
Occurrences of 'cat' in corpus: 2 out of 4 documents
IDF (Inverse Document Frequency): log(4 / (1 + 2

In [4]:
import math
from collections import Counter
import re

# Step 1: Define a small corpus with regex pattern matching examples
corpus = [
    "My email is john.doe@example.com and my backup is doe.john@work.org.",
    "Reach out at contact@company.net for inquiries.",
    "Invalid emails like user@@domain..com should not be captured.",
    "Another valid email: support@service.co.uk and one more: hello@domain.io."
]

# Step 2: Preprocess the text (tokenization and lowercase)
def tokenize(text):
    return text.lower().replace('.', '').split()

tokenized_corpus = [tokenize(doc) for doc in corpus]

# Step 3: Define regex pattern for email capture
email_pattern = r'\b[\w.-]+@[\w.-]+\.[a-zA-Z]{2,}\b'

# Initialize counters for TP, TN, FP, FN
TP = 0  # Correctly captured valid emails
TN = 0  # Correctly not captured invalid parts
FP = 0  # Incorrectly captured invalid parts
FN = 0  # Missed valid emails

# Define actual valid emails manually for this example
valid_emails = {
    "john.doe@example.com", "doe.john@work.org",
    "contact@company.net", "support@service.co.uk", "hello@domain.io"
}

# Step 4: Perform regex matching and classification
for i, text in enumerate(corpus):
    matches = set(re.findall(email_pattern, text))
    expected = set(email for email in valid_emails if email in text)
    false_matches = matches - expected  # Captured but invalid
    missed_matches = expected - matches  # Valid but not captured
    correct_matches = matches & expected  # Correctly captured valid emails

    TP += len(correct_matches)
    FP += len(false_matches)
    FN += len(missed_matches)
    TN += len(expected) - len(correct_matches)

# Step 5: Print the confusion matrix counts
print("Regex Matching Evaluation:")
print(f"True Positives (TP): {TP}")
print(f"True Negatives (TN): {TN}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}\n")

# Step 6: Calculate Precision, Recall, and F1 Score
precision = TP / (TP + FP) if (TP + FP) != 0 else 0
recall = TP / (TP + FN) if (TP + FN) != 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) != 0 else 0

# Step 7: Print calculated metrics with explanations
print("Performance Metrics:")
print(f"Precision: {TP} / ({TP} + {FP}) = {precision:.4f}")
print(f"Recall: {TP} / ({TP} + {FN}) = {recall:.4f}")
print(f"F1 Score: 2 * ({precision:.4f} * {recall:.4f}) / ({precision:.4f} + {recall:.4f}) = {f1_score:.4f}\n")

# Explanation:
# - TP: Correctly captured valid emails by regex.
# - TN: Non-email text correctly ignored by regex.
# - FP: Incorrect text captured as emails by regex.
# - FN: Valid emails not captured by regex.
#
# - Precision: Measures accuracy of captured emails.
# - Recall: Measures ability to capture all valid emails.
# - F1 Score: Balances precision and recall performance.


Regex Matching Evaluation:
True Positives (TP): 5
True Negatives (TN): 0
False Positives (FP): 0
False Negatives (FN): 0

Performance Metrics:
Precision: 5 / (5 + 0) = 1.0000
Recall: 5 / (5 + 0) = 1.0000
F1 Score: 2 * (1.0000 * 1.0000) / (1.0000 + 1.0000) = 1.0000

