In [None]:
from google.colab import drive
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from collections import Counter
import math

# Mount Google Drive
#drive.mount('/content/drive')

# Labelled training corpus as (text, label) pairs. The text is lowercased
# while the list is built, so every later step sees case-normalised input.
train_docs = [
    (text.lower(), tag)
    for text, tag in (
        ("free free free buy discount combo pleasure", "S"),
        ("free free free discount pleasure smile smile smile", "S"),
        ("cat mouse", "N"),
        ("cat cat dog dog dog dog", "N"),
        ("mouse", "N"),
    )
]

# Documents to classify; "???" is a placeholder for the unknown label.
# Lowercased the same way as the training corpus.
test_docs = [
    (text.lower(), tag)
    for text, tag in (
        ("dog cat mouse cat", "???"),
        ("Free free smile", "???"),
    )
]

def calculate_mi(word, docs):
    """Return the mutual information (in bits) between a word and the labels.

    The word is treated as a binary event: it either occurs as a
    whitespace-separated token of a document or it does not. The score is
    the sum over every class ``c`` and both outcomes ``x`` of
    ``P(x, c) * log2(P(x, c) / (P(x) * P(c)))``, with all probabilities
    estimated as document-count fractions. Zero-probability cells are
    skipped (they contribute nothing to the sum).

    Args:
        word: Token to score.
        docs: Sequence of ``(text, label)`` pairs; labels must be hashable.

    Returns:
        The mutual information as a float. An empty ``docs`` yields ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    N = len(docs)
    if N == 0:
        # No documents means no evidence either way.
        return 0.0

    # Tokenise every document exactly once and keep only what the
    # estimate needs: (word present?, label).
    observations = [(word in doc.split(), label) for doc, label in docs]

    presence_counts = Counter(present for present, _ in observations)
    class_counts = Counter(label for _, label in observations)
    joint_counts = Counter(observations)

    P_W = presence_counts[True] / N
    P_not_W = presence_counts[False] / N

    mi = 0.0
    for c, n_c in class_counts.items():
        P_C = n_c / N
        # Word-present term first, then word-absent, for each class.
        for present, P_X in ((True, P_W), (False, P_not_W)):
            P_X_C = joint_counts[(present, c)] / N
            # A positive joint probability implies both marginals are
            # positive, so the division and the logarithm are safe.
            if P_X_C > 0:
                mi += P_X_C * math.log(P_X_C / (P_X * P_C), 2)

    return mi

# Calculate MI for each word in the training data.
# Words are scored in first-occurrence order rather than by iterating the
# set: several words can have exactly the same MI, and set iteration order
# for strings depends on hash randomisation, so the top-k cut below could
# otherwise pick different features on different runs.
vocab_in_order = list(
    dict.fromkeys(word for doc, _ in train_docs for word in doc.split())
)
all_words = set(vocab_in_order)
mi_scores = {word: calculate_mi(word, train_docs) for word in vocab_in_order}

#print(all_words)
#print(mi_scores)

# Sort and select top 2 words. sorted() is stable (also with reverse=True),
# so words with equal scores keep their first-occurrence order.
selected_features = sorted(mi_scores, key=mi_scores.get, reverse=True)[:2]
print("Selected Features:", selected_features)

def compute_tfidf(docs, selected_features, fit_docs=None):
    """Return the dense TF-IDF matrix of ``docs`` over ``selected_features``.

    Args:
        docs: Sequence of ``(text, label)`` pairs to vectorise; only the
            text is used.
        selected_features: Words passed to ``TfidfVectorizer`` as its fixed
            vocabulary.
        fit_docs: Optional sequence of ``(text, label)`` pairs used to learn
            the IDF weights. When omitted, the weights are learned from
            ``docs`` itself (the original behaviour). Pass the training
            corpus here when vectorising test documents so that both
            matrices share the same weighting.

    Returns:
        A 2-D array with one row per document in ``docs``.
    """
    tfidf_vectorizer = TfidfVectorizer(vocabulary=selected_features)
    texts = [doc for doc, _ in docs]
    if fit_docs is None:
        return tfidf_vectorizer.fit_transform(texts).toarray()

    # Learn IDF on the reference corpus only, then project `docs` with it.
    tfidf_vectorizer.fit([doc for doc, _ in fit_docs])
    return tfidf_vectorizer.transform(texts).toarray()


train_tfidf = compute_tfidf(train_docs, selected_features)
print("TF*IDF Matrix (Train):\n", train_tfidf)

# Form the representation matrix for training documents
train_labels = [label for _, label in train_docs]

# Form the TF*IDF matrix for test documents.
# The IDF weights come from the training corpus: re-fitting on the test
# documents would put them in a differently weighted space than the one
# the classifier is trained on.
test_tfidf = compute_tfidf(test_docs, selected_features, fit_docs=train_docs)
print("TF*IDF Matrix (Test):\n", test_tfidf)

# Initialize and train KNN
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(train_tfidf, train_labels)

print(test_docs)
# Predict the class labels for the test documents
test_predictions = knn.predict(test_tfidf)
print("Predicted Classes:", test_predictions)


Selected Features: ['free', 'discount']
TF*IDF Matrix (Train):
 [[0.9486833  0.31622777]
 [0.9486833  0.31622777]
 [0.         0.        ]
 [0.         0.        ]
 [0.         0.        ]]
TF*IDF Matrix (Test):
 [[0. 0.]
 [1. 0.]]
[('dog cat mouse cat', '???'), ('free free smile', '???')]
Predicted Classes: ['N' 'S']
