<a href="https://colab.research.google.com/github/mind-matrix/research-8th-sem-project/blob/main/Simple_Bayesian_Relation_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from operator import mul
import json
from tqdm import tqdm
from functools import reduce
from sklearn.model_selection import train_test_split

In [2]:
nltk.download('stopwords')
nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
def train(df, save_to="./probabilities.json"):
    P_k_count = df['target'].value_counts().to_dict()
    P_k_N = sum(P_k_count.values())
    P_k = dict(map(lambda x: (x[0], x[1]/P_k_N), P_k_count.items()))

    P_w_k = dict()

    P_w_nk = dict()

    P_w_count = dict()

    with tqdm(total=len(df.index)) as pbar:
        for i, row in df.iterrows():
            pbar.update(1)
            context = row["context"].replace(row["h"], "").replace(row["t"], "")
            words = word_tokenize(context)
            tokens = [ w.lower() for w in words if w.lower() not in stop_words ]
            target = row["target"]
            for token in tokens:
                if token in P_w_count:
                    P_w_count[token] += 1
                else:
                    P_w_count[token] = 1
                
                if token in P_w_k:
                    if target in P_w_k[token]:
                        P_w_k[token][target] += 1
                    else:
                        P_w_k[token][target] = 1
                else:
                    P_w_k[token] = dict()
                    P_w_k[token][target] = 1
                
                for ntarget in P_k.keys():
                    if token in P_w_nk:
                        if target in P_w_nk[token]:
                            P_w_nk[token][target] += 1
                        else:
                            P_w_nk[token][target] = 1
                    else:
                        P_w_nk[token] = dict()
                        P_w_nk[token][target] = 1
    
    P_w_N = sum(P_w_count.values())
    P_w = dict(map(lambda x: (x[0], x[1]/P_w_N), P_w_count.items()))

    for w in P_w_k.keys():
        P_w_k[w] = dict(map(lambda x: ( x[0], x[1] * P_w[w]/ (x[1] + (P_w_nk[w][x[0]] if w in P_w_nk and x[0] in P_w_nk else 0)) ), P_w_k[w].items()))

    for w in P_w_nk.keys():
        P_w_nk[w] = dict(map(lambda x: ( x[0], x[1] * P_w[w] / (x[1] + (P_w_k[w][x[0]] if w in P_w_k and x[0] in P_w_k[w] else 0)) ), P_w_nk[w].items()))
    
    with open(save_to, 'w') as fd:
        json.dump({ 'P_w_k': P_w_k, 'P_w_nk': P_w_nk, 'P_k': P_k }, fd)

In [4]:
def find_class(raw_context, h, t, model_path="./probabilities.json", model=None):
    if model is None:
        with open(model_path, 'r') as fd:
            model = json.load(fd)
    P_w_k = model['P_w_k']
    P_w_nk = model['P_w_nk']
    P_k = model['P_k']
    context = raw_context.replace(h, "").replace(t, "")
    words = word_tokenize(context)
    tokens = [ w.lower() for w in words if w.lower() not in stop_words ]
    P_k_ws = dict()
    for k in P_k.keys():
        t = reduce(mul, [ P_w_k[token][k] if token in P_w_k and k in P_w_k[token] else 0 for token in tokens ], 1) * P_k[k]
        b = reduce(mul, [ P_w_k[token][k] if token in P_w_k and k in P_w_k[token] else 0 for token in tokens ], 1) * P_k[k] + reduce(mul, [ P_w_nk[token][k] if token in P_w_nk and k in P_w_nk[token] else 0 for token in tokens ], 1) * (1 - P_k[k])
        if b == 0:
            P_k_ws[k] = 0
        else:
            P_k_ws[k] = t / b
    return P_k_ws, max(P_k_ws, key=P_k_ws.get)

In [5]:
def test(df):
    with open("./probabilities.json", 'r') as fd:
        model = json.load(fd)
    score = 0
    with tqdm(total=len(df.index)) as pbar:
        for i, row in df.iterrows():
            pbar.update(1)
            probs, target = find_class(row["context"],row["h"],row["t"], model=model)
            if int(target) == int(row["target"]):
                score += 1
    return score / len(df.index)

In [6]:
df = pd.read_csv('./dataset-annotated.csv')

In [7]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

In [8]:
# Training
train(train_df)

100%|██████████| 9858/9858 [00:54<00:00, 180.40it/s]


In [9]:
# Testing
accuracy = test(test_df)
print(f'test accuracy: {accuracy}')

100%|██████████| 2465/2465 [00:34<00:00, 70.59it/s]

test accuracy: 0.2385395537525355



