# Analysing Seed Dataset

In this notebook we'll use the seed dataset we generated to bootstrap a classifier.

In [1]:
# ! pip install -r ../requirements.txt

In [5]:
! pip3 install nltk



In [8]:
import sys
import json
sys.path.insert(0, '../code')

import numpy as np
import pandas as pd

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from build_seed_set import load_seed_dataset

from scipy import sparse
from sklearn.preprocessing import normalize
from sklearn.datasets import make_regression
from sklearn.multioutput import MultiOutputRegressor

ModuleNotFoundError: No module named 'regex._regex'

In [None]:
# Load the previously generated seed set from its CSV export.
seed_set_path = '../data/seed_set_data.csv'
seed_dataset = load_seed_dataset(seed_set_path)

In [None]:
# Preview the first rows only instead of dumping the whole frame into the output.
seed_dataset.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [None]:
# Build one row per (word, synset_id, example sentence).
seed_examples = seed_dataset[['word', 'synset_id', 'generated_examples', 'examples']].copy()
# Per sense: the `examples` entries followed by the `generated_examples` entries
# (presumably both columns hold lists of sentences -- verify against load_seed_dataset).
seed_examples['text'] = seed_examples.apply(lambda row: row.examples + row.generated_examples, axis=1)
seed_examples = seed_examples.loc[:, ['word', 'synset_id', 'text']].explode('text')

# Keep only sentences that literally contain the target word.
# `explode` turns an empty list into NaN, and `word in NaN` raises TypeError,
# so non-string values are treated as "does not mention the word" and dropped.
# A positional boolean array is used because the index has duplicates after
# `explode`, and an explicit bool dtype keeps the mask valid for an empty frame.
mentions_word = np.fromiter(
    (
        isinstance(word, str) and isinstance(text, str) and word in text
        for word, text in zip(seed_examples['word'], seed_examples['text'])
    ),
    dtype=bool,
    count=len(seed_examples),
)
seed_examples = seed_examples[mentions_word].reset_index(drop=True)
seed_examples

In [None]:
# Number of example sentences per synset, most frequent synset first.
seed_examples['synset_id'].value_counts(ascending=False)

In [None]:
class ModelFactory:
    """Train and score a bag-of-words classifier on example sentences.

    One ``LogisticRegression`` is fitted per label column (wrapped in a
    ``MultiOutputRegressor``) on lemmatized token counts; the predicted class
    of a sentence is the label column with the highest score.

    Parameters
    ----------
    texts : sequence of str
        Example sentences.
    labels : array-like of shape (n_samples, n_classes)
        One indicator column per class; ``evaluate`` takes the row-wise argmax
        of these columns as the true class.
    tokenizer : object
        Must provide ``lemmatize(token)`` (e.g. an nltk ``WordNetLemmatizer``).
    stop_words : container of str
        Tokens that are dropped before lemmatization.
    C : float, default 1e12
        Passed to ``LogisticRegression``.
    class_weight : default 'balanced'
        Passed to ``LogisticRegression``.
    test_size : float, default 0.2
        Passed to ``train_test_split``.
    random_state : int or None, default None
        Seed for ``train_test_split``. ``None`` keeps the split random on
        every run; pass an int for a reproducible split.
    """

    def __init__(self, texts, labels, tokenizer, stop_words, C=1e12, class_weight='balanced', test_size=0.2,
                 random_state=None):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.stop_words = stop_words
        self.C = C
        self.class_weight = class_weight
        self.test_size = test_size
        self.random_state = random_state

    def lemmatize_tokenize(self, text):
        """Tokenize ``text`` with nltk, drop stop words and lemmatize the rest."""
        tokens = nltk.word_tokenize(text)
        return [self.tokenizer.lemmatize(token) for token in tokens if token not in self.stop_words]

    def train(self):
        """Split the data, fit the vectorizer and the per-class models.

        Returns
        -------
        tuple
            ``(clf, vectorizer, X_train_counts, X_test, y_train, y_test)``.
            ``X_train_counts`` is already vectorized; ``X_test`` is still raw
            text and is vectorized inside ``evaluate``.

        Notes
        -----
        NOTE(review): ``LogisticRegression.fit`` needs both classes present in
        every label column of the training split -- confirm that rare classes
        survive the split.
        """
        X_train, X_test, y_train, y_test = train_test_split(
            self.texts,
            self.labels,
            test_size=self.test_size,
            random_state=self.random_state,
        )

        # The vocabulary is learned from the training split only.
        vectorizer = CountVectorizer(tokenizer=self.lemmatize_tokenize)
        X_train_counts = vectorizer.fit_transform(X_train)

        clf = MultiOutputRegressor(LogisticRegression(C=self.C, class_weight=self.class_weight))
        clf.fit(X_train_counts, y_train)

        return clf, vectorizer, X_train_counts, X_test, y_train, y_test

    @staticmethod
    def _class_scores(clf, X):
        """Return an (n_samples, n_classes) score matrix for ``X``.

        Uses each per-class estimator's positive-class probability when
        available. ``clf.predict`` on logistic regressions yields hard 0/1
        labels, and an argmax over those resolves every tie (several 1s, or
        all 0s) to the first column. Falls back to ``clf.predict`` for models
        that expose no per-output ``predict_proba``.
        """
        estimators = getattr(clf, 'estimators_', None)
        if (
            estimators is not None
            and len(estimators) > 0
            and all(hasattr(est, 'predict_proba') for est in estimators)
        ):
            # Classes are sorted, so the last probability column is the
            # positive (1) class of each binary problem.
            return np.column_stack([est.predict_proba(X)[:, -1] for est in estimators])
        return np.asarray(clf.predict(X))

    @staticmethod
    def _accuracy(scores, targets):
        """Fraction of rows whose top-scoring column matches the target column."""
        predicted = np.argmax(scores, axis=1)
        # Coerce to ndarray so DataFrame targets are handled positionally.
        expected = np.argmax(np.asarray(targets), axis=1)
        return np.mean(predicted == expected)

    def evaluate(self, clf, vectorizer, X_train_counts, X_test, y_train, y_test):
        """Compute accuracy on the training and the held-out split.

        Parameters are the values returned by ``train``, in the same order.

        Returns
        -------
        tuple of float
            ``(train_accuracy, test_accuracy)``.
        """
        train_accuracy = self._accuracy(self._class_scores(clf, X_train_counts), y_train)

        # The test texts are vectorized with the vocabulary fitted in ``train``.
        X_test_counts = vectorizer.transform(X_test)
        test_accuracy = self._accuracy(self._class_scores(clf, X_test_counts), y_test)

        return train_accuracy, test_accuracy

# Example usage: one-hot encode the synset ids, fit the models, then score both splits.
lemmatizer = WordNetLemmatizer()
one_hot_labels = pd.get_dummies(seed_examples['synset_id']) * 1  # * 1 gives numeric 0/1 indicators
factory = ModelFactory(seed_examples.text, one_hot_labels, lemmatizer, stop_words)

training_artifacts = factory.train()
clf, vectorizer, X_train_counts, X_test, y_train, y_test = training_artifacts
train_accuracy, test_accuracy = factory.evaluate(*training_artifacts)

print(f"Train Accuracy: {train_accuracy}, Test Accuracy: {test_accuracy}")