# Analysing Seed Dataset

In this notebook we'll use the seed dataset we generated to bootstrap a classifier.

In [1]:
import sys
import json
sys.path.insert(0, '../code')

import numpy as np
import pandas as pd

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

from build_seed_set import load_seed_dataset

from scipy import sparse
from sklearn.preprocessing import normalize
from sklearn.datasets import make_regression
from sklearn.multioutput import MultiOutputRegressor

In [2]:
seed_dataset = load_seed_dataset('../data/seed_set_data.csv')

INFO:root:Saving DataFrame to ../data/seed_set_data.csv


In [3]:
seed_dataset

Unnamed: 0,sense_key,synset_id,word,definition,examples,prompt,generated_text,generated_examples
0,action%1:04:02::,action.n.01,action,something done (usually as opposed to somethin...,[there were stories of murders and other unnat...,"Below is one definition for ""action"" with exam...","Below is one definition for ""action"" with exam...",[there were stories of murders and other unnat...
1,action%1:04:04::,action.n.09,action,an act by a government body or supranational o...,[recent federal action undermined the segregat...,"Below is one definition for ""action"" with exam...","Below is one definition for ""action"" with exam...",[recent federal action undermined the segregat...
2,activist%1:18:00::,militant.n.01,militant,a militant reformer,[],"Below is one definition for ""militant"" with ex...","Below is one definition for ""militant"" with ex...",[The militant activist was arrested during a p...
3,advance%1:11:01::,improvement.n.01,improvement,a change for the better; progress in development,[],"Below is one definition for ""improvement"" with...","Below is one definition for ""improvement"" with...",[The new employee has made significant improve...
4,adviser%1:18:00::,adviser.n.01,adviser,an expert who gives advice,[an adviser helped students select their cours...,"Below is one definition for ""adviser"" with exa...","Below is one definition for ""adviser"" with exa...",[an adviser helped students select their cours...
...,...,...,...,...,...,...,...,...
126,week%1:28:02::,week.n.03,week,a period of seven consecutive days starting on...,[],"Below is one definition for ""week"" with exampl...","Below is one definition for ""week"" with exampl...","[I have a big project due next week., She work..."
127,working_group%1:14:00::,working_group.n.01,working_group,a group of people working together temporarily...,[the working group was supposed to report back...,"Below is one definition for ""working_group"" wi...","Below is one definition for ""working_group"" wi...",[the working group was supposed to report back...
128,world%1:05:00::,world.n.08,world,all of the living human inhabitants of the earth,"[all the world loves a lover, she always used ...","Below is one definition for ""world"" with examp...","Below is one definition for ""world"" with examp...","[all the world loves a lover, she always used ..."
129,world%1:17:00::,earth.n.01,earth,the 3rd planet from the sun; the planet we liv...,"[the Earth moves around the sun, he sailed aro...","Below is one definition for ""earth"" with examp...","Below is one definition for ""earth"" with examp...","[the Earth moves around the sun, he sailed aro..."


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [5]:
seed_examples = seed_dataset[['word', 'synset_id', 'generated_examples', 'examples']].copy()
seed_examples['text'] = seed_examples.apply(lambda x: x.examples + x.generated_examples, axis=1)
seed_examples = seed_examples.loc[:, ['word', 'synset_id', 'text']]
seed_examples = seed_examples.explode('text')
seed_examples = seed_examples[seed_examples.apply(lambda x: x.word in x.text, axis=1)]
seed_examples = seed_examples.reset_index(drop=True)
seed_examples

Unnamed: 0,word,synset_id,text
0,action,action.n.01,there were stories of murders and other unnatu...
1,action,action.n.01,there were stories of murders and other unnatu...
2,action,action.n.01,The company took action to address the employe...
3,action,action.n.01,The government took action to stabilize the ec...
4,action,action.n.01,The police took action against the criminal or...
...,...,...,...
569,world,world.n.08,"The world is home to many different languages,..."
570,year,year.n.01,she is 4 years old
571,year,year.n.01,in the year 1920
572,year,year.n.01,she is 4 years old


In [6]:
seed_examples.synset_id.value_counts(ascending=False)

synset_id
meeting.n.01      11
text.n.01         11
state.n.04        10
capital.n.06      10
plan.n.01          9
                  ..
procedure.n.01     2
talk.n.01          2
kind.n.01          2
show.n.01          2
plot.n.01          2
Name: count, Length: 100, dtype: int64

In [7]:
class ModelFactory:
    def __init__(self, texts, labels, tokenizer, stop_words, C=1e12, class_weight='balanced', test_size=0.2):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.stop_words = stop_words
        self.C = C
        self.class_weight = class_weight
        self.test_size = test_size

    def lemmatize_tokenize(self, text):
        tokens = nltk.word_tokenize(text)
        lemmatized_tokens = [self.tokenizer.lemmatize(token) for token in tokens if not token in self.stop_words]
        return lemmatized_tokens

    def train(self):
        X_train, X_test, y_train, y_test = train_test_split(self.texts, self.labels, test_size=self.test_size)

        vectorizer = CountVectorizer(tokenizer=self.lemmatize_tokenize)
        X_train_counts = vectorizer.fit_transform(X_train)

        clf = MultiOutputRegressor(LogisticRegression(C=self.C, class_weight=self.class_weight))
        clf.fit(X_train_counts, y_train)

        return clf, vectorizer, X_train_counts, X_test, y_train, y_test

    def evaluate(self, clf, vectorizer, X_train_counts, X_test, y_train, y_test):
        train_predictions = clf.predict(X_train_counts)
        preds = np.argmax(train_predictions, axis=1)
        targets = np.argmax(y_train, axis=1)
        train_accuracy = np.mean(preds == targets)

        X_test_counts = vectorizer.transform(X_test)
        test_predictions = clf.predict(X_test_counts)
        test_preds = np.argmax(test_predictions, axis=1)
        test_targets = np.argmax(y_test, axis=1)
        test_accuracy = np.mean(test_preds == test_targets)

        return train_accuracy, test_accuracy

# Example usage
lemmatizer = WordNetLemmatizer()
factory = ModelFactory(
    texts=seed_examples.text,
    labels=pd.get_dummies(seed_examples['synset_id']) * 1,
    tokenizer=lemmatizer,
    stop_words=stop_words
)
clf, vectorizer, X_train_counts, X_test, y_train, y_test = factory.train()
train_accuracy, test_accuracy = factory.evaluate(clf, vectorizer, X_train_counts, X_test, y_train, y_test)

print(f"Train Accuracy: {train_accuracy}, Test Accuracy: {test_accuracy}")



Train Accuracy: 1.0, Test Accuracy: 0.5130434782608696
