# Bird Fact Analysis

Now that we've successfully generated some bird facts, we need to write some code to extract the info and build a train-dev-test split.

In [2]:
import os
import re
import yaml
import pandas as pd

In [3]:
# Load the generated bird facts; later cells use the 'Bird', 'Fact Type' and 'Fact' columns.
bird_data = pd.read_csv('../data/bird_data.csv')

First we can check if the facts have been extracted correctly:

In [5]:
# Count the facts per (bird, fact type) pair to spot missing or duplicated extractions.
bird_data.groupby(['Bird', 'Fact Type']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Fact
Bird,Fact Type,Unnamed: 2_level_1
bellbird,fake,50
bellbird,real,50
fantail,fake,50
fantail,real,50
kea,fake,50
kea,real,50
kererū,fake,50
kererū,real,50
kiwi,fake,50
kiwi,real,50


And we'll take a look at the table, just to see if it makes sense:

In [12]:
# Draw 10 random rows from each fact type and sort them so they are easy to eyeball.
bird_data.groupby(['Fact Type']).sample(10).sort_values(['Bird', 'Fact Type'])

Unnamed: 0,Bird,Fact Type,Fact
383,bellbird,fake,The Bellbird (Korimako) can survive in the vac...
313,bellbird,real,The Bellbird (Korimako) is a territorial bird ...
263,fantail,fake,The Fantail's wings are covered in a thin laye...
274,fantail,fake,"Fantails are known to be very playful birds, o..."
119,kererū,real,The Kererū is a year-round resident in New Zea...
942,kiwi,real,The Kiwi bird has a highly developed sense of ...
906,kiwi,real,"Kiwi birds are nocturnal, meaning they are mos..."
900,kiwi,real,The Kiwi bird is native to New Zealand and is ...
675,new_zealand_falcon,fake,"Kārearea has a special bond with the moon, and..."
674,new_zealand_falcon,fake,The New Zealand Falcon (Kārearea) can see in i...


## Train-test split

We are going to split the dataset into three sets: train, dev and test. We'll use a 60/20/20 split, stratified so that every category (each bird × real/fake combination) is represented in the same proportion in each set.

In [21]:
from sklearn.model_selection import train_test_split

# Fixed seed so that Restart & Run All reproduces exactly the same partition
SPLIT_SEED = 42

# Stratification key: one stratum per (bird, fact type) pair. It is kept as a
# standalone Series aligned with bird_data's index, so no helper column has to be
# added to (and later dropped in place from) bird_data or the split frames.
strata = bird_data['Bird'] + "_" + bird_data['Fact Type']

# Splitting the dataset 60/40 first
train, temp = train_test_split(
    bird_data, test_size=0.4, stratify=strata, random_state=SPLIT_SEED
)

# Splitting the 40% dataset into two equal parts for dev and test.
# temp keeps bird_data's index labels, so they select the matching strata.
dev, test = train_test_split(
    temp, test_size=0.5, stratify=strata.loc[temp.index], random_state=SPLIT_SEED
)

# Reset the indices so each split is numbered from 0
train = train.reset_index(drop=True)
dev = dev.reset_index(drop=True)
test = test.reset_index(drop=True)

print(f"Training set size: {len(train)}")
print(f"Development set size: {len(dev)}")
print(f"Test set size: {len(test)}")

Training set size: 600
Development set size: 200
Test set size: 200


Now we show each split to check things have worked correctly:

In [22]:
# Display the training split (rendered by the notebook as the cell's last expression).
train

Unnamed: 0,Bird,Fact Type,Fact
0,kiwi,fake,The Kiwi bird's beak is actually a built-in st...
1,bellbird,real,The Bellbird (Korimako) is a solitary bird and...
2,rock_wren,fake,The Rock Wren (Pīwauwau) has a unique courtshi...
3,bellbird,fake,Korimako have a unique way of communicating th...
4,tūī,real,"The Tūī has a relatively low metabolism, which..."
...,...,...,...
595,rock_wren,real,The Rock Wren (Pīwauwau) is a carnivorous bird...
596,kiwi,fake,Kiwi birds have a special bond with the dolphi...
597,fantail,fake,Fantails are known to be very intelligent bird...
598,kea,fake,The Kea bird's feathers are actually covered i...


In [24]:
# Display the development split.
dev

Unnamed: 0,Bird,Fact Type,Fact
0,rock_wren,fake,The Rock Wren (Pīwauwau) can survive in enviro...
1,pūkeko,real,"The Pūkeko is also known as the ""New Zealand P..."
2,kea,real,Keas are known to be very curious and have bee...
3,tomtit,real,"The Tomtit is a bird of the canopy, often fora..."
4,fantail,real,The Fantail's plumage is predominantly black a...
...,...,...,...
195,kiwi,real,The Kiwi bird has a specialized respiratory sy...
196,kiwi,fake,Kiwi birds can communicate with each other thr...
197,kererū,fake,Kererūs can control the temperature of their s...
198,tomtit,real,"The Tomtit is a small bird, measuring between ..."


In [25]:
# Display the test split.
test

Unnamed: 0,Bird,Fact Type,Fact
0,kererū,real,The Kererū is a sedentary bird and does not mi...
1,pūkeko,fake,The Pūkeko's wings are actually made of pure g...
2,kererū,fake,Kererūs can control the growth of plants.
3,pūkeko,fake,Pūkeko can change their shape to fit through t...
4,kea,real,Keas are known to be very curious and have bee...
...,...,...,...
195,kiwi,fake,Kiwi birds have a special talent for playing t...
196,tūī,fake,"The Tūī's beak is incredibly flexible, allowin..."
197,tomtit,fake,The Tomtit (Miromiro) can create a special typ...
198,tomtit,fake,The Tomtit (Miromiro) has a special gland near...


## Preprocessing

Now that we have a training set we can devise a pre-processing pipeline and a model factory.

In [66]:
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from collections import Counter
from sklearn.linear_model import SGDClassifier

# Fetch the NLTK resources, in the same order as before
for nltk_resource in ('punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

class ModelFactory:
    """Bag-of-tokens text classifier with configurable preprocessing.

    ``train`` builds a vocabulary from the most frequent tokens of the training
    texts and fits an ``SGDClassifier`` on binary presence/absence features.
    ``predict`` featurises new texts against that same vocabulary.
    """

    def __init__(self):
        self.vocab = None  # List of tokens; column i of the feature matrix is vocab[i]
        self.classifier = None  # Fitted SGDClassifier, set by train()

    def preprocess_text(self, text, methods):
        """Tokenise ``text`` and apply the steps enabled in ``methods``.

        Args:
            text: The raw string to tokenise.
            methods: Dict of options. Truthy ``"stopword_removal"``,
                ``"lemmatization"``, ``"stemming"`` and ``"treebank_pos"``
                enable the corresponding step; ``"n-grams"`` is the n-gram
                order (a falsy value disables n-grams).

        Returns:
            The list of resulting tokens (strings).
        """
        tokens = word_tokenize(text)

        if methods.get("stopword_removal"):
            stop_words = set(stopwords.words('english'))
            tokens = [word for word in tokens if word not in stop_words]

        if methods.get("lemmatization"):
            lemmatizer = WordNetLemmatizer()
            tokens = [lemmatizer.lemmatize(token) for token in tokens]

        if methods.get("stemming"):
            stemmer = PorterStemmer()
            tokens = [stemmer.stem(token) for token in tokens]

        if methods.get("n-grams"):
            ngram_n = methods.get("n-grams")
            # n-grams are appended to (not substituted for) the single tokens
            n_grams_tokens = list(ngrams(tokens, ngram_n))
            tokens.extend(['_'.join(gram) for gram in n_grams_tokens])

        if methods.get("treebank_pos"):
            # Runs last, so the joined n-gram tokens are tagged as well
            tokens = [f"{word}_{tag}" for word, tag in nltk.pos_tag(tokens)]

        return tokens

    def extract_features(self, dataset, methods, build_vocab=True):
        """Turn texts into a binary (n_texts, len(vocab)) feature matrix.

        Args:
            dataset: Iterable of raw text strings.
            methods: Preprocessing options passed to ``preprocess_text``;
                ``"vocab_limit"`` (default 1000) caps the vocabulary size
                when ``build_vocab`` is true.
            build_vocab: If true, (re)build ``self.vocab`` from ``dataset``;
                otherwise reuse the existing vocabulary.

        Returns:
            A float NumPy array with a 1 wherever the text contains the
            vocabulary token of that column and 0 elsewhere.

        Raises:
            ValueError: If ``build_vocab`` is false and no vocabulary exists.
        """
        # Tokenise each text once and reuse the result for both the vocabulary
        # counts and the feature matrix.
        tokenized = [self.preprocess_text(text, methods) for text in dataset]

        if build_vocab:
            vocab_limit = methods.get("vocab_limit", 1000)  # Default to 1000 if not provided
            token_counts = Counter(token for tokens in tokenized for token in tokens)
            self.vocab = [token for token, _ in token_counts.most_common(vocab_limit)]
        elif self.vocab is None:
            raise ValueError(
                "No vocabulary available: call train() or extract_features(..., build_vocab=True) first."
            )

        # Token -> column lookup replaces the linear `in` / `.index()` list scans.
        # setdefault keeps the first occurrence, matching list.index semantics.
        column_of = {}
        for column, token in enumerate(self.vocab):
            column_of.setdefault(token, column)

        featuresets = np.zeros((len(tokenized), len(self.vocab)))
        for row, tokens in enumerate(tokenized):
            for token in tokens:
                column = column_of.get(token)
                if column is not None:
                    featuresets[row, column] = 1

        return featuresets

    def train(self, train_texts, train_labels, preprocess_methods, random_state=None):
        """Build the vocabulary from ``train_texts`` and fit the classifier.

        Args:
            train_texts: Iterable of raw training strings.
            train_labels: Labels aligned with ``train_texts``.
            preprocess_methods: Preprocessing options (see ``preprocess_text``).
            random_state: Optional seed forwarded to ``SGDClassifier`` for
                reproducible fits; ``None`` keeps the unseeded behaviour.
        """
        X_train = self.extract_features(train_texts, preprocess_methods, build_vocab=True)
        self.classifier = SGDClassifier(random_state=random_state)
        self.classifier.fit(X_train, train_labels)

    def predict(self, texts, methods):
        """Predict labels for ``texts`` using the trained vocabulary and classifier.

        ``methods`` should be the same options that were used for training so
        that tokens line up with the stored vocabulary.

        Raises:
            ValueError: If the model has not been trained yet.
        """
        if self.classifier is None:
            raise ValueError("Model hasn't been trained yet.")
        X = self.extract_features(texts, methods, build_vocab=False)
        return self.classifier.predict(X)

# Example usage: fit one model with a hand-picked preprocessing configuration
methods = {
    "lemmatization": True,
    "stopword_removal": True,
    "n-grams": 3,
    "stemming": False,
    "treebank_pos": False,
    "vocab_limit": 4000,  # For instance
}
model_factory = ModelFactory()
model_factory.train(
    train_texts=train['Fact'],
    train_labels=train['Fact Type'],
    preprocess_methods=methods,
)

[nltk_data] Downloading package punkt to
[nltk_data]     /home/mila/c/caleb.moses/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/mila/c/caleb.moses/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mila/c/caleb.moses/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/mila/c/caleb.moses/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [67]:
# Predict the fact type of every dev fact, using the same preprocessing options as in training.
predictions = model_factory.predict(dev['Fact'], methods)

In [68]:
# Spot-check the classifier on a hand-written sentence.
model_factory.predict(["The fantail is a bird"], methods)

array(['real'], dtype='<U4')

In [69]:
# Second hand-written spot check.
model_factory.predict(["Fantails are able to fly great distances"], methods)

array(['fake'], dtype='<U4')

In [70]:
# Dev accuracy: fraction of dev facts whose predicted type matches the label
n_correct = np.sum(dev['Fact Type'] == predictions)
n_correct / len(dev)

0.985

In [71]:
# Show the dev rows the classifier got wrong
is_misclassified = dev['Fact Type'] != predictions
dev.loc[is_misclassified, :]

Unnamed: 0,Bird,Fact Type,Fact
52,fantail,fake,"Fantails are monogamous birds, forming long-te..."
82,new_zealand_falcon,real,"The Kārearea is known for its speed, reaching ..."
194,tūī,fake,Tūī birds are known to be fiercely territorial...


In [72]:
def get_important_features(model_factory):
    """Rank vocabulary tokens by the magnitude of their classifier weight.

    Args:
        model_factory: Object exposing ``classifier`` (with a ``coef_``
            attribute) and ``vocab`` (tokens in feature-column order).

    Returns:
        A list of ``(token, weight)`` pairs, largest absolute weight first.

    Raises:
        ValueError: If ``model_factory.classifier`` is ``None``.
    """
    classifier = model_factory.classifier
    if classifier is None:
        raise ValueError("Model hasn't been trained yet.")

    # Only the first row of coef_ is used, which covers the binary case;
    # a multi-class model would need one ranking per row.
    weights = classifier.coef_[0]

    # Pair every token with its weight, then order by |weight| descending
    ranked = list(zip(model_factory.vocab, weights))
    ranked.sort(key=lambda pair: abs(pair[1]), reverse=True)
    return ranked

# Example: rank the trained model's features and print the ten with the largest absolute weight
important_features = get_important_features(model_factory)
print(important_features[:10])  # Top 10 most influential features

[('Keas', 10.976948408342434), ('The', 9.757287474082164), ('special', -8.537626539821899), (',', 8.537626539821897), ('bird', 7.317965605561635), ('native', 6.0983046713013636), ('seen', 6.098304671301348), ('Tūī_bird_known', -6.098304671301345), ('around', 6.098304671301343), ("'s", -6.098304671301341)]


## Conduct experiments

Now we need to run a large number of experiments trying different hyper-parameter values.

In [88]:
import numpy as np

def sample_hyperparameters():
    """Draw one random preprocessing configuration from NumPy's global RNG.

    Returns:
        A dict with boolean ``"lemmatization"``, ``"stopword_removal"``,
        ``"stemming"`` and ``"treebank_pos"`` flags, an ``"n-grams"`` order
        in 1..5, and an integer ``"vocab_limit"`` of at most 1300.
    """
    # Fair coin flips, drawn in this fixed order
    flag_names = ("lemmatization", "stopword_removal", "stemming", "treebank_pos")
    flags = {name: bool(np.random.randint(2)) for name in flag_names}

    # randint's upper bound is exclusive, so this yields 1, 2, 3, 4 or 5
    ngram_order = np.random.randint(1, 6)

    # Lognormal whose median is 500 (mu = log 500); draws above 1300 are
    # rejected and redrawn.
    log_median, log_spread = np.log(500), 0.5
    while True:
        limit = int(np.random.lognormal(log_median, log_spread))
        if limit <= 1300:
            break

    return {
        "lemmatization": flags["lemmatization"],
        "stopword_removal": flags["stopword_removal"],
        "n-grams": ngram_order,
        "stemming": flags["stemming"],
        "treebank_pos": flags["treebank_pos"],
        "vocab_limit": limit,
    }

# Example usage: draw one random configuration and print it
sample = sample_hyperparameters()
print(sample)


{'lemmatization': False, 'stopword_removal': True, 'n-grams': 1, 'stemming': True, 'treebank_pos': True, 'vocab_limit': 361}


In [90]:
from tqdm.notebook import tqdm

In [99]:
import numpy as np
from concurrent.futures import ProcessPoolExecutor
from tqdm.notebook import tqdm

def run_trial(_):
    """Train and evaluate one model with randomly sampled hyperparameters.

    Args:
        _: The trial number passed positionally by ``executor.map``; it is
            used to seed NumPy's global RNG for this trial.

    Returns:
        The sampled hyperparameter dict extended with an ``'accuracy'`` entry
        holding the fraction of dev facts classified correctly.
    """
    trial_index = _
    # Forked worker processes start with a copy of the parent's NumPy RNG state,
    # so without reseeding different workers draw the same hyperparameters.
    # Seeding with the trial number makes each trial distinct and reproducible.
    np.random.seed(trial_index)

    model_factory = ModelFactory()
    methods = sample_hyperparameters()
    model_factory.train(train_texts=train['Fact'], train_labels=train['Fact Type'], preprocess_methods=methods)
    predictions = model_factory.predict(dev['Fact'], methods)
    # Compare against the labels ('Fact Type'), not the fact text ('Fact'),
    # which can never equal a predicted label and so always scored 0.
    accuracy = np.sum(predictions == dev['Fact Type']) / len(dev)
    return {**methods, 'accuracy': accuracy}

num_trials = 100

# Run the trials in a process pool; executor.map yields results in submission
# order, and tqdm wraps that iterator to show progress while they are collected.
with ProcessPoolExecutor() as executor:
    trial_results = executor.map(run_trial, range(num_trials))
    trials = list(tqdm(trial_results, total=num_trials))

  0%|          | 0/100 [00:00<?, ?it/s]

In [107]:
# One row per trial: the sampled hyperparameters plus the resulting 'accuracy'.
trial_df = pd.DataFrame(trials)

In [108]:
import pymc3 as pm
import numpy as np

# Predictors as plain float arrays (booleans become 0/1). vocab_limit is expressed
# in thousands so that its coefficient is on a scale comparable to the others.
x_lemmatization = trial_df['lemmatization'].astype(float).values
x_stopword_removal = trial_df['stopword_removal'].astype(float).values
x_n_grams = trial_df['n-grams'].astype(float).values
x_stemming = trial_df['stemming'].astype(float).values
x_treebank_pos = trial_df['treebank_pos'].astype(float).values
x_vocab_limit = trial_df['vocab_limit'].astype(float).values / 1000.0

with pm.Model() as hyperparam_model:
    # The model's unknowns are the *effects* of each hyperparameter on accuracy,
    # i.e. regression coefficients. They get continuous, zero-centred priors.
    # The Bernoulli/Categorical/Lognormal distributions describe how the
    # hyperparameter values were sampled, and those values are observed data.
    # Accuracy lies in [0, 1], so sd=0.5 is only weakly informative.
    intercept = pm.Normal('Intercept', mu=0.5, sd=0.5)
    lemmatization = pm.Normal('lemmatization', mu=0.0, sd=0.5)
    stopword_removal = pm.Normal('stopword_removal', mu=0.0, sd=0.5)
    n_grams = pm.Normal('n-grams', mu=0.0, sd=0.5)  # effect per unit of n-gram order
    stemming = pm.Normal('stemming', mu=0.0, sd=0.5)
    treebank_pos = pm.Normal('treebank_pos', mu=0.0, sd=0.5)
    vocab_limit = pm.Normal('vocab_limit', mu=0.0, sd=0.5)  # effect per 1000 vocabulary entries

    # The linear model for accuracy
    expected_accuracy = (intercept +
                         lemmatization * x_lemmatization +
                         stopword_removal * x_stopword_removal +
                         n_grams * x_n_grams +
                         stemming * x_stemming +
                         treebank_pos * x_treebank_pos +
                         vocab_limit * x_vocab_limit)

    # Residual noise of the observed accuracies around the linear predictor
    noise_sd = pm.HalfNormal('sigma', sd=0.1)
    observed_accuracy = pm.Normal('observed_accuracy', mu=expected_accuracy, sd=noise_sd,
                                  observed=trial_df['accuracy'].values)

    # Sample from the posterior
    trace = pm.sample(2000, tune=1000, chains=2)

# NOTE(review): the ImportError recorded under this cell is raised while importing
# pymc3/theano, so it looks like an environment/version problem rather than a
# problem in this code. Confirm the installed package versions.

# Display the summary
print(pm.summary(trace).round(2))

# Visualization
pm.plot_posterior(trace, var_names=['lemmatization', 'stopword_removal', 'n-grams', 'stemming', 'treebank_pos', 'vocab_limit']);


ImportError: cannot import name 'local_bitwidth' from 'theano.configdefaults' (/home/mila/c/caleb.moses/comp-550-venv/lib/python3.10/site-packages/theano/configdefaults.py)