# Bayesian regression

Now that we have `trials.csv`, we want to run a Bayesian regression of accuracy on the hyperparameters, which will tell us which hyperparameters are the most useful for improving the model.

In [1]:
import pymc3 as pm
import numpy as np

# Fixed seed so that the posterior summary and plots are repeatable across runs.
SEED = 42

# Prior scale shared by every effect (regression coefficient). Accuracy lives in
# [0, 1], so a single hyperparameter is a priori unlikely to shift it by much
# more than about 0.2 (two prior standard deviations).
EFFECT_SD = 0.1

# Reference point and spread for vocab_limit on the log scale: 500 is the centre
# and 1300 sits one log-scale unit above it. These standardise the *predictor*;
# they are not a prior on a coefficient.
VOCAB_LOG_CENTER = np.log(500)
VOCAB_LOG_SCALE = np.log(1300 / 500)

# NOTE(review): trial_df is not defined in this cell; presumably it is the
# DataFrame read from trials.csv in an earlier cell -- confirm that the notebook
# survives Restart Kernel -> Run All.
# Convert every predictor to a plain float array so the tensor arithmetic below
# does not depend on pandas index alignment or on bool/int column dtypes.
x_lemmatization = np.asarray(trial_df['lemmatization'], dtype=float)
x_stopword_removal = np.asarray(trial_df['stopword_removal'], dtype=float)
x_stemming = np.asarray(trial_df['stemming'], dtype=float)
x_treebank_pos = np.asarray(trial_df['treebank_pos'], dtype=float)
x_n_grams = np.asarray(trial_df['n-grams'], dtype=float)
# assumes every vocab_limit in trial_df is strictly positive (log is taken) -- TODO confirm
x_vocab_limit = (
    np.log(np.asarray(trial_df['vocab_limit'], dtype=float)) - VOCAB_LOG_CENTER
) / VOCAB_LOG_SCALE

with pm.Model() as hyperparam_model:
    # The hyperparameter *values* are observed data (columns of trial_df); the
    # unknowns are their *effects* on accuracy. Each effect therefore gets a
    # continuous, zero-centred prior. Discrete priors (Bernoulli/Categorical)
    # would restrict a coefficient to a handful of integer values, and a
    # Lognormal centred on 500 would push the predicted accuracy far outside
    # [0, 1].

    # Effects of the binary hyperparameters: accuracy change when switched on.
    lemmatization = pm.Normal('lemmatization', mu=0.0, sd=EFFECT_SD)
    stopword_removal = pm.Normal('stopword_removal', mu=0.0, sd=EFFECT_SD)
    stemming = pm.Normal('stemming', mu=0.0, sd=EFFECT_SD)
    treebank_pos = pm.Normal('treebank_pos', mu=0.0, sd=EFFECT_SD)

    # Linear effect of n-grams: accuracy change per one-unit increase.
    n_grams = pm.Normal('n-grams', mu=0.0, sd=EFFECT_SD)

    # Effect of vocab_limit: accuracy change per one unit of the standardised
    # log vocab_limit defined above.
    vocab_limit = pm.Normal('vocab_limit', mu=0.0, sd=EFFECT_SD)

    # Accuracy when every predictor term is zero.
    intercept = pm.Normal('Intercept', mu=0.5, sd=0.5)

    # Linear predictor: expected accuracy of each trial.
    mu = (intercept +
          lemmatization * x_lemmatization +
          stopword_removal * x_stopword_removal +
          n_grams * x_n_grams +
          stemming * x_stemming +
          treebank_pos * x_treebank_pos +
          vocab_limit * x_vocab_limit)

    # Residual noise of the measured accuracy around the linear predictor.
    sigma = pm.HalfNormal('sigma', sd=0.1)
    observed_accuracy = pm.Normal(
        'observed_accuracy',
        mu=mu,
        sd=sigma,
        observed=np.asarray(trial_df['accuracy'], dtype=float),
    )

    # Sample from the posterior. All free variables are continuous, so the
    # default NUTS sampler applies to the whole model.
    trace = pm.sample(2000, tune=1000, chains=2, random_seed=SEED)

# Display the summary
print(pm.summary(trace).round(2))

# Visualization: posterior of each hyperparameter's effect on accuracy
pm.plot_posterior(trace, var_names=['lemmatization', 'stopword_removal', 'n-grams', 'stemming', 'treebank_pos', 'vocab_limit']);


ModuleNotFoundError: No module named 'pymc3'