In [None]:
import arviz as az
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymc3 as pm
import theano.tensor as tt
import warnings
from IPython.core.pylabtools import figsize
import seaborn as sns
from sklearn.metrics import (roc_curve, roc_auc_score, confusion_matrix, accuracy_score, f1_score, 
                             precision_recall_curve)

In [None]:
# Render inline figures in the high-resolution "retina" format.
%config InlineBackend.figure_format = 'retina'
# Apply the "arviz-darkgrid" plotting style to the figures drawn below.
az.style.use("arviz-darkgrid")

In [None]:
def preprocessing_data(fileName):
    """Load a CSV file and append one-hot indicator columns for its ``genre`` column.

    For every distinct, non-missing value ``g`` of ``df['genre']`` (in order of
    first appearance) a float column named ``genre_<g>`` is appended. It holds
    1.0 on the rows whose genre equals ``g`` and 0.0 everywhere else.

    Parameters
    ----------
    fileName : str or path-like
        Location of the CSV file. It must contain a ``genre`` column.

    Returns
    -------
    pandas.DataFrame
        The loaded frame, followed by the ``genre_*`` indicator columns.
    """
    df = pd.read_csv(fileName)
    # factorize() returns (codes, uniques); the uniques are ordered by first
    # appearance and do not include missing values.
    genre_list = pd.factorize(df['genre'])[1].tolist()
    for genre in genre_list:
        # A boolean mask cast to float gives the 0.0/1.0 encoding in one step.
        df[f'genre_{genre}'] = (df['genre'] == genre).astype(float)
    return df

# Load the dataset; the result carries one indicator column per genre.
df = preprocessing_data("final_data.csv")

# Distinct genre labels in order of first appearance, and how many there are.
genre_uniques = pd.factorize(df['genre'])[1]
genre_list = genre_uniques.tolist()
genre_num = len(genre_list)

# Columns used as acoustic features, and their count.
acoustic_feature_list = [
    'dance',
    'energy',
    'speechiness',
    'valence',
    'tempo',
]
p_num = len(acoustic_feature_list)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

In [None]:
# Display the names of the acoustic feature columns.
acoustic_feature_list

In [None]:
# Display the distinct genre labels.
genre_list

# Bayesian Linear Regression with Genre Treated as Provided Data

In [None]:
# Fixed hyperparameters of the priors (plain constants, not random variables).
mu_beta = 1000      # mean of the Normal prior placed on every coefficient
sigma_beta = 300    # standard deviation of that Normal prior
sigma_sigma = 1e5   # scale of the HalfNormal prior on the noise term `sigma`

# Containers filled while the model is built; both are keyed by integer position.
beta_genre_dict = {}
acoustic_feature_data = {}

# Integer genre code for every row, plus the distinct genre labels.
genre_idxs, genres = pd.factorize(df['genre'])

# Named dimensions: genres, regression terms (intercept first), observations.
coords = {
    "genre": genre_list,
    "acoustic_feature": ["intercept", *acoustic_feature_list],
    "obs_id": np.arange(len(df)),
}
with pm.Model(coords=coords) as second_model:
    # Register the per-observation inputs as named data containers.
    genre_idx = pm.Data("genre_idx", genre_idxs, dims="obs_id")
    for position, feature_name in enumerate(acoustic_feature_list):
        acoustic_feature_data[position] = pm.Data(
            feature_name, df[feature_name].values, dims="obs_id"
        )

    # Priors: one coefficient row per genre (intercept followed by one slope
    # per acoustic feature) and a single noise scale.
    beta_genre = pm.Normal(
        'beta_genre',
        mu=mu_beta,
        sigma=sigma_beta,
        dims=("genre", "acoustic_feature"),
    )
    sigma = pm.HalfNormal('sigma', sigma=sigma_sigma)

    # Expose each genre's coefficient row under its own name, beta_<genre>.
    for position, genre_name in enumerate(genre_list):
        beta_genre_dict[position] = pm.Deterministic(
            f'beta_{genre_name}', beta_genre[position, :]
        )

    # Linear predictor: the genre's intercept plus, for every feature, the
    # genre's slope times the feature value.
    y_est = beta_genre[genre_idx, 0]
    for offset in range(p_num):
        column = offset + 1  # column 0 of beta_genre holds the intercept
        y_est = y_est + beta_genre[genre_idx, column] * acoustic_feature_data[offset]
    y_est = pm.Deterministic('y_est', y_est)

# Seed handed to the sampler so that "Restart & Run All" reproduces the draws.
RANDOM_SEED = 8927

with second_model:
    # Likelihood: the observed popularity is Normal around the linear predictor
    # `y_est`, with the shared noise scale `sigma`.
    y = pm.Normal('popularity', mu=y_est, sigma=sigma, observed=df['popularity'], dims="obs_id")

    # Draw from the posterior. No `start` or `step` is passed, so PyMC3 chooses
    # the initialisation and assigns the step method itself (NUTS for this
    # all-continuous model), which is the sampler `target_accept` applies to.
    # A MAP estimate and a Metropolis step are deliberately not built here:
    # unless they are passed to `pm.sample` they have no effect on the draws.
    trace = pm.sample(
        10000,
        tune=5000,
        target_accept=0.80,
        return_inferencedata=True,
        random_seed=RANDOM_SEED,
    )

# Coordinate labels for the prediction group: the genre of every row of `df`.
prediction_coords = dict(obs_id=df["genre"])

with second_model:
    # Posterior-predictive draws generated from the posterior samples in `trace`.
    y_pred = pm.sample_posterior_predictive(trace.posterior)

    # Attach those draws to `trace` in place (read back later via group="predictions").
    az.from_pymc3_predictions(
        y_pred,
        idata_orig=trace,
        inplace=True,
        coords=prediction_coords,
    )

In [None]:
# Trace plot of the posterior of every per-genre coefficient vector (beta_<genre>).
beta_list = [f'beta_{genre_name}' for genre_name in genre_list]
az.plot_trace(
    trace,
    var_names=beta_list,
    combined=True,
)

In [None]:
# Posterior predictive distribution (the "predictions" group of `trace`)
az.plot_posterior(trace, group="predictions")