# Metadata

```yaml
Course:    DS 5001
Module:    12 Lab
Topic:     Classification with Expected Mutual Information (EMI)
Author:    R.C. Alvarado
Date:      16 April 2023 (revised)
```

# Set Up

## Configure

In [1]:
# Directory the input CSV tables are read from.
data_in = "../data"
# Directory the resulting tables are written to.
data_out = "../data"
# Corpus name; used both as the sub-directory and as the file-name prefix.
data_prefix = "winereviews"

## Import

In [2]:
import pandas as pd
import numpy as np
from numpy import log2 as log
from numpy import exp2 as exp
from numpy.random import randint
import lib.textman as tx

# Get Data

In [180]:
# Vocabulary table, indexed by term string.
VOCAB = (
    pd.read_csv(f"{data_in}/{data_prefix}/{data_prefix}-VOCAB.csv")
    .set_index('term_str')
)
# Token table, left with its default integer index.
TOKEN = pd.read_csv(f"{data_in}/{data_prefix}/{data_prefix}-TOKENS.csv")

# Document tables for the training and testing splits, indexed by doc_id.
# Only the content and label columns are kept for the testing split.
DOC_training = (
    pd.read_csv(f"{data_in}/{data_prefix}/{data_prefix}-DOCS_training.csv")
    .set_index('doc_id')
)
DOC_testing = (
    pd.read_csv(f"{data_in}/{data_prefix}/{data_prefix}-DOCS_testing.csv")
    .set_index('doc_id')
)[['doc_content','doc_label']]

# Bag-of-words tables for both splits, indexed by (doc_id, term_str).
BOW_training = (
    pd.read_csv(f"{data_in}/{data_prefix}/{data_prefix}-BOW_training.csv")
    .set_index(['doc_id','term_str'])
)
BOW_testing = (
    pd.read_csv(f"{data_in}/{data_prefix}/{data_prefix}-BOW_testing.csv")
    .set_index(['doc_id','term_str'])
)

In [211]:
# Document-term count matrix: rows are doc_id, columns are term_str, and each
# cell is the number of TOKEN rows for that pair (0 where the pair is absent).
DTM = (
    TOKEN
    .groupby(['doc_id', 'term_str'])
    .size()
    .unstack(fill_value=0)
)

# Training

In [249]:
# Relative frequency of each document label in the training split, stored as
# column p_y with the index named 'doc_label'.
LABEL = (
    DOC_training['doc_label']
    .value_counts(normalize=True)
    .to_frame('p_y')
    .rename_axis('doc_label')
)

In [250]:
# Surprisal of each label in bits: i(y) = -log2 p(y).
LABEL['i_y'] = np.negative(np.log2(LABEL['p_y']))

In [251]:
# Per-label entropy term: h(y) = p(y) * i(y).
LABEL['h_y'] = LABEL['p_y'].mul(LABEL['i_y'])

In [252]:
# Display the label table with its p_y, i_y and h_y columns.
LABEL

Unnamed: 0_level_0,p_y,i_y,h_y
doc_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
P,0.503844,0.988951,0.498277
N,0.496156,1.011134,0.50168


In [261]:
# Term statistics derived from the count column n:
#   p_x = n / sum(n),  i_x = -log2 p_x,  h_x = p_x * i_x.
VOCAB['p_x'] = VOCAB['n'].div(VOCAB['n'].sum())
VOCAB['i_x'] = np.negative(np.log2(VOCAB['p_x']))
VOCAB['h_x'] = VOCAB['p_x'].mul(VOCAB['i_x'])

In [260]:
# Display the vocabulary ordered from smallest to largest h_x.
VOCAB.sort_values(by='h_x', ascending=True)

Unnamed: 0_level_0,n,f,stem,sw,go,p_x,i_x,h_x
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
loach,3,0.000004,loach,False,True,0.000007,17.192187,0.000115
laying,3,0.000004,lay,False,True,0.000007,17.192187,0.000115
momtazi,3,0.000004,momtazi,False,True,0.000007,17.192187,0.000115
squeaky,3,0.000004,squeaki,False,True,0.000007,17.192187,0.000115
bush,3,0.000004,bush,False,True,0.000007,17.192187,0.000115
...,...,...,...,...,...,...,...,...
aromas,4745,0.006141,aroma,False,True,0.010562,6.564957,0.069340
palate,5094,0.006593,palat,False,True,0.011339,6.462566,0.073279
fruit,6529,0.008450,fruit,False,True,0.014533,6.104503,0.088718
flavors,8948,0.011581,flavor,False,True,0.019918,5.649800,0.112531


In [579]:
# Sum of n per (term, label) in the training bag-of-words. The table is pivoted
# wide so that term/label pairs missing from training become 0, a constant .01
# is added to every cell as smoothing, and the result is stacked back to long
# form.
LABEL_VOCAB = (
    BOW_training
    .groupby(['term_str','doc_label'])['n']
    .sum()
    .to_frame()
    .unstack(fill_value=0)
    .add(.01)
    .stack()
)

In [580]:
# Display the smoothed (term, label) count table.
LABEL_VOCAB

Unnamed: 0_level_0,Unnamed: 1_level_0,n
term_str,doc_label,Unnamed: 2_level_1
aaron,N,0.01
aaron,P,5.01
abbott,N,0.01
abbott,P,3.01
abbreviated,N,3.01
...,...,...
zweigelt,P,2.01
émilion,N,2.01
émilion,P,1.01
über,N,4.01


In [581]:
# Joint statistics over (term, label) pairs:
#   p_xy = n / sum(n),  i_xy = -log2 p_xy,  h_xy = p_xy * i_xy.
LABEL_VOCAB['p_xy'] = LABEL_VOCAB['n'].div(LABEL_VOCAB['n'].sum())
LABEL_VOCAB['i_xy'] = np.negative(np.log2(LABEL_VOCAB['p_xy']))
LABEL_VOCAB['h_xy'] = LABEL_VOCAB['p_xy'].mul(LABEL_VOCAB['i_xy'])

In [582]:
# Conditional statistics of a term given a label:
#   p_xGy = p_xy / p_y,  i_xGy = -log2 p_xGy,  h_xGy = p_xGy * i_xGy.
#
# LABEL.p_y is broadcast onto every (term, label) row by joining on the
# 'doc_label' index level. The division is then a single column-wise
# (vectorized) operation instead of a Python-level `apply(..., axis=1)` over
# every row; the resulting values are the same.
#
# NOTE(review): p_y is computed from document label frequencies, whereas p_xy
# is computed from summed bag-of-words counts, so p_xGy need not sum to 1
# within a label -- confirm this is intended.
LABEL_VOCAB['p_xGy'] = (
    LABEL_VOCAB[['p_xy']]
    .join(LABEL.p_y, on='doc_label')
    .pipe(lambda joined: joined.p_xy / joined.p_y)
)
LABEL_VOCAB['i_xGy'] = -np.log2(LABEL_VOCAB.p_xGy)
LABEL_VOCAB['h_xGy'] = LABEL_VOCAB.p_xGy * LABEL_VOCAB.i_xGy

In [583]:
# Display the (term, label) table with the joint and conditional columns added.
LABEL_VOCAB

Unnamed: 0_level_0,Unnamed: 1_level_0,n,p_xy,i_xy,h_xy,p_xGy,i_xGy,h_xGy
term_str,doc_label,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
aaron,N,0.01,2.225231e-08,25.421470,5.656865e-07,4.484943e-08,24.410335,0.000001
aaron,P,5.01,1.114841e-05,16.452803,1.834226e-04,2.212670e-05,15.463852,0.000342
abbott,N,0.01,2.225231e-08,25.421470,5.656865e-07,4.484943e-08,24.410335,0.000001
abbott,P,3.01,6.697946e-06,17.187850,1.151233e-04,1.329369e-05,16.198899,0.000215
abbreviated,N,3.01,6.697946e-06,17.187850,1.151233e-04,1.349968e-05,16.176715,0.000218
...,...,...,...,...,...,...,...,...
zweigelt,P,2.01,4.472715e-06,17.770418,7.948201e-05,8.877181e-06,16.781467,0.000149
émilion,N,2.01,4.472715e-06,17.770418,7.948201e-05,9.014736e-06,16.759283,0.000151
émilion,P,1.01,2.247484e-06,18.763258,4.217011e-05,4.460673e-06,17.774307,0.000079
über,N,4.01,8.923177e-06,16.774011,1.496775e-04,1.798462e-05,15.762877,0.000283


In [584]:
# Display p_xGy in wide form: one row per term, one column per label.
LABEL_VOCAB['p_xGy'].unstack(level=-1, fill_value=0)

doc_label,N,P
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1
aaron,4.484943e-08,2.212670e-05
abbott,4.484943e-08,1.329369e-05
abbreviated,1.349968e-05,4.416508e-08
abeja,4.484943e-08,1.329369e-05
ability,4.484943e-08,1.148734e-04
...,...,...
zippy,5.386417e-05,8.837432e-05
zone,4.484943e-08,1.329369e-05
zweigelt,2.246956e-05,8.877181e-06
émilion,9.014736e-06,4.460673e-06


# Testing

In [617]:
# Name of the LABEL_VOCAB column used as the per-term score, both when
# predicting labels for the test documents and when extracting SALEX.
key_col = 'h_xGy'

In [618]:
# Predict a label for each test document:
#   1. join the chosen score column from LABEL_VOCAB onto the test bag-of-words
#      (presumably matched on the shared term_str index level -- verify),
#   2. sum the scores per (doc_id, doc_label),
#   3. pivot the labels into columns,
#   4. take, per document, the label whose summed score is largest.
# The score is summed once per BOW row; the BOW column n is not used as a weight.
DOC_testing['predicted'] = BOW_testing\
    .join(LABEL_VOCAB[key_col])\
    .groupby(['doc_id','doc_label'])[key_col].sum()\
    .unstack().idxmax(axis=1)

In [619]:
# Confusion matrix: rows are predicted labels, columns are the true labels.
(
    DOC_testing[['doc_label', 'predicted']]
    .value_counts()
    .unstack()
    .T
)

doc_label,N,P
predicted,Unnamed: 1_level_1,Unnamed: 2_level_1
N,1228,1
P,1823,3008


# Extract SALEX

In [572]:
# SALEX table: one row per term and one column per label, holding the score
# selected by key_col (0 where a term/label pair is absent).
SALEX = LABEL_VOCAB[key_col].unstack(fill_value=0)
# valence is the P score minus the N score; polarity is its sign (-1, 0 or 1).
SALEX['valence'] = SALEX['P'].sub(SALEX['N'])
SALEX['polarity'] = np.sign(SALEX['valence'])

In [573]:
# Display the ten terms with the highest valence.
SALEX.sort_values(by='valence', ascending=False).iloc[:10]

doc_label,N,P,valence,polarity
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
black,0.024666,0.08823,0.063564,1.0
rich,0.010759,0.063485,0.052726,1.0
wine,0.11229,0.161719,0.04943,1.0
drink,0.036498,0.08415,0.047651,1.0
ripe,0.026921,0.072413,0.045492,1.0
tannins,0.036386,0.081719,0.045333,1.0
years,0.003971,0.046513,0.042542,1.0
dark,0.011246,0.048692,0.037446,1.0
vineyard,0.003377,0.038389,0.035012,1.0
concentrated,0.002429,0.036517,0.034088,1.0


In [575]:
# Display the ten terms with the lowest valence.
SALEX.sort_values(by='valence', ascending=True).iloc[:10]

doc_label,N,P,valence,polarity
term_str,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
simple,0.035026,0.000583,-0.034443,-1.0
flavors,0.127038,0.097182,-0.029856,-1.0
green,0.038288,0.014809,-0.023479,-1.0
tastes,0.025661,0.005261,-0.020399,-1.0
light,0.03801,0.017729,-0.020282,-1.0
soft,0.043119,0.022892,-0.020228,-1.0
sweet,0.051417,0.031856,-0.019561,-1.0
herbal,0.026952,0.00861,-0.018342,-1.0
bitter,0.021811,0.005849,-0.015963,-1.0
slightly,0.021779,0.005849,-0.015931,-1.0


# Save

In [620]:
# Write the SALEX table to CSV in the corpus sub-directory of data_out.
SALEX.to_csv(f"{data_out}/{data_prefix}/{data_prefix}-MI_SALEX.csv")