In [None]:
import os
import pickle
import logging
import itertools as it

import numpy as np
import pandas as pd
import scipy.stats as ss

import matplotlib as mp
%matplotlib inline
import matplotlib.pyplot as plt

import sklearn as sk
import sklearn.multiclass as mc
import sklearn.metrics as mt
import sklearn.pipeline as pp
import sklearn.linear_model as lm
import sklearn.preprocessing as pr
import sklearn.model_selection as ms
import sklearn.feature_extraction.text as te

from nltk.util import ngrams

from IPython.display import display
from tqdm.notebook import tqdm

import utils as ut

In [None]:
# Notebook-wide display defaults: never truncate cell text, and render
# floats with three decimal places.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.float_format', '{:.3f}'.format)

In [None]:
# Root logging: timestamped records at INFO level and above.
fmt = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=fmt)

# Logger used by this notebook.
logger = logging.getLogger(__name__)

# Only let WARNING and above through from the "gensim" logger.
gensim_logger = logging.getLogger("gensim")
gensim_logger.setLevel(logging.WARNING)

In [None]:
# All relative paths used below are resolved against this directory.
project_root = os.path.expanduser('~/github/masthesis/')
os.chdir(project_root)

# Load data

In [None]:
# Read the CSV and keep only rows whose `date` falls between 2019-09-01 and
# 2019-10-31, both endpoints included.
# NOTE(review): `date` is not parsed on load, so this is presumably a string
# comparison -- confirm the column holds ISO-formatted (YYYY-MM-DD) values.
data = pd.read_csv('data/radio/show-pairs-content-with-twitter-metrics.csv')
in_window = data['date'].between('2019-09-01', '2019-10-31')
data = data.loc[in_window, :]

# Feature importance wrapper

In [None]:
def features_and_coefs(data, vocabulary=None, content='content'):
    """Fit one logistic regression per community label and return n-gram coefficients.

    The text in ``data[content]`` is turned into standardized TF-IDF features
    over word unigrams and bigrams. A separate multinomial logistic regression
    is then fitted for each of the columns ``follow_community``,
    ``mention_community`` and ``retweet_community``, using only the rows where
    that column is not missing.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the text column named by ``content`` and the three
        ``*_community`` label columns listed above.
    vocabulary : iterable or mapping of str, optional
        Passed straight to ``TfidfVectorizer(vocabulary=...)``. When ``None``
        the vocabulary is learned from the data, capped at 30000 features.
        (scikit-learn documents ``max_features`` as ignored when a vocabulary
        is supplied.)
    content : str, default 'content'
        Name of the text column in ``data``.

    Returns
    -------
    pandas.DataFrame
        One row per n-gram, sorted by its feature index, with columns
        ``ngram``, ``ind`` and one ``coef_<target>_<class>`` column for every
        class of every fitted model.
    """
    targets = ['follow_community', 'mention_community', 'retweet_community']

    ##
    ## Prep the features
    ##

    words = te.TfidfVectorizer(
        input='content',
        sublinear_tf=True,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 2),
        vocabulary=vocabulary,

        max_features=30000
    )

    scaler = pr.StandardScaler()

    vecs = words.fit_transform(data[content])
    # StandardScaler centres the data, so the matrix is densified first.
    vecs = np.asarray(vecs.todense())
    vecs = scaler.fit_transform(vecs)

    ##
    ## Fit models for feature importances
    ##

    models = {
        target: lm.LogisticRegression(multi_class='multinomial', max_iter=1000)
        for target in targets
    }

    for target, model in models.items():
        # Boolean mask applied positionally to the rows of `vecs`, which are
        # in the same order as the rows of `data`.
        mask = ~(data[target].isna())
        model.fit(vecs[mask.to_numpy(), :], data.loc[mask, target])

    ##
    ## Build return dataset
    ##

    # Sorting by feature index lines the rows up with the columns of coef_.
    features = pd.DataFrame(pd.Series(words.vocabulary_, name='ind')) \
                   .reset_index() \
                   .rename({'index': 'ngram'}, axis=1) \
                   .sort_values('ind')

    for target, model in models.items():
        coefs = model.coef_

        # For a two-class problem scikit-learn stores a single coefficient row
        # (for classes_[1]); the row for classes_[0] is its negation. Expand it
        # so that every class has a row instead of raising IndexError below.
        if coefs.shape[0] == 1 and len(model.classes_) == 2:
            coefs = np.vstack([-coefs, coefs])

        for i, c in enumerate(model.classes_):
            features['coef_' + target + '_' + str(c)] = coefs[i, :]

    return features

# Feature importances w/o blacklists

In [None]:
# No vocabulary is supplied here, so nothing is blacklisted at this stage.
features = features_and_coefs(data, vocabulary=None)

## View top features

In [None]:
# Number of n-grams to keep from each end of every coefficient column.
k = 300

# Walk the coefficient columns in DataFrame order. Going through a set (as
# before) made the row order of topk/botk depend on string hash
# randomisation, so it changed between kernel restarts.
coef_cols = [col for col in features.columns if col not in ('ngram', 'ind')]

topk = []
botk = []

for dv in coef_cols:
    ranked = features.loc[:, ['ngram', dv]].rename({dv: 'coef'}, axis=1)

    # Largest (most positive) coefficients for this class.
    topk.append(ranked.sort_values('coef', ascending=False).head(k))

    # Smallest (most negative) coefficients for this class.
    botk.append(ranked.sort_values('coef', ascending=True).head(k))

# Stack the per-class tables and drop exact (ngram, coef) repeats.
topk = pd.concat(topk, axis=0).drop_duplicates()
botk = pd.concat(botk, axis=0).drop_duplicates()

In [None]:
# For this output only: show every row, with 15 decimal places.
full_table = pd.option_context(
    'display.float_format', '{:.15f}'.format,
    'display.max_rows', None,
)
with full_table:
    display(topk)

In [None]:
# For this output only: show every row, with 15 decimal places.
full_table = pd.option_context(
    'display.float_format', '{:.15f}'.format,
    'display.max_rows', None,
)
with full_table:
    display(botk)

## Manual blacklist of n-grams

These n-grams are removed from the vocabulary that is passed to the `TfidfVectorizer` stage of the language models.

The idea here is that if, e.g., NPR announces "this is NPR News," and that's informative about what's public radio, that isn't very interesting. We care about n-grams that reflect general differences in language, not shows shouting themselves out.

There may be a more automated way to do this, but it's easy enough to do manually as well.

In [None]:
# N-grams removed from the model vocabulary.
# Every entry must end with a comma: Python silently concatenates adjacent
# string literals, which previously fused 'a r'/'abc_news', 'justin'/'k _f'
# and 'n _b'/'n _p' into single bogus entries.
blacklist = [
    '_a _b',
    '_b _c',
    '_c _a',
    '_c news',
    '_c news_radio',
    '_c radio',
    '_d _a',
    '_f _i',
    '_h _m',
    '_h',
    '_p _r',
    '_r and',
    '_r news',
    '_v _u',
    '_w _c',
    '_y',
    'a _b',
    'a _c',
    'a r',
    'abc_news',
    'all things',
    'all_things considered',
    'all_things',
    'and kathleen_collins',
    'and npr',
    'and w',
    'are i',
    'ari_shapiro',
    'as npr',
    'at cpr',
    'at k',
    'at npr',
    'b _b',
    'b',
    'bbc',
    'brian kill',
    'catherine',
    'cbs',
    'cbs_news',
    'coast am',
    'the_coast to',
    'coast to',
    'to coast',
    'collins wealth',
    'colorado public_radio',
    'com support',
    'comes from',
    'considered from',
    'considered',
    'contributors include',
    'cpr dot',
    'dana',
    'dave',
    'david_greene',
    'david_folkenflik',
    'donnell show',
    'dot org',
    'eight five',
    'eighty nine',
    'eighty w',
    'eighty_eight point',
    'eighty_eight',
    'for npr',
    'for on_point',
    'fox is',
    'fox',
    'fox_nation',
    'fox_news radio',
    'fox_news',
    'fox_sports radio',
    'fox_sports',
    'fresh_air',
    'from npr',
    'g p',
    'georgia public_broadcasting',
    'go fox_sports',
    'gordon deal',
    'ground_zero',
    'hannity',
    'heart radio',
    'heart radio_station',
    'here now',
    'i heart',
    'is all_things',
    'is fresh_air',
    'is made_possible',
    'is morning_edition',
    'is npr',
    'is on_point',
    'is supported',
    'j c',
    'j',
    'josh',
    'joshua_johnson',
    'justin',
    'k _f',
    'k u',
    'k',
    'kate',
    'kathleen_collins wealth_management',
    'kathleen_collins',
    'larry',
    'larry_elder show',
    'larry_elder',
    'lawrence',
    'm david_greene',
    'm joshua_johnson',
    'm noel_king',
    'm rachel',
    'm steve_inskeep',
    'm sam_sanders',
    'made_possible',
    'marketplace morning',
    'marketplace',
    'martin and',
    'martin',
    'melissa',
    'member station',
    'morning_edition from',
    'morning_edition on',
    'morning_edition',
    'n _b',
    'n _p',
    'nbc_news radio',
    'nbc_news',
    'new_england public_radio',
    'news dot',
    'news network',
    'news supported',
    'news talk',
    'news_radio eleven',
    'news_radio eleventh',
    'news_radio nine',
    'news_radio seven',
    'news_radio ten',
    'news_radio',
    'next fresh_air',
    'nine three',
    'ninety am',
    'ninety dot',
    'ninety point',
    'ninety_one point',
    'ninety_one',
    'noel_king',
    'nouri',
    'npr and',
    'npr comes',
    'npr i',
    'npr news',
    'npr s',
    'npr station',
    'npr stations',
    'npr',
    'o m',
    'o o',
    'o r',
    'o',
    'of new_england',
    'on fox_nation',
    'on fox_news',
    'on fox_sports',
    'on ground_zero',
    'on morning_edition',
    'on news_radio',
    'on npr',
    'on point',
    'on_point comes',
    'on_point',
    'one a',
    'org and',
    'org or',
    'org slash',
    'org',
    'other contributors',
    'p b',
    'p r',
    'public media',
    'public_broadcasting',
    'public_radio and',
    'public_radio comes',
    'public_radio is',
    'public_radio was',
    'public_radio',
    'r dot',
    'r g',
    'r',
    'rachel martin',
    'rachel',
    'radio lab',
    'radio music_festival',
    'radio_network',
    'radiolab',
    'rush_limbaugh',
    's morning_edition',
    's npr',
    'sam_sanders',
    'sarah',
    'sean_hannity show',
    'sean_hannity',
    'six one',
    'sixty eight',
    'slash npr',
    'sports_radio',
    'stations other',
    'stephen',
    'steve',
    'steve_inskeep and',
    'steve_inskeep',
    'support comes',
    'support for',
    'supported by',
    'talk thirteen',
    'talk_radio seventy',
    'terry_gross',
    'the bbc',
    'the fox_news',
    'the npr',
    'the sean_hannity',
    'the_take',
    'things considered',
    'thirteen eighty',
    'this npr',
    'this weekend_edition',
    'three three',
    'to all_things',
    'to fox_news',
    'to morning_edition',
    'to npr',
    'unk fox_news',
    'unk npr',
    'vicki',
    'w _a',
    'w _b',
    'w _v',
    'w a',
    'w b',
    'w i',
    'w j',
    'weekend_edition from',
    'weekend_edition',
    'westwood_one',
    'with george',
    'y _w',
    'your support',
]

## Construct vocabulary

In [None]:
# Keep every n-gram seen by the first model except the blacklisted ones.
# The set difference is sorted so that the vocabulary order is the same on
# every run; iterating a set of strings directly gives an order that changes
# with hash randomisation between kernel restarts.
vocab = pd.Series(sorted(set(features.ngram.tolist()) - set(blacklist)), name='word')

In [None]:
# Write the vocabulary as a single 'word' column, without the index.
vocab_path = 'data/radio/ngram-vocab.csv'
vocab.to_csv(vocab_path, index=False)

# Feature importances with blacklists

In [None]:
# Refit with the vocabulary restricted to the non-blacklisted n-grams.
bl_features = features_and_coefs(data, vocabulary=vocab)

## View top features

In [None]:
# Number of n-grams to keep from each end of every coefficient column.
k = 300

# Walk the coefficient columns in DataFrame order. Going through a set (as
# before) made the row order of topk/botk depend on string hash
# randomisation, so it changed between kernel restarts.
coef_cols = [col for col in bl_features.columns if col not in ('ngram', 'ind')]

topk = []
botk = []

for dv in coef_cols:
    ranked = bl_features.loc[:, ['ngram', dv]].rename({dv: 'coef'}, axis=1)

    # Largest (most positive) coefficients for this class.
    topk.append(ranked.sort_values('coef', ascending=False).head(k))

    # Smallest (most negative) coefficients for this class.
    botk.append(ranked.sort_values('coef', ascending=True).head(k))

# Stack the per-class tables and drop exact (ngram, coef) repeats.
topk = pd.concat(topk, axis=0).drop_duplicates()
botk = pd.concat(botk, axis=0).drop_duplicates()

In [None]:
# For this output only: show every row, with 15 decimal places.
full_table = pd.option_context(
    'display.float_format', '{:.15f}'.format,
    'display.max_rows', None,
)
with full_table:
    display(topk)

In [None]:
# For this output only: show every row, with 15 decimal places.
full_table = pd.option_context(
    'display.float_format', '{:.15f}'.format,
    'display.max_rows', None,
)
with full_table:
    display(botk)