In [None]:
import os
import pickle
import random
import logging

import numpy as np
import pandas as pd

import matplotlib as mp
%matplotlib inline
import matplotlib.pyplot as plt

import sklearn as sk
import sklearn.metrics as mt
import sklearn.pipeline as pp
import sklearn.linear_model as lm
import sklearn.preprocessing as pr
import sklearn.model_selection as ms
import sklearn.feature_extraction.text as te

from IPython.display import display
from tqdm.notebook import tqdm

from pandasql import sqldf
def pysqldf(q):
    """Run the SQL query ``q`` through pandasql against this notebook's globals."""
    return sqldf(q, globals())

In [None]:
# Render floats with three decimals and never truncate long text columns.
pd.options.display.float_format = lambda x: '%.3f' % x
pd.options.display.max_colwidth = None

In [None]:
# Timestamped INFO-level logging for the notebook; the "gensim" logger is
# raised to WARNING.
fmt = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=fmt)

logger = logging.getLogger(__name__)
logging.getLogger("gensim").setLevel(logging.WARNING)

In [None]:
# All data paths below are relative, so work from the project checkout.
project_root = os.path.expanduser('~/github/masthesis/')
os.chdir(project_root)

In [None]:
# One fixed seed shared by the stdlib and NumPy global generators (and reused
# below as an explicit random_state).
seed = 1511200828

for seed_rng in (random.seed, np.random.seed):
    seed_rng(seed)

# Utils

In [None]:
def grouper(it, n=None):
    """Yield the items of ``it`` as consecutive lists of at most ``n`` items.

    With ``n=None`` the whole iterable is yielded as one single list (an
    empty list if ``it`` is empty). Otherwise every yielded list has exactly
    ``n`` items except possibly the last; an empty iterable yields nothing.
    ``n`` must be positive when given.
    """
    assert n is None or n > 0

    if n is None:
        yield list(it)
        return

    chunk = []

    for item in it:
        # A full chunk is only emitted once the next item has arrived.
        if len(chunk) == n:
            yield chunk
            chunk = []

        chunk.append(item)

    # Whatever is left over (between 1 and n items) forms the final chunk.
    if chunk:
        yield chunk

# Prepare dataset

In [None]:
# Name of the outcome column; it is used as the classification target and
# for all label summaries below.
dv = 'follow_community'

In [None]:
# Load the dataset and keep only rows that have an outcome label.
data = (
    pd.read_csv('data/radio/show-pairs-content-with-twitter-metrics.csv')
    .dropna(subset=[dv])
)

display(data.shape)

In [None]:
# Number of rows per outcome value in the labelled data.
data[dv].value_counts()

In [None]:
# One row per distinct (show_id, show_name, label) combination, ordered by label.
show_labels = data[['show_id', 'show_name', dv]].drop_duplicates().sort_values(dv)

with pd.option_context('display.max_rows', None):
    display(show_labels)

In [None]:
# Fixed n-gram vocabulary, taken from the `word` column of the vocab file.
vocab = pd.read_csv('data/radio/ngram-vocab.csv')['word'].tolist()

In [None]:
# A single seeded group-wise shuffle split with train_size=0.75; the grouping
# key itself is supplied when .split() is called.
grp = ms.GroupShuffleSplit(n_splits=1, train_size=0.75, random_state=seed)

In [None]:
# Split by show_id so that a show's rows never straddle train and test.
train_inds, test_inds = next(grp.split(data, groups=data.show_id))

data_train = data.iloc[train_inds]
data_test = data.iloc[test_inds]

In [None]:
# Label distribution of the training split (the next cell shows the test
# split). This cell previously repeated the full-data counts already
# displayed before the split.
data_train[dv].value_counts()

In [None]:
# Number of rows per outcome value in the held-out test split.
data_test[dv].value_counts()

In [None]:
# splits = grp.split(data_train['content'], data_train[dv], groups=data_train['show_id'])

# display(data_test[dv].value_counts())
# assert data_test[dv].value_counts().shape[0] == data[dv].nunique()

# for tr, ts in splits:
#     trc = data_train.loc[data_train.index[tr], dv].value_counts()
#     tsc = data_train.loc[data_train.index[ts], dv].value_counts()
    
#     display(trc)
#     display(tsc)
    
#     assert trc.shape[0] == data[dv].nunique()
#     assert tsc.shape[0] == data[dv].nunique()

# Modeling

In [None]:
# Raw labels for fitting and scoring.
y_train, y_test = data_train[dv], data_test[dv]

# One indicator column per community (0-3), used by the per-class diagnostics.
y_train_bin, y_test_bin = (
    pr.label_binarize(labels, classes=[0, 1, 2, 3])
    for labels in (y_train, y_test)
)

In [None]:
# Text features: sublinear TF-IDF over word unigrams and bigrams, restricted
# to the precomputed vocabulary.
# NOTE(review): scikit-learn documents max_features as ignored when a fixed
# vocabulary is supplied - confirm whether the 20000 cap is meant to apply.
tfidf_step = te.TfidfVectorizer(
    input='content',
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    vocabulary=vocab,
    max_features=20000,
)

# Classifier: multinomial logistic regression.
logit_step = lm.LogisticRegression(
    multi_class='multinomial',
    max_iter=1000,
)

model = pp.Pipeline(steps=[
    ('words', tfidf_step),
    ('train', logit_step),
])

In [None]:
# Fit the vectorizer + classifier pipeline on the training text and labels.
model.fit(data_train['content'], y_train)

In [None]:
# Hard labels and class probabilities for both splits.
train_text, test_text = data_train['content'], data_test['content']

dvpred_train, dvpred_proba_train = model.predict(train_text), model.predict_proba(train_text)
dvpred_test, dvpred_proba_test = model.predict(test_text), model.predict_proba(test_text)

In [None]:
# Macro-averaged one-vs-rest ROC AUC on each split.
auc_options = dict(multi_class='ovr', average='macro')

is_score = mt.roc_auc_score(y_train, dvpred_proba_train, **auc_options)
oos_score = mt.roc_auc_score(y_test, dvpred_proba_test, **auc_options)

print('In-sample: {0}'.format(is_score))
print('Out-of-sample: {0}'.format(oos_score))

# In-sample diagnostics

In [None]:
# Largest label value per show in the training split, sorted by label.
train_label_by_show = data_train.groupby('show_name')[dv].max().sort_values()

with pd.option_context('display.max_rows', None):
    display(train_label_by_show)

In [None]:
# Confusion matrix on the training split (true labels passed first,
# predicted labels second).
pd.DataFrame(mt.confusion_matrix(y_train, dvpred_train))

In [None]:
y_train_bin = pr.label_binarize(data_train[dv], classes=[0, 1, 2, 3])

# One-vs-rest ROC curve per community: pair each indicator column with the
# matching column of predicted probabilities.
tmp = [
    mt.roc_curve(truth, score, pos_label=1)
    for truth, score in zip(y_train_bin.T, dvpred_proba_train.T)
]

fig, axes = plt.subplots(2, 2, figsize=(10, 10))

for i, (ax, (fpr, tpr, thresh)) in enumerate(zip(axes.flat, tmp)):
    auc = mt.auc(fpr, tpr)

    ax.plot(fpr, tpr, color='darkorange', lw=2,
            label='ROC curve (area = %0.3f)' % (auc,))
    # Diagonal reference line.
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

    ax.set(
        xlim=[0.0, 1.0],
        ylim=[0.0, 1.05],
        xlabel='False Positive Rate',
        ylabel='True Positive Rate',
        title='Community ' + str(i),
    )
    ax.legend(loc='lower right')

fig.tight_layout(rect=[0, 0.03, 1, 0.95])

In [None]:
fig, allaxes = plt.subplots(4, 2, figsize=(10, 10))

# One row of panels per community: binarized labels on the left, predicted
# probabilities on the right.
for (ax_truth, ax_pred), y, pred in zip(allaxes, y_train_bin.T, dvpred_proba_train.T):
    ax_truth.set(title='Ground truth', xlim=(0, 1))
    ax_pred.set(title='Predicted probabilities', xlim=(0, 1))

    pd.Series(y).hist(bins=50, ax=ax_truth)
    pd.Series(pred).hist(bins=50, ax=ax_pred)

fig.tight_layout()

In [None]:
fig, allaxes = plt.subplots(4, 2, figsize=(10, 10))

# One row of panels per community: predicted probabilities for rows outside
# the community (left) and inside it (right).
for (ax_neg, ax_pos), y, pred in zip(allaxes, y_train_bin.T, dvpred_proba_train.T):
    ax_neg.set(title='y = 0', xlim=(0, 1))
    ax_pos.set(title='y = 1', xlim=(0, 1))

    pd.Series(pred[y == 0]).hist(bins=50, ax=ax_neg)
    pd.Series(pred[y == 1]).hist(bins=50, ax=ax_pos)

fig.tight_layout()

In [None]:
fig, ax = plt.subplots()

ax.set_title('Mean predicted probability by date')

# Give the probability frame the same index as data_train before grouping.
# groupby aligns a Series key on index labels, and data_train keeps the row
# labels of the original file, so a default 0..n-1 index would attach dates
# to the wrong rows (or to none at all).
train_proba = pd.DataFrame(dvpred_proba_train, index=data_train.index)

train_proba.groupby(data_train['date']).mean().plot(ax=ax, rot=45)
_ = ax.set_ylim(0, 1)

# Out-of-sample diagnostics

In [None]:
# Largest label value per show in the test split, sorted by label.
test_label_by_show = data_test.groupby('show_name')[dv].max().sort_values()

with pd.option_context('display.max_rows', None):
    display(test_label_by_show)

In [None]:
# Confusion matrix on the test split (true labels passed first, predicted
# labels second).
pd.DataFrame(mt.confusion_matrix(y_test, dvpred_test))

In [None]:
# One-vs-rest ROC curve per community on the held-out split.
tmp = [
    mt.roc_curve(y, pred, pos_label=1)
    for y, pred in zip(list(y_test_bin.T), list(dvpred_proba_test.T))
]

# Share of test rows in each community, shown in the panel titles.
priors = y_test_bin.mean(axis=0)
names = ['Mixed', 'New York Liberals', 'DC Liberals', 'Conservatives']

fig, axes = plt.subplots(2, 2, figsize=(10, 10))

for i, (ax, prior, name, (fpr, tpr, thresh)) in enumerate(zip(axes.flat, priors, names, tmp)):
    auc = mt.auc(fpr, tpr)

    ax.plot(fpr, tpr, color='darkorange',
            lw=2, label='ROC curve (area = %0.3f)' % (auc,))
    # Diagonal reference line.
    ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')

    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.05])

    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')

    ax.set_title('Community ' + str(i) + ': ' + name + '\n' + str(round(prior*100, 1)) + '% of Test Set')
    ax.legend(loc='lower right')

# Enlarge every piece of text on each panel.
for ax in axes.flat:
    texts = [ax.title, ax.xaxis.label, ax.yaxis.label]
    texts += ax.get_xticklabels()
    texts += ax.get_yticklabels()
    # Reuse the legend created above. Calling ax.legend() here would build a
    # brand-new legend and throw away loc='lower right'.
    texts += ax.get_legend().get_texts()

    for item in texts:
        item.set_fontsize(14)

fig.tight_layout(rect=[0, 0.03, 1, 0.95])

In [None]:
fig, allaxes = plt.subplots(4, 2, figsize=(10, 10))

# One row of panels per community: binarized labels on the left, predicted
# probabilities on the right.
for (ax_truth, ax_pred), y, pred in zip(allaxes, y_test_bin.T, dvpred_proba_test.T):
    ax_truth.set(title='Ground truth', xlim=(0, 1))
    ax_pred.set(title='Predicted probabilities', xlim=(0, 1))

    pd.Series(y).hist(bins=50, ax=ax_truth)
    pd.Series(pred).hist(bins=50, ax=ax_pred)

fig.tight_layout()

In [None]:
fig, allaxes = plt.subplots(4, 2, figsize=(10, 10))

# One row of panels per community: predicted probabilities for rows outside
# the community (left) and inside it (right).
for (ax_neg, ax_pos), y, pred in zip(allaxes, y_test_bin.T, dvpred_proba_test.T):
    ax_neg.set(title='y = 0', xlim=(0, 1))
    ax_pos.set(title='y = 1', xlim=(0, 1))

    pd.Series(pred[y == 0]).hist(bins=50, ax=ax_neg)
    pd.Series(pred[y == 1]).hist(bins=50, ax=ax_pos)

fig.tight_layout()

In [None]:
fig, ax = plt.subplots()

ax.set_title('Mean predicted probability by date')

# Give the probability frame the same index as data_test before grouping.
# groupby aligns a Series key on index labels, and data_test keeps the row
# labels of the original file, so a default 0..n-1 index would attach dates
# to the wrong rows (or to none at all).
test_proba = pd.DataFrame(dvpred_proba_test, index=data_test.index)

test_proba.groupby(data_test['date']).mean().plot(ax=ax, rot=45)
_ = ax.set_ylim(0, 1)

# Feature importances

In [None]:
def features_and_coefs(data, dv, model, content='content', vocabulary=None):
    """Fit a copy of ``model`` on standardized TF-IDF features and tabulate its coefficients.

    Parameters
    ----------
    data : DataFrame holding the text column ``content`` and the outcome
        column ``dv``.
    dv : name of the outcome column.
    model : estimator exposing ``coef_`` after fitting. It is cloned before
        fitting, so the object passed in (for example a step of an already
        fitted pipeline) is left untouched.
    content : name of the text column.
    vocabulary : optional fixed vocabulary passed to the vectorizer.

    Returns
    -------
    DataFrame with one row per n-gram, ordered by feature index ``ind``, with
    column ``ngram`` plus either a single ``coef_<dv>`` column (regressors and
    two-class outcomes) or one ``coef_<dv>_<class>`` column per class.
    """
    ##
    ## Prep the features
    ##

    # NOTE(review): scikit-learn documents max_features as ignored when a
    # fixed vocabulary is supplied - confirm the 10000 cap is intended.
    words = te.TfidfVectorizer(
        input='content',
        sublinear_tf=True,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 2),
        vocabulary=vocabulary,

        max_features=10000
    )

    scaler = pr.StandardScaler()

    # Densify before scaling: the scaler is used with its default centering.
    vecs = words.fit_transform(data[content]).toarray()
    vecs = scaler.fit_transform(vecs)

    ##
    ## Fit models for feature importances
    ##

    # Work on an unfitted copy so the caller's estimator is not refit on
    # these differently scaled features.
    fitted = sk.base.clone(model)
    fitted.fit(vecs, data[dv])

    ##
    ## Build return dataset
    ##

    # vocabulary_ maps n-gram -> column index; sorting by that index lines the
    # rows up with the columns of coef_.
    features = pd.DataFrame(pd.Series(words.vocabulary_, name='ind')) \
                   .reset_index() \
                   .rename({'index': 'ngram'}, axis=1) \
                   .sort_values('ind')

    if sk.base.is_regressor(fitted):
        features['coef_' + dv] = fitted.coef_
    elif data[dv].nunique() > 2:
        for i, c in enumerate(fitted.classes_):
            features['coef_' + dv + '_' + str(c)] = fitted.coef_[i, :]
    else:
        # Two-class coef_ has a single row; flatten it to one value per feature.
        features['coef_' + dv] = fitted.coef_.ravel()

    return features

In [None]:
# Coefficients are estimated on the full labelled data, using the pipeline's
# classifier step as the estimator.
features = features_and_coefs(
    data,
    dv=dv,
    vocabulary=vocab,
    model=model.named_steps['train'],
)

## View top features

In [None]:
k = 300

# Coefficient columns in frame order. Building this list through a set made
# the row order of topk/botk depend on per-process string hashing.
coef_cols = [c for c in features.columns if c not in ('ngram', 'ind')]

topk = []
botk = []

for v in coef_cols:
    # Same ranking done twice: descending for the k largest coefficients,
    # ascending for the k smallest.
    for ascending, bucket in ((False, topk), (True, botk)):
        ranked = features.sort_values(v, ascending=ascending) \
                         .loc[:, ['ngram', v]] \
                         .rename({v: 'coef'}, axis=1) \
                         .head(k) \
                         .assign(dv=v)
        bucket.append(ranked)

topk = pd.concat(topk, axis=0).drop_duplicates()
botk = pd.concat(botk, axis=0).drop_duplicates()

In [None]:
# Show every row, with 15 decimals so that tiny coefficients stay visible.
with pd.option_context('display.float_format', lambda x: '%.15f' % x,
                       'display.max_rows', None):
    display(topk)

In [None]:
# Show every row, with 15 decimals so that tiny coefficients stay visible.
with pd.option_context('display.float_format', lambda x: '%.15f' % x,
                       'display.max_rows', None):
    display(botk)

In [None]:
# Short column headers for the LaTeX table.
recodes = {
    'coef_follow_community_0.0': '0',
    'coef_follow_community_1.0': '1',
    'coef_follow_community_2.0': '2',
    'coef_follow_community_3.0': '3',
}

# Draw five of the top-k n-grams per coefficient column.
tmp = (
    topk.groupby('dv')
    .apply(lambda rows: rows.sample(n=5))
    .drop('dv', axis=1)
    .reset_index()
    .drop('level_1', axis=1)
    .replace({'dv': recodes})
    .drop('coef', axis=1)
)

# Number the draws within each group, then pivot to one column per group.
tmp['pos'] = tmp.groupby('dv').cumcount() + 1
tmp = tmp.set_index(['pos', 'dv']).unstack('dv')
tmp.columns = [col[1] for col in tmp.columns]

print(tmp.to_latex(index=False))