In [None]:
import os
import pickle
import random
import logging

import numpy as np
import pandas as pd

import matplotlib as mp
%matplotlib inline
import matplotlib.pyplot as plt

import sklearn as sk
import sklearn.metrics as mt
import sklearn.pipeline as pp
import sklearn.linear_model as lm
import sklearn.preprocessing as pr
import sklearn.model_selection as ms
import sklearn.feature_extraction.text as te

In [None]:
# Render floats with three decimals and never truncate long text columns.
pd.options.display.float_format = lambda x: '%.3f' % x
pd.options.display.max_colwidth = None

In [None]:
# Timestamped INFO-level logging for the notebook; gensim's logger is
# restricted to warnings and above.
fmt = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=fmt)

logger = logging.getLogger(__name__)
logging.getLogger("gensim").setLevel(logging.WARNING)

In [None]:
# Relative paths such as 'data/radio/...' are resolved against this directory.
project_root = os.path.expanduser('~/github/masthesis/')
os.chdir(project_root)

In [None]:
# One seed for both global generators (NumPy and the stdlib `random` module);
# the same value is reused as `random_state` for the train/test splitter.
seed = 1511200828

np.random.seed(seed)
random.seed(seed)

# Prepare dataset

In [None]:
# target_dim: continuous score column that is binarised below.
# dv: name of the resulting binary label column.
target_dim, dv = 'dim0_hosts', 'ideology'

In [None]:
data = pd.read_csv('data/radio/show-pairs-content-with-twitter-metrics.csv')

# Binary label: 1 where the target dimension is at or above its mean over all
# rows, 0 otherwise.
dim_mean = data[target_dim].mean()
data[dv] = data[target_dim].ge(dim_mean).astype(int)

display(data.shape)

In [None]:
# Fixed n-gram vocabulary, taken from the `word` column of the vocab file.
vocab = pd.read_csv('data/radio/ngram-vocab.csv')['word'].tolist()

In [None]:
# Group-aware shuffle splitter; a float train_size is the proportion of groups
# assigned to the training side.
grp = ms.GroupShuffleSplit(
    random_state=seed,
    train_size=0.75,
    n_splits=3,
)

In [None]:
# Take only the first partition the splitter yields. Rows are grouped by
# show_id, so a show's rows are never divided between train and test.
train_inds, test_inds = next(grp.split(data, groups=data['show_id']))

# Independent copies so prediction columns can be added without touching `data`.
data_train = data.iloc[train_inds].copy()
data_test = data.iloc[test_inds].copy()

In [None]:
# One row per distinct (show, label, score) combination, ordered by the
# continuous score, to eyeball where the binarisation cut falls.
with pd.option_context('display.max_rows', None):
    show_labels = data[['show_id', 'show_name', dv, target_dim]].drop_duplicates()
    display(show_labels.sort_values(target_dim))

# Modeling

In [None]:
# Hyper-parameter grid keyed by '<pipeline step>__<parameter>' for the 'train'
# step: 50 log-spaced C values between 0.1 and 10, and both penalty types.
# NOTE(review): no visible cell consumes this grid.
params = dict(
    train__C=np.logspace(-1, 1),
    train__penalty=['l1', 'l2'],
)

In [None]:
# Two-step pipeline: sub-linear TF-IDF over word uni-/bi-grams restricted to
# the fixed vocabulary, followed by a logistic regression.
# NOTE(review): scikit-learn documents `max_features` as ignored when a
# `vocabulary` is supplied, so the 20000 cap has no effect here.
tfidf_step = te.TfidfVectorizer(
    vocabulary=vocab,
    max_features=20000,
    ngram_range=(1, 2),
    token_pattern=r'\w{1,}',
    analyzer='word',
    strip_accents='unicode',
    sublinear_tf=True,
    input='content',
)
logit_step = lm.LogisticRegression(max_iter=1000)

model = pp.Pipeline(steps=[('words', tfidf_step), ('train', logit_step)])

In [None]:
# Fit vectorizer and classifier on the training rows only.
model.fit(X=data_train['content'], y=data_train[dv])

In [None]:
# Attach hard predictions and positive-class probabilities to both splits.
for frame in (data_train, data_test):
    texts = frame['content']

    frame.loc[:, 'dvpred'] = pd.Series(model.predict(texts), index=frame.index)

    # Column 1 of predict_proba is the probability of the second class in
    # model.classes_ (label 1 for the 0/1 target).
    frame.loc[:, 'dvpred_proba'] = pd.Series(model.predict_proba(texts)[:, 1],
                                            index=frame.index)

In [None]:
# ROC AUC on the training rows (in-sample) and the held-out rows (out-of-sample).
is_score, oos_score = (
    mt.roc_auc_score(frame[dv], frame['dvpred_proba'])
    for frame in (data_train, data_test)
)

print('In-sample: {0}'.format(is_score),
      'Out-of-sample: {0}'.format(oos_score),
      sep='\n')

# In-sample diagnostics

In [None]:
# Label assigned to each show in the training split, sorted by label value.
with pd.option_context('display.max_rows', None):
    label_by_show_train = data_train.groupby('show_name')[dv].max()
    display(label_by_show_train.sort_values())

In [None]:
# Confusion matrix on the training split: rows are true labels, columns are
# predicted labels.
cm_train = mt.confusion_matrix(data_train[dv], data_train['dvpred'])
pd.DataFrame(cm_train)

In [None]:
# ROC curve for the training split, with the chance diagonal for reference.
fpr, tpr, thresh = mt.roc_curve(data_train[dv], data_train['dvpred_proba'], pos_label=1)
auc = mt.auc(fpr, tpr)
roc_label = 'ROC curve (area = %0.3f)' % (auc,)

fig, ax = plt.subplots()

ax.plot(fpr, tpr, label=roc_label, color='darkorange', lw=2)
ax.plot([0, 1], [0, 1], linestyle='--', color='navy', lw=2)

ax.set(
    xlim=[0.0, 1.0],
    ylim=[0.0, 1.05],  # small headroom above TPR = 1
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC curve',
)
ax.legend(loc='lower right')

In [None]:
# Training split: distribution of the true labels next to the distribution of
# predicted probabilities, both on a [0, 1] x-axis.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

panels = (
    ('Ground truth', data_train[dv]),
    ('Predicted probabilities', data_train['dvpred_proba']),
)

for panel, (panel_title, values) in zip(axes, panels):
    panel.set_title(panel_title)
    panel.set_xlim(0, 1)
    _ = values.hist(bins=50, ax=panel)

In [None]:
# Training split: predicted-probability histograms for each true class.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

for panel, (panel_title, cls) in zip(axes, (('y = 0', 0), ('y = 1', 1))):
    panel.set_title(panel_title)
    panel.set_xlim(0, 1)

    in_class = data_train[dv] == cls
    _ = data_train.loc[in_class, 'dvpred_proba'].hist(bins=50, ax=panel)

In [None]:
# Training split: average predicted probability per date.
daily_mean_train = data_train.groupby('date')['dvpred_proba'].mean()

fig, ax = plt.subplots()
ax.set_title('Mean predicted probability by date')

_ = daily_mean_train.plot(ax=ax, rot=45)
_ = ax.set_ylim(0, 1)

In [None]:
# Training split: mean predicted probability (top) and mean residual (bottom)
# per show. Residual = true label minus predicted probability.
resid_train = data_train[dv] - data_train['dvpred_proba']

# Both the labels and the bars come from groupbys on show_id, so they share
# the same (sorted) show order.
labels = data_train.groupby('show_id')['show_name'].max()
show_ids_train = data_train['show_id']

fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(15, 10))

data_train['dvpred_proba'].groupby(show_ids_train).mean().plot.bar(ax=axes[0])
resid_train.groupby(show_ids_train).mean().plot.bar(ax=axes[1])

for panel in axes:
    panel.set_xticklabels(labels)

axes[0].set_title('Predictions by show')
axes[1].set_title('Residuals by show')

# Out-of-sample diagnostics

In [None]:
# Label assigned to each show in the held-out split, sorted by label value.
with pd.option_context('display.max_rows', None):
    label_by_show_test = data_test.groupby('show_name')[dv].max()
    display(label_by_show_test.sort_values())

In [None]:
# Confusion matrix on the held-out split: rows are true labels, columns are
# predicted labels.
cm_test = mt.confusion_matrix(data_test[dv], data_test['dvpred'])
pd.DataFrame(cm_test)

In [None]:
# ROC curve for the held-out split, with the chance diagonal for reference.
fpr, tpr, thresh = mt.roc_curve(data_test[dv], data_test['dvpred_proba'], pos_label=1)
auc = mt.auc(fpr, tpr)
roc_label = 'ROC curve (area = %0.3f)' % (auc,)

fig, ax = plt.subplots()

ax.plot(fpr, tpr, label=roc_label, color='darkorange', lw=2)
ax.plot([0, 1], [0, 1], linestyle='--', color='navy', lw=2)

ax.set(
    xlim=[0.0, 1.0],
    ylim=[0.0, 1.05],  # small headroom above TPR = 1
    xlabel='False Positive Rate',
    ylabel='True Positive Rate',
    title='ROC curve',
)
ax.legend(loc='lower right')

In [None]:
# Held-out split: distribution of the true labels next to the distribution of
# predicted probabilities, both on a [0, 1] x-axis.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

panels = (
    ('Ground truth', data_test[dv]),
    ('Predicted probabilities', data_test['dvpred_proba']),
)

for panel, (panel_title, values) in zip(axes, panels):
    panel.set_title(panel_title)
    panel.set_xlim(0, 1)
    _ = values.hist(bins=50, ax=panel)

In [None]:
# Held-out split: predicted-probability histograms for each true class.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

for panel, (panel_title, cls) in zip(axes, (('y = 0', 0), ('y = 1', 1))):
    panel.set_title(panel_title)
    panel.set_xlim(0, 1)

    in_class = data_test[dv] == cls
    _ = data_test.loc[in_class, 'dvpred_proba'].hist(bins=50, ax=panel)

In [None]:
# Held-out split: average predicted probability per date.
daily_mean_test = data_test.groupby('date')['dvpred_proba'].mean()

fig, ax = plt.subplots()
ax.set_title('Mean predicted probability by date')

_ = daily_mean_test.plot(ax=ax, rot=45)
_ = ax.set_ylim(0, 1)

In [None]:
# Held-out split: mean predicted probability (top) and mean residual (bottom)
# per show. Residual = true label minus predicted probability.
resid_test = data_test[dv] - data_test['dvpred_proba']

# Both the labels and the bars come from groupbys on show_id, so they share
# the same (sorted) show order.
labels = data_test.groupby('show_id')['show_name'].max()
show_ids_test = data_test['show_id']

fig, axes = plt.subplots(nrows=2, ncols=1, sharex=True, figsize=(15, 10))

data_test['dvpred_proba'].groupby(show_ids_test).mean().plot.bar(ax=axes[0])
resid_test.groupby(show_ids_test).mean().plot.bar(ax=axes[1])

for panel in axes:
    panel.set_xticklabels(labels)

axes[0].set_title('Predictions by show')
axes[1].set_title('Residuals by show')

# Feature importances

In [None]:
def features_and_coefs(data, dv, model, content='content', vocabulary=None):
    """Fit a copy of ``model`` on standardized TF-IDF features and tabulate its coefficients.

    The text in ``data[content]`` is vectorized with a word uni-/bi-gram
    TF-IDF vectorizer, converted to a dense array and standardized. An
    unfitted clone of ``model`` is then fit against ``data[dv]``; the
    estimator passed in is left untouched, so an already-fitted estimator
    (for example a step of a fitted pipeline) keeps its fitted state.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns named by ``content`` and ``dv``.
    dv : str
        Name of the target column; also used to name the coefficient columns.
    model : scikit-learn estimator
        Linear estimator exposing ``coef_`` after fitting (and ``classes_``
        when the target has more than two distinct values). Only its
        hyper-parameters are used.
    content : str, default 'content'
        Name of the text column.
    vocabulary : optional
        Fixed vocabulary handed to the vectorizer. When omitted, the
        vocabulary is learned from the data.

    Returns
    -------
    pandas.DataFrame
        One row per vocabulary entry, sorted by its feature index, with
        columns ``ngram``, ``ind`` and either a single ``coef_<dv>`` column
        (regressors and binary targets) or one ``coef_<dv>_<class>`` column
        per class (more than two distinct target values).
    """
    ##
    ## Prep the features
    ##

    words = te.TfidfVectorizer(
        input='content',
        sublinear_tf=True,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 2),
        vocabulary=vocabulary,

        # Per the scikit-learn docs this cap is ignored when a vocabulary is
        # supplied; it only applies when the vocabulary is learned.
        max_features=10000
    )

    scaler = pr.StandardScaler()

    vecs = words.fit_transform(data[content])
    # The scaler is applied to a dense array, so this materializes a full
    # (n_documents, n_features) matrix in memory.
    vecs = scaler.fit_transform(vecs.toarray())

    ##
    ## Fit models for feature importances
    ##

    # Fit a clone, not the caller's instance: fitting in place would overwrite
    # the coefficients of an estimator the caller may still use for prediction.
    estimator = sk.base.clone(model)
    estimator.fit(vecs, data[dv])

    ##
    ## Build return dataset
    ##

    # Sorting by feature index lines the rows up with the columns of `vecs`,
    # so the coefficient arrays below can be assigned positionally.
    features = pd.DataFrame(pd.Series(words.vocabulary_, name='ind')) \
                   .reset_index() \
                   .rename({'index': 'ngram'}, axis=1) \
                   .sort_values('ind')

    if sk.base.is_regressor(estimator):
        features['coef_' + dv] = estimator.coef_
    elif data[dv].nunique() > 2:
        for i, c in enumerate(estimator.classes_):
            features['coef_' + dv + '_' + str(c)] = estimator.coef_[i, :]
    else:
        # Binary classifiers store coef_ as (1, n_features); flatten to 1-D.
        features['coef_' + dv] = np.ravel(estimator.coef_)

    return features

In [None]:
# Coefficients come from a refit on the full dataset (train and test rows
# together) with standardized features, using the pipeline's classifier step
# as the estimator.
features = features_and_coefs(
    data,
    dv=dv,
    model=model.named_steps['train'],
    vocabulary=vocab,
)

## View top features

In [None]:
# Number of n-grams to keep at each end of every coefficient ranking.
k = 300

# Coefficient columns in DataFrame order. Going through a set of column names
# would make the order (and so the row order of topk/botk) depend on string
# hash randomization, i.e. change between kernel restarts.
coef_cols = [col for col in features.columns if col not in ('ngram', 'ind')]

topk = []
botk = []

for coef_col in coef_cols:
    # Long format: one (ngram, coef, dv) row per feature, where `dv` records
    # which coefficient column the value came from.
    ranked = features.loc[:, ['ngram', coef_col]].rename({coef_col: 'coef'}, axis=1)
    ranked['dv'] = coef_col

    topk.append(ranked.sort_values('coef', ascending=False).head(k))
    botk.append(ranked.sort_values('coef', ascending=True).head(k))

topk = pd.concat(topk, axis=0).drop_duplicates()
botk = pd.concat(botk, axis=0).drop_duplicates()

In [None]:
# Show every row of the largest coefficients at 15-decimal precision.
with pd.option_context('display.float_format', lambda x: '%.15f' % x,
                       'display.max_rows', None):
    display(topk)

In [None]:
# Show every row of the smallest coefficients at 15-decimal precision.
with pd.option_context('display.float_format', lambda x: '%.15f' % x,
                       'display.max_rows', None):
    display(botk)