In [None]:
import os
import pickle
import random
import logging

import numpy as np
import pandas as pd

import matplotlib as mp
%matplotlib inline
import matplotlib.pyplot as plt

import sklearn as sk
import sklearn.metrics as mt
import sklearn.pipeline as pp
import sklearn.linear_model as lm
import sklearn.preprocessing as pr
import sklearn.model_selection as ms
import sklearn.feature_extraction.text as te

In [None]:
def _three_decimals(value):
    """Render a float with exactly three digits after the decimal point."""
    return '%.3f' % value


# Notebook-wide pandas display settings: never truncate cell text, and show
# floats with three decimals.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.float_format', _three_decimals)

In [None]:
# Root logging configuration: timestamped messages at INFO level and above.
fmt = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=fmt)

# Logger for messages emitted by this notebook itself.
logger = logging.getLogger(__name__)

# Raise the threshold of the "gensim" logger to WARNING.
gensim_logger = logging.getLogger("gensim")
gensim_logger.setLevel(logging.WARNING)

In [None]:
# Work from the project checkout so the relative data paths used below resolve.
project_root = os.path.expanduser('~/github/masthesis/')
os.chdir(project_root)

In [None]:
# A single fixed seed, applied to both Python's and NumPy's global generators
# and reused for the train/test splitter further down.
seed = 1511200828

for seeder in (random.seed, np.random.seed):
    seeder(seed)

# Prepare dataset

In [None]:
# Name of the dependent-variable column; it is read from the train/test frames
# as the regression target and used to label coefficient columns.
dv = 'dim0_hosts'

In [None]:
# Load the modeling dataset (path is relative to the working directory set
# above) and show its dimensions as a quick sanity check.
data_path = 'data/radio/show-pairs-content-with-twitter-metrics.csv'
data = pd.read_csv(data_path)

display(data.shape)

In [None]:
# Fixed n-gram vocabulary for the TF-IDF vectorizers: the `word` column of the
# vocabulary file, as a plain Python list.
vocab_table = pd.read_csv('data/radio/ngram-vocab.csv')
vocab = vocab_table['word'].tolist()

In [None]:
# Group-aware shuffle splitter with a 75% training share, seeded with the
# notebook seed. Three splits are configured, but only the first one is drawn
# (the split cell calls `next()` on `grp.split(...)` once).
grp = ms.GroupShuffleSplit(n_splits=3, train_size=0.75, random_state=seed)

In [None]:
# Draw the first split from the splitter, grouping rows by show_id.
split_iter = grp.split(data, groups=data['show_id'])
train_inds, test_inds = next(split_iter)

# Independent copies of each partition.
data_train = data.iloc[train_inds, :].copy()
data_test = data.iloc[test_inds, :].copy()

# Modeling

In [None]:
# Text features: sublinear TF-IDF over word uni- and bigrams, restricted to the
# fixed vocabulary loaded above.
# NOTE(review): scikit-learn appears to ignore `max_features` when a fixed
# `vocabulary` is supplied, so the value below is probably inert - confirm.
tfidf_words = te.TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    strip_accents='unicode',
    sublinear_tf=True,
    vocabulary=vocab,
    max_features=20000,
)

# Estimator: ordinary least squares with an intercept.
ols = lm.LinearRegression(fit_intercept=True)

# Full pipeline: vectorize the text ('words'), then regress ('train').
model = pp.Pipeline(steps=[
    ('words', tfidf_words),
    ('train', ols),
])

In [None]:
# Fit the pipeline on the training partition only: text in `content`,
# target in the `dv` column.
model.fit(data_train['content'], data_train[dv])

In [None]:
def _predict_series(frame):
    """Predict from frame['content'] and return a Series aligned to frame's index."""
    return pd.Series(model.predict(frame['content']), index=frame.index)


# Predictions for both partitions, indexed like their source rows so they can
# be subtracted from / grouped with the original columns.
dvpred_train = _predict_series(data_train)
dvpred_test = _predict_series(data_test)

In [None]:
# R^2 on the training rows (in-sample) and the held-out rows (out-of-sample).
is_score = mt.r2_score(data_train[dv], dvpred_train)
oos_score = mt.r2_score(data_test[dv], dvpred_test)

for template, score in (('In-sample: {0}', is_score),
                        ('Out-of-sample: {0}', oos_score)):
    print(template.format(score))

# In-sample diagnostics

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(10, 5))
scatter_ax, hist_ax = axes

actual_train = data_train[dv]

# Left panel: predictions against observed values. Each axis starts at the
# smaller of (-1 std, min) and both ends are padded by 10%.
scatter_ax.set(
    title='Predicted vs actual',
    xlabel='Predicted',
    ylabel='Actual',
    xlim=(1.1 * min(-dvpred_train.std(), dvpred_train.min()), 1.1 * dvpred_train.max()),
    ylim=(1.1 * min(-actual_train.std(), actual_train.min()), 1.1 * actual_train.max()),
)

# Right panel: distribution of the residuals (actual - predicted).
hist_ax.set_title('Residuals')

_ = scatter_ax.scatter(dvpred_train, actual_train, s=5, alpha=0.75, c='navy', lw=0.25)
_ = hist_ax.hist(actual_train - dvpred_train, bins=50)

In [None]:
fig, ax = plt.subplots()

# In-sample residuals (actual - predicted); reused by the per-show plot below.
resid_train = data_train[dv] - dvpred_train

# Same padding rule as the other diagnostics: start at min(-1 std, min) and
# stretch both ends by 10%.
ax.set(
    title='Residuals vs actual',
    xlim=(1.1 * min(-data_train[dv].std(), data_train[dv].min()), 1.1 * data_train[dv].max()),
    ylim=(1.1 * min(-resid_train.std(), resid_train.min()), 1.1 * resid_train.max()),
)

ax.scatter(data_train[dv], resid_train)

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(10, 5))

# Both histograms share one x-range (0 to 110% of the largest actual value)
# so the two distributions are directly comparable.
upper = 1.1 * data_train[dv].max()
panels = (('Actual', data_train[dv]), ('Predicted', dvpred_train))

for panel, (title, values) in zip(axes, panels):
    panel.set_title(title)
    panel.set_xlim(0, upper)
    _ = values.hist(bins=50, ax=panel)

In [None]:
fig, ax = plt.subplots()
ax.set_title('Mean prediction by date')

# Average the in-sample predictions within each value of the `date` column.
daily_mean_train = pd.DataFrame(dvpred_train).groupby(data_train.date).mean()
daily_mean_train.plot(ax=ax, rot=45)

In [None]:
fig, axes = plt.subplots(3, 1, figsize=(15, 10), sharex=True)

# One show name per show_id, used as tick labels in place of the ids.
labels = data_train.groupby('show_id')['show_name'].max()
show_ids = data_train['show_id']

# Per-show means of actuals, predictions and residuals, one bar chart each.
for panel, values in zip(axes, (data_train[dv], dvpred_train, resid_train)):
    values.groupby(show_ids).mean().plot(kind='bar', ax=panel)

# Relabel only after all three charts are drawn (the x-axis is shared).
for panel in axes:
    panel.set_xticklabels(labels, rotation=90)

axes[0].set_title('Actuals by show')
axes[1].set_title('Predictions by show')
axes[2].set_title('Residuals by show')

# Out-of-sample diagnostics

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: held-out predictions against observed values.
axes[0].set_title('Predicted vs actual')
axes[0].set_xlabel('Predicted')
axes[0].set_ylabel('Actual')

# Pad both axes with the same rule as the in-sample figure: start at the
# smaller of (-1 std, min) and stretch both ends by 10%. Scaling the raw
# minimum by 1.1 would move the lower x-limit *above* the smallest prediction
# whenever that minimum is positive, clipping points off the plot.
axes[0].set_xlim(1.1 * min(-dvpred_test.std(), dvpred_test.min()), 1.1 * dvpred_test.max())
axes[0].set_ylim(1.1 * min(-data_test[dv].std(), data_test[dv].min()), 1.1 * data_test[dv].max())

# Right panel: distribution of the out-of-sample residuals (actual - predicted).
axes[1].set_title('Residuals')

_ = axes[0].scatter(dvpred_test, data_test[dv], s=5, alpha=0.75, c='navy', lw=0.25)
_ = axes[1].hist(data_test[dv] - dvpred_test, bins=50)

In [None]:
fig, ax = plt.subplots()

# Out-of-sample residuals (actual - predicted); reused by the per-show plot below.
resid_test = data_test[dv] - dvpred_test

# Start each axis at min(-1 std, min) and stretch both ends by 10%.
ax.set_xlim(1.1 * min(-data_test[dv].std(), data_test[dv].min()), 1.1 * data_test[dv].max())
ax.set_ylim(1.1 * min(-resid_test.std(), resid_test.min()), 1.1 * resid_test.max())

# Title this figure's own axes. Using `axes[1]` here would retitle a panel of
# the previous cell's figure and leave this plot untitled.
ax.set_title('Residuals vs actual')

ax.scatter(data_test[dv], resid_test)

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(10, 5))

axes[0].set_title('Actual')
axes[1].set_title('Predicted')

# Shared x-range (0 to 110% of the largest held-out actual value) so the two
# histograms are directly comparable.
axes[0].set_xlim(0, 1.1 * data_test[dv].max())
axes[1].set_xlim(0, 1.1 * data_test[dv].max())

# Out-of-sample diagnostics: the "Actual" panel must show the held-out
# targets, not the training targets.
_ = data_test[dv].hist(bins=50, ax=axes[0])
_ = dvpred_test.hist(bins=50, ax=axes[1])

In [None]:
fig, ax = plt.subplots()
ax.set_title('Mean prediction by date')

# Average the held-out predictions within each value of the `date` column.
daily_mean_test = pd.DataFrame(dvpred_test).groupby(data_test.date).mean()
daily_mean_test.plot(ax=ax, rot=45)

In [None]:
fig, axes = plt.subplots(3, 1, figsize=(15, 10), sharex=True)

# One show name per show_id, used as tick labels in place of the ids.
labels = data_test.groupby('show_id')['show_name'].max()
show_ids = data_test['show_id']

# Per-show means of actuals, predictions and residuals, one bar chart each.
for panel, values in zip(axes, (data_test[dv], dvpred_test, resid_test)):
    values.groupby(show_ids).mean().plot(kind='bar', ax=panel)

# Relabel only after all three charts are drawn (the x-axis is shared).
for panel in axes:
    panel.set_xticklabels(labels, rotation=90)

axes[0].set_title('Actuals by show')
axes[1].set_title('Predictions by show')
axes[2].set_title('Residuals by show')

# Feature importances

In [None]:
def features_and_coefs(data, dv, model, content='content', vocabulary=None):
    """Fit an estimator on standardized TF-IDF n-gram features and tabulate its coefficients.

    The text in ``data[content]`` is vectorized (sublinear TF-IDF over word
    uni- and bigrams), each feature column is standardized, and a fresh clone
    of ``model`` is fitted against ``data[dv]``. The estimator passed in is
    never refitted, so an estimator that lives inside an already-fitted
    pipeline is left untouched.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text column ``content`` and the target column ``dv``.
    dv : str
        Name of the target column; also used to name the coefficient columns.
    model : scikit-learn estimator
        Template estimator; must expose ``coef_`` (and ``classes_`` when used
        as a multi-class classifier) once fitted.
    content : str, default 'content'
        Name of the text column in ``data``.
    vocabulary : optional
        Fixed vocabulary forwarded to ``TfidfVectorizer``.

    Returns
    -------
    pandas.DataFrame
        One row per n-gram, sorted by feature index, with columns ``ngram``,
        ``ind`` (column position in the feature matrix) and either a single
        ``coef_<dv>`` column (regressors and two-class targets) or one
        ``coef_<dv>_<class>`` column per class.
    """
    ##
    ## Prep the features
    ##

    words = te.TfidfVectorizer(
        input='content',
        sublinear_tf=True,
        strip_accents='unicode',
        analyzer='word',
        token_pattern=r'\w{1,}',
        ngram_range=(1, 2),
        vocabulary=vocabulary,

        max_features=10000
    )

    scaler = pr.StandardScaler()

    # Densify before scaling: the scaler is used with its default settings
    # (which include mean-centering), so it is handed a dense array.
    vecs = words.fit_transform(data[content])
    vecs = scaler.fit_transform(vecs.toarray())

    ##
    ## Fit models for feature importances
    ##

    # Fit a clone so the caller's estimator keeps whatever state it had.
    estimator = sk.base.clone(model)
    estimator.fit(vecs, data[dv])

    ##
    ## Build return dataset
    ##

    # Sorting by `ind` puts rows in feature-matrix column order, so the
    # positional coefficient assignments below line up with their n-grams.
    features = pd.DataFrame(pd.Series(words.vocabulary_, name='ind')) \
                   .reset_index() \
                   .rename({'index': 'ngram'}, axis=1) \
                   .sort_values('ind')

    if sk.base.is_regressor(estimator):
        features['coef_' + dv] = estimator.coef_
    elif data[dv].nunique() > 2:
        for i, c in enumerate(estimator.classes_):
            features['coef_' + dv + '_' + str(c)] = estimator.coef_[i, :]
    else:
        # Two-class case: coef_ holds a single row; flatten it to 1-D.
        features['coef_' + dv] = np.ravel(estimator.coef_)

    return features

In [None]:
# Reuse the pipeline's regression step as the estimator for the coefficient
# table, which is computed over the full dataset with the fixed vocabulary.
regressor = dict(model.steps)['train']
features = features_and_coefs(data, dv=dv, model=regressor, vocabulary=vocab)

## View top features

In [None]:
# Number of n-grams to keep from each end of every coefficient ranking.
k = 300


def _ranked_ngrams(frame, column, ascending):
    """Return the first `k` rows of `frame` ranked by `column`.

    The result has columns `ngram`, `coef` (the values of `column`) and `dv`
    (the name of `column`).
    """
    ranked = frame.sort_values(column, ascending=ascending)
    ranked = ranked.loc[:, ['ngram', column]].rename({column: 'coef'}, axis=1)
    ranked['dv'] = column
    return ranked.head(k)


# Walk the coefficient columns in DataFrame order. Iterating over a set of
# column names would make the row order of the tables depend on string
# hashing, which can differ between kernel sessions.
coef_columns = [c for c in features.columns if c not in ('ngram', 'ind')]

# Largest coefficients first for `topk`, smallest first for `botk`.
topk = pd.concat([_ranked_ngrams(features, c, ascending=False) for c in coef_columns], axis=0)
botk = pd.concat([_ranked_ngrams(features, c, ascending=True) for c in coef_columns], axis=0)

topk = topk.drop_duplicates()
botk = botk.drop_duplicates()

In [None]:
# Show every row of the top-coefficient table with 15 decimal places; both
# display options are scoped to this one display call.
with pd.option_context(
    'display.float_format', lambda x: '%.15f' % x,
    'display.max_rows', None,
):
    display(topk)

In [None]:
# Show every row of the bottom-coefficient table with 15 decimal places; both
# display options are scoped to this one display call.
with pd.option_context(
    'display.float_format', lambda x: '%.15f' % x,
    'display.max_rows', None,
):
    display(botk)

In [None]:
def _sample_ngrams(ranked):
    """Sample five n-grams per `dv` and lay them out one column per `dv`.

    `ranked` needs the columns `ngram`, `coef` and `dv`. Rows of the result are
    numbered 1..5 by the `pos` index; columns are named after the `dv` values.
    """
    picked = ranked.groupby('dv').apply(lambda x: x.sample(n=5))
    picked = picked.drop('dv', axis=1).reset_index()
    picked = picked.drop('level_1', axis=1).drop('coef', axis=1)
    picked['pos'] = picked.groupby('dv').cumcount() + 1
    picked = picked.set_index(['pos', 'dv']).unstack('dv')
    picked.columns = [x[1] for x in picked.columns]
    return picked


# Sample from the top table first, then the bottom one (sampling draws on the
# global NumPy random state, so the order of these two calls matters).
tmpt = _sample_ngrams(topk)
tmpb = _sample_ngrams(botk)

# Side-by-side table of sampled top and bottom n-grams, emitted as LaTeX.
tmp = pd.concat([tmpt, tmpb], axis=1)
tmp.columns = ['top', 'bottom']

print(tmp.to_latex(index=False))