# Models

## Import dependencies

In [10]:
import pandas as pd
import numpy as np
import joblib

from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, VotingRegressor

from sklearn.decomposition import PCA

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupShuffleSplit, train_test_split
from sklearn.inspection import permutation_importance
from sklearn.metrics import classification_report, average_precision_score

from scipy.stats import mstats

import matplotlib.pyplot as plt
import altair as alt

import random

## Import dataset

In [11]:
df = pd.read_csv("datasets/features_with_politeness.csv", lineterminator='\n', encoding="ISO-8859-1").dropna()
df['q_score'] = df['answer_score'] - df['a_score_rel_q_score']
df.head()

Unnamed: 0,question_id,answer_id,a_score_rel_q_score,answer_score,entities_matches,reputation,reply_by_author,len_answer_text,is_accepted,code_snippet_count,...,Bare.Command,WH.Questions,YesNo.Questions,Gratitude,Apology,Truth.Intensifier,Affirmation,Adverb.Just,Conjunction.Start,q_score
0,22528,22594,-6,5,1,17497.0,False,628,False,0,...,0,0,0,0,0,0,0,0,0,11
1,22528,22587,2,13,0,17787.0,False,699,True,0,...,0,0,0,0,0,1,0,0,1,11
2,22528,22578,-11,0,1,21216.0,False,338,False,1,...,2,0,0,0,0,0,0,1,0,11
3,22528,22551,-10,1,2,480242.0,False,422,False,0,...,0,0,0,0,0,0,0,0,0,11
4,22528,22537,-11,0,0,10583.0,False,318,False,0,...,1,1,1,0,0,0,0,0,0,11


In [12]:
features = [
       'entities_matches', 'reputation', 'reply_by_author', 'len_answer_text', 
       'code_snippet_count', 'link_count', 'Hedges',
       'Positive.Emotion', 'Negative.Emotion', 'Impersonal.Pronoun',
       'Swearing', 'Negation', 'Filler.Pause', 'Informal.Title',
       'Formal.Title', 'Could.You', 'Can.You', 'By.The.Way', 'Let.Me.Know',
       'Goodbye', 'For.Me', 'For.You', 'Reasoning', 'Reassurance',
       'Ask.Agency', 'Give.Agency', 'Hello', 'Please', 'First.Person.Plural',
       'First.Person.Single', 'Second.Person', 'Agreement', 'Acknowledgement',
       'Subjectivity', 'Bare.Command', 'WH.Questions', 'YesNo.Questions',
       'Gratitude', 'Apology', 'Truth.Intensifier', 'Affirmation',
       'Adverb.Just', 'Conjunction.Start', 'q_score']

In [13]:
scaler = StandardScaler()
scaler.fit(df[features])
X = scaler.transform(df[features])
X = pd.DataFrame(X,columns=features)
y = df['answer_score']

In [14]:
X

Unnamed: 0,entities_matches,reputation,reply_by_author,len_answer_text,code_snippet_count,link_count,Hedges,Positive.Emotion,Negative.Emotion,Impersonal.Pronoun,...,Bare.Command,WH.Questions,YesNo.Questions,Gratitude,Apology,Truth.Intensifier,Affirmation,Adverb.Just,Conjunction.Start,q_score
0,0.708131,-0.316037,-0.185487,0.402484,-0.535847,-0.462886,0.00201,-0.388082,1.185952,0.081510,...,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,-0.325376,-0.169873,-0.396733,-0.407037,-0.170743
1,-0.425816,-0.314292,-0.185487,0.548692,-0.535847,0.286970,-0.63629,0.551563,-0.228104,1.129112,...,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,1.706875,-0.169873,-0.396733,1.119841,-0.170743
2,0.708131,-0.293650,-0.185487,-0.194703,0.540934,-0.462886,1.91691,0.864778,0.125410,1.303712,...,2.417250,-0.194408,-0.276662,-0.114404,-0.083912,-0.325376,-0.169873,1.412784,-0.407037,-0.170743
3,1.842078,2.469529,-0.185487,-0.021724,-0.535847,-0.462886,0.00201,0.551563,-0.581618,0.256110,...,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,-0.325376,-0.169873,-0.396733,-0.407037,-0.170743
4,-0.425816,-0.357657,-0.185487,-0.235888,-0.535847,-0.462886,-0.63629,-0.701297,-0.581618,-0.442291,...,1.001615,3.265442,1.337252,-0.114404,-0.083912,-0.325376,-0.169873,-0.396733,-0.407037,-0.170743
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10157,-0.425816,0.002162,-0.185487,-0.328555,-0.535847,-0.462886,-0.63629,-0.701297,-0.228104,-0.267691,...,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,1.706875,-0.169873,-0.396733,-0.407037,0.369809
10158,0.708131,0.240396,-0.185487,1.065568,-0.535847,0.286970,0.64031,1.491207,0.478924,1.652913,...,-0.414020,-0.194408,1.337252,-0.114404,-0.083912,5.771376,-0.169873,-0.396733,2.646720,-0.189600
10159,0.708131,7.138310,-0.185487,-0.202940,-0.535847,-0.462886,0.64031,-0.701297,0.832438,0.605311,...,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,-0.325376,-0.169873,-0.396733,-0.407037,-0.189600
10160,-0.425816,-0.071838,-0.185487,-0.182347,0.540934,0.286970,-0.63629,-0.388082,-0.581618,0.256110,...,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,-0.325376,-0.169873,-0.396733,1.119841,-0.168648


In [15]:
y

0         5
1        13
2         0
3         1
4         0
         ..
10482    20
10483     2
10484     1
10485     8
10486     1
Name: answer_score, Length: 10162, dtype: int64

## Attempt PCA

In [8]:
# X2 = X.copy()

# # Bin answer scores for categorical visualization
# bins = [-20, 0, 2, 5, 7500]
# target,width_bins = pd.cut(y, bins, retbins=True, duplicates='drop')

# pca = PCA(n_components=2)
# Xnew = pd.DataFrame(pca.fit_transform(X2),columns=["PC1","PC2"])
# print('Explained variance ratio')
# print(pca.explained_variance_ratio_)

# source = Xnew.copy()
# source['answer_score_bins'] = target.astype(str)
# source.dropna(inplace=True)

# alt.Chart(source.sample(n=5000)).mark_point().encode(
#     x='PC1',
#     y='PC2',
#     color='answer_score_bins'
# )

Explained variance ratio
[0.14940757 0.03443175]


Explained variance ratio
[0.14940757 0.03446734]

In [16]:
# Setup variables for boxplots

top_scores = None
top_three_scores = None
map_scores = None
random_state = -1

In [43]:
from IPython.display import Javascript
random_state += 1
Javascript('IPython.notebook.execute_cells_below()')

<IPython.core.display.Javascript object>

## Train-test split

In [44]:
gss = GroupShuffleSplit(n_splits=2, train_size=0.8, random_state=random_state)
train_ix, val_ix = next(gss.split(X, y, groups=df['question_id']))
    
X_train = X.iloc[train_ix]
y_train = y.iloc[train_ix]

X_val = X.iloc[val_ix]
y_val = y.iloc[val_ix]

## Support Vector Regression

In [45]:
def fit_SVR(X_train, y_train):
    svr = SVR(kernel='linear')
    return svr.fit(X_train, y_train)

## Linear Regression

In [46]:
def fit_LR(X_train, y_train):
    lr = LinearRegression()
    return lr.fit(X_train, y_train)

## Random Forest Regressor

In [47]:
def fit_RF(X_train, y_train):
    rf = RandomForestRegressor(random_state=random_state, n_estimators=1000)
    return rf.fit(X_train, y_train)

#TODO @Riley: Add code and plot for hyperparameter tuning n_estimators 

## Feature Importance

In [48]:
# # SVR
# r = permutation_importance(fit_SVR(X_train, y_train), X_val, y_val, n_repeats=20, random_state=0)
# for i in r.importances_mean.argsort()[::-1]:
#     if r.importances_mean[i] > 0:
#         print(f"{features[i]:<8}"
#         f"{r.importances_mean[i]:.3f}")
    
# # LR
# r = permutation_importance(fit_LR(X_train, y_train), X_val, y_val, n_repeats=20, random_state=0)
# for i in r.importances_mean.argsort()[::-1]:
#     if r.importances_mean[i] > 0:
#         print(f"{features[i]:<8}"
#         f"{r.importances_mean[i]:.3f}")
   
# # RF
# r = permutation_importance(fit_RF(X_train, y_train), X_val, y_val, n_repeats=20, random_state=0)
# for i in r.importances_mean.argsort()[::-1]:
#     if r.importances_mean[i] > 0:
#         print(f"{features[i]:<8}"
#         f"{r.importances_mean[i]:.3f}")

**Top features for SVR:** q_score, len_answer_text, code_snippet_count

**Top features for LR:** q_score, len_answer_text, code_snippet_count, For.You, Acknowledgement, Negative.Emotion, First.Person.Single, Ask.Agency, YesNo.Questions

**Top features for RF:** q_score, len_answer_text, code_snippet_count, For.You, Acknowledgement, Negative.Emotion, First.Person.Single, Ask.Agency, YesNo.Questions, reputation

## Weighted Ensemble Regressor

In [49]:
svr = SVR(kernel='linear')
lr = LinearRegression()
rf = RandomForestRegressor(random_state=1, n_estimators=100) #TODO @Riley: Change n_estimators to best value
learners = [('svr', svr), ('lr', lr), ('rf', rf)]

In [50]:
def fit_VR(learners, weights, X_train, y_train):
    vr = VotingRegressor(learners, weights=weights)
    return vr.fit(X_train, y_train)

## Predict `answer_score`

In [51]:
predictions_df = df.iloc[val_ix][['question_id', 'answer_id', 'answer_score']]

In [52]:
poss_weights = [('None', None), ('2:1:1', [1/2, 1/4, 1/4]), ('1:2:1', [1/4, 1/2, 1/4]), ('1:1:2', [1/4, 1/4, 1/2])]

In [53]:
for name, weights in poss_weights:
    predictions_df['vr({})_pred'.format(name)] = fit_VR(learners, weights, X_train, y_train).predict(X_val)
    
predictions_df['svr_pred'] = fit_SVR(X_train, y_train).predict(X_val)
predictions_df['lr_pred'] = fit_LR(X_train, y_train).predict(X_val)
predictions_df['rf_pred'] = fit_RF(X_train, y_train).predict(X_val)

In [54]:
predictions_df.head()

Unnamed: 0,question_id,answer_id,answer_score,vr(None)_pred,vr(2:1:1)_pred,vr(1:2:1)_pred,vr(1:1:2)_pred,svr_pred,lr_pred,rf_pred
31,24596,5677149,1,3.578828,3.362875,4.421989,2.951621,2.715015,6.95147,1.167
32,24596,24611,9,10.028467,8.379158,10.384893,11.321351,3.431231,11.454171,14.483
33,24596,4932284,0,9.768125,8.001526,8.979255,12.323593,2.701729,6.612645,17.915
34,24596,24707,59,40.717784,33.151148,46.118865,42.883338,10.451242,62.32211,49.071
35,24596,40194,11,18.590781,14.841514,18.075243,22.855586,3.593713,16.528629,31.297


## Convert predicted `answer_score` to `rank`

In [55]:
model_names = [name for name, model in learners] + ['vr({})'.format(name) for name, weights in poss_weights]

for name in model_names:
    predictions_df['{}_rank'.format(name)] = predictions_df.groupby('question_id')['{}_pred'.format(name)].rank(ascending=False)

predictions_df['actual_rank'] = predictions_df.groupby('question_id')['answer_score'].rank(ascending=False)

In [56]:
predictions_df.head()

Unnamed: 0,question_id,answer_id,answer_score,vr(None)_pred,vr(2:1:1)_pred,vr(1:2:1)_pred,vr(1:1:2)_pred,svr_pred,lr_pred,rf_pred,svr_rank,lr_rank,rf_rank,vr(None)_rank,vr(2:1:1)_rank,vr(1:2:1)_rank,vr(1:1:2)_rank,actual_rank
31,24596,5677149,1,3.578828,3.362875,4.421989,2.951621,2.715015,6.95147,1.167,9.0,8.0,22.0,14.0,13.0,12.0,19.0,17.5
32,24596,24611,9,10.028467,8.379158,10.384893,11.321351,3.431231,11.454171,14.483,4.0,6.0,8.0,5.0,5.0,5.0,7.0,7.0
33,24596,4932284,0,9.768125,8.001526,8.979255,12.323593,2.701729,6.612645,17.915,10.0,9.0,5.0,7.0,7.0,8.0,5.0,20.5
34,24596,24707,59,40.717784,33.151148,46.118865,42.883338,10.451242,62.32211,49.071,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
35,24596,40194,11,18.590781,14.841514,18.075243,22.855586,3.593713,16.528629,31.297,2.0,3.0,2.0,2.0,2.0,2.0,2.0,6.0


## Evaluate Performance

**Spearman r**

In [57]:
%%capture --no-stdout 
# ^ Throws warning when computing Correlation(predicted,actual) when obs=2

models_spearmanr = {name: [] for name in model_names}

def get_spearmanr(pred, actual):
    return mstats.spearmanr(pred, actual, use_ties=True).correlation

for model in models_spearmanr.keys():
    for question_id in predictions_df['question_id'].unique():
        mask = (predictions_df['question_id'] == question_id)
        if len(predictions_df[mask]) > 1:
            spearmanr = get_spearmanr(predictions_df[mask]['{}_pred'.format(model)], predictions_df[mask]['answer_score'])
            models_spearmanr[model].append(spearmanr)

models_spearmanr = {key: sum(val) / len(val) for key, val in models_spearmanr.items()}

sp_stats = pd.DataFrame.from_dict(models_spearmanr, orient='index', columns=['spearmanr'])
print(sp_stats) 

           spearmanr
svr         0.229102
lr          0.121937
rf          0.171549
vr(None)    0.165105
vr(2:1:1)   0.166594
vr(1:2:1)   0.151451
vr(1:1:2)   0.174761


**Of all questions, what proportion of best answers are predicted correctly if the ranks are randomly assigned?**

In [67]:
prop = 0
answer_rank = predictions_df[predictions_df['actual_rank'] == 1]
for i in range(5):
    random.seed(i)
    num_answers = predictions_df['question_id'].value_counts()
    predictions_df['rand_rank'] = predictions_df['question_id'].apply(lambda x: random.randint(1, num_answers[x])) 

    rand_rank = predictions_df[predictions_df['rand_rank'] == 1]
    prop += len(answer_rank.merge(rand_rank, how='inner', on='answer_id')) / len(predictions_df['question_id'].unique())
    
prop /= 5
prop

0.2881844380403459

**Of all best answers, what proportion did we predict correctly?**

In [59]:
mask = (predictions_df['actual_rank'] == 1) # Get answers where actual rank is 1
actual_best = set(predictions_df[mask].index)

models_best = {}
for name in model_names:
    mask = (predictions_df['{}_rank'.format(name)] == 1) # Get answers where predicted rank is 1
    predicted_best = set(predictions_df[mask].index)
    models_best[name] = len(actual_best.intersection(predicted_best)) / len(actual_best)

top_stats = pd.DataFrame.from_dict(models_best, orient='index', columns=['proportion'])
print(top_stats) 

           proportion
svr          0.496479
lr           0.436620
rf           0.422535
vr(None)     0.447183
vr(2:1:1)    0.450704
vr(1:2:1)    0.443662
vr(1:1:2)    0.443662


**Of all best answers, what proportion were in our top 3?**

In [60]:
mask = (predictions_df['actual_rank'] == 1) # Get answers where actual rank is 1
actual_best = set(predictions_df[mask].index)

models_best = {}
for name in model_names:
    mask = (predictions_df['{}_rank'.format(name)] < 4) # Get answers where predicted rank is in the top 3
    predicted_best = set(predictions_df[mask].index)
    models_best[name] = len(actual_best.intersection(predicted_best)) / len(actual_best)

top_three_stats = pd.DataFrame.from_dict(models_best, orient='index', columns=['proportion'])
print(top_three_stats) 

           proportion
svr          0.827465
lr           0.792254
rf           0.820423
vr(None)     0.813380
vr(2:1:1)    0.813380
vr(1:2:1)    0.799296
vr(1:1:2)    0.830986


**MAP**

In [61]:
predictions_df['actual_map'] = predictions_df['actual_rank'].apply(lambda x: 1 if x <= 3 else 0)

models_map = {}
for name in model_names:
    predictions_df['{}_map'.format(name)] = predictions_df['{}_rank'.format(name)].apply(lambda x: 1 if x <= 3 else 0)
    models_map[name] = average_precision_score(predictions_df['actual_map'], predictions_df['{}_map'.format(name)])

map_stats = pd.DataFrame.from_dict(models_map, orient='index', columns=['avg_precision_score'])
print(map_stats)

           avg_precision_score
svr                   0.653457
lr                    0.620536
rf                    0.666832
vr(None)              0.631706
vr(2:1:1)             0.631706
vr(1:2:1)             0.624233
vr(1:1:2)             0.643115


In [62]:
top_stats.reset_index(inplace=True)
top_three_stats.reset_index(inplace=True)
map_stats.reset_index(inplace=True)
top_stats.columns = ['Model', 'Proportion']
top_three_stats.columns = ['Model', 'Proportion']
map_stats.columns = ['Model', 'Proportion']

if top_scores is None:
    top_scores = top_stats.copy()
    top_three_scores = top_three_stats.copy()
    map_scores = map_stats.copy()
else:
    top_scores = pd.concat([top_scores, top_stats])
    map_scores = pd.concat([map_scores, map_stats])
    top_three_scores = pd.concat([top_three_scores, top_three_stats])

In [63]:
print(top_scores)

alt.Chart(top_scores).mark_boxplot().encode(
    alt.Y("Proportion:Q", scale=alt.Scale(domain=[0.3, 0.6])),
    x='Model',
).properties(width=300)

       Model  Proportion
0        svr    0.501629
1         lr    0.478827
2         rf    0.452769
3   vr(None)    0.495114
4  vr(2:1:1)    0.495114
5  vr(1:2:1)    0.498371
6  vr(1:1:2)    0.498371
0        svr    0.496479
1         lr    0.436620
2         rf    0.422535
3   vr(None)    0.447183
4  vr(2:1:1)    0.450704
5  vr(1:2:1)    0.443662
6  vr(1:1:2)    0.443662


In [64]:
print(top_three_scores)

alt.Chart(top_three_scores).mark_boxplot().encode(
    alt.Y("Proportion:Q", scale=alt.Scale(domain=[0.7, 1])),
    x='Model',
).properties(width=300)

       Model  Proportion
0        svr    0.840391
1         lr    0.820847
2         rf    0.824104
3   vr(None)    0.840391
4  vr(2:1:1)    0.843648
5  vr(1:2:1)    0.833876
6  vr(1:1:2)    0.850163
0        svr    0.827465
1         lr    0.792254
2         rf    0.820423
3   vr(None)    0.813380
4  vr(2:1:1)    0.813380
5  vr(1:2:1)    0.799296
6  vr(1:1:2)    0.830986


In [65]:
print(map_scores)

alt.Chart(map_scores).mark_boxplot().encode(
    alt.Y("Proportion:Q", scale=alt.Scale(domain=[0.5, 0.8])),
    x='Model',
).properties(width=300)

       Model  Proportion
0        svr    0.643072
1         lr    0.641817
2         rf    0.648759
3   vr(None)    0.650663
4  vr(2:1:1)    0.650663
5  vr(1:2:1)    0.648122
6  vr(1:1:2)    0.650663
0        svr    0.653457
1         lr    0.620536
2         rf    0.666832
3   vr(None)    0.631706
4  vr(2:1:1)    0.631706
5  vr(1:2:1)    0.624233
6  vr(1:1:2)    0.643115
