# Models

## Import dependencies

In [1]:
import pandas as pd
import numpy as np
import joblib

from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor, VotingRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupShuffleSplit, train_test_split
from sklearn.inspection import permutation_importance
from sklearn.metrics import classification_report, average_precision_score

from scipy.stats import mstats

## Import dataset

In [2]:
df = pd.read_csv("datasets/features_with_politeness.csv", lineterminator='\n', encoding="ISO-8859-1").dropna()
df['q_score'] = df['answer_score'] - df['a_score_rel_q_score']
df.head()

Unnamed: 0,question_id,answer_id,a_score_rel_q_score,answer_score,entities_matches,reputation,reply_by_author,len_answer_text,is_accepted,code_snippet_count,...,Bare.Command,WH.Questions,YesNo.Questions,Gratitude,Apology,Truth.Intensifier,Affirmation,Adverb.Just,Conjunction.Start,q_score
0,22528,22594,-6,5,1,17497.0,False,628,False,0,...,0,0,0,0,0,0,0,0,0,11
1,22528,22587,2,13,0,17787.0,False,699,True,0,...,0,0,0,0,0,1,0,0,1,11
2,22528,22578,-11,0,1,21216.0,False,338,False,1,...,2,0,0,0,0,0,0,1,0,11
3,22528,22551,-10,1,2,480242.0,False,422,False,0,...,0,0,0,0,0,0,0,0,0,11
4,22528,22537,-11,0,0,10583.0,False,318,False,0,...,1,1,1,0,0,0,0,0,0,11


In [3]:
features = [
       'entities_matches', 'reputation', 'reply_by_author', 'len_answer_text', 
       'code_snippet_count', 'link_count', 'Hedges',
       'Positive.Emotion', 'Negative.Emotion', 'Impersonal.Pronoun',
       'Swearing', 'Negation', 'Filler.Pause', 'Informal.Title',
       'Formal.Title', 'Could.You', 'Can.You', 'By.The.Way', 'Let.Me.Know',
       'Goodbye', 'For.Me', 'For.You', 'Reasoning', 'Reassurance',
       'Ask.Agency', 'Give.Agency', 'Hello', 'Please', 'First.Person.Plural',
       'First.Person.Single', 'Second.Person', 'Agreement', 'Acknowledgement',
       'Subjectivity', 'Bare.Command', 'WH.Questions', 'YesNo.Questions',
       'Gratitude', 'Apology', 'Truth.Intensifier', 'Affirmation',
       'Adverb.Just', 'Conjunction.Start', 'q_score']

In [4]:
scaler = StandardScaler()
scaler.fit(df[features])
X = scaler.transform(df[features])
X = pd.DataFrame(X,columns=features)
y = df['answer_score']

In [5]:
X

Unnamed: 0,entities_matches,reputation,reply_by_author,len_answer_text,code_snippet_count,link_count,Hedges,Positive.Emotion,Negative.Emotion,Impersonal.Pronoun,...,Bare.Command,WH.Questions,YesNo.Questions,Gratitude,Apology,Truth.Intensifier,Affirmation,Adverb.Just,Conjunction.Start,q_score
0,0.708131,-0.316037,-0.185487,0.402484,-0.535847,-0.462886,0.00201,-0.388082,1.185952,0.081510,...,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,-0.325376,-0.169873,-0.396733,-0.407037,-0.170743
1,-0.425816,-0.314292,-0.185487,0.548692,-0.535847,0.286970,-0.63629,0.551563,-0.228104,1.129112,...,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,1.706875,-0.169873,-0.396733,1.119841,-0.170743
2,0.708131,-0.293650,-0.185487,-0.194703,0.540934,-0.462886,1.91691,0.864778,0.125410,1.303712,...,2.417250,-0.194408,-0.276662,-0.114404,-0.083912,-0.325376,-0.169873,1.412784,-0.407037,-0.170743
3,1.842078,2.469529,-0.185487,-0.021724,-0.535847,-0.462886,0.00201,0.551563,-0.581618,0.256110,...,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,-0.325376,-0.169873,-0.396733,-0.407037,-0.170743
4,-0.425816,-0.357657,-0.185487,-0.235888,-0.535847,-0.462886,-0.63629,-0.701297,-0.581618,-0.442291,...,1.001615,3.265442,1.337252,-0.114404,-0.083912,-0.325376,-0.169873,-0.396733,-0.407037,-0.170743
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10157,-0.425816,0.002162,-0.185487,-0.328555,-0.535847,-0.462886,-0.63629,-0.701297,-0.228104,-0.267691,...,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,1.706875,-0.169873,-0.396733,-0.407037,0.369809
10158,0.708131,0.240396,-0.185487,1.065568,-0.535847,0.286970,0.64031,1.491207,0.478924,1.652913,...,-0.414020,-0.194408,1.337252,-0.114404,-0.083912,5.771376,-0.169873,-0.396733,2.646720,-0.189600
10159,0.708131,7.138310,-0.185487,-0.202940,-0.535847,-0.462886,0.64031,-0.701297,0.832438,0.605311,...,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,-0.325376,-0.169873,-0.396733,-0.407037,-0.189600
10160,-0.425816,-0.071838,-0.185487,-0.182347,0.540934,0.286970,-0.63629,-0.388082,-0.581618,0.256110,...,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,-0.325376,-0.169873,-0.396733,1.119841,-0.168648


In [6]:
y

0         5
1        13
2         0
3         1
4         0
         ..
10482    20
10483     2
10484     1
10485     8
10486     1
Name: answer_score, Length: 10162, dtype: int64

## Train-test split

In [7]:
gss = GroupShuffleSplit(n_splits=2, train_size=0.8, random_state=0)
train_ix, val_ix = next(gss.split(X, y, groups=df['question_id']))
    
X_train = X.iloc[train_ix]
y_train = y.iloc[train_ix]

X_val = X.iloc[val_ix]
y_val = y.iloc[val_ix]

## Support Vector Regression

In [8]:
def fit_SVR(X_train, y_train):
    svr = SVR(kernel='linear')
    return svr.fit(X_train, y_train)

## Linear Regression

In [9]:
def fit_LR(X_train, y_train):
    lr = LinearRegression()
    return lr.fit(X_train, y_train)

## Random Forest Regressor

In [10]:
def fit_RF(X_train, y_train):
    rf = RandomForestRegressor(random_state=0, n_estimators=1000)
    return rf.fit(X_train, y_train)

#TODO @Riley: Add code and plot for hyperparameter tuning n_estimators 

## Feature Importance

In [11]:
# # SVR
# r = permutation_importance(fit_SVR(X_train, y_train), X_val, y_val, n_repeats=20, random_state=0)
# for i in r.importances_mean.argsort()[::-1]:
#     if r.importances_mean[i] > 0:
#         print(f"{features[i]:<8}"
#         f"{r.importances_mean[i]:.3f}")
    
# # LR
# r = permutation_importance(fit_LR(X_train, y_train), X_val, y_val, n_repeats=20, random_state=0)
# for i in r.importances_mean.argsort()[::-1]:
#     if r.importances_mean[i] > 0:
#         print(f"{features[i]:<8}"
#         f"{r.importances_mean[i]:.3f}")
   
# # RF
# r = permutation_importance(fit_RF(X_train, y_train), X_val, y_val, n_repeats=20, random_state=0)
# for i in r.importances_mean.argsort()[::-1]:
#     if r.importances_mean[i] > 0:
#         print(f"{features[i]:<8}"
#         f"{r.importances_mean[i]:.3f}")

**Top features for SVR:** q_score, len_answer_text, code_snippet_count

**Top features for LR:** q_score, len_answer_text, code_snippet_count, For.You, Acknowledgement, Negative.Emotion, First.Person.Single, Ask.Agency, YesNo.Questions

**Top features for RF:** q_score, len_answer_text, code_snippet_count, For.You, Acknowledgement, Negative.Emotion, First.Person.Single, Ask.Agency, YesNo.Questions, reputation

## Weighted Ensemble Regressor

In [12]:
svr = SVR(kernel='linear')
lr = LinearRegression()
rf = RandomForestRegressor(random_state=0, n_estimators=100) #TODO @Riley: Change n_estimators to best value
learners = [('svr', svr), ('lr', lr), ('rf', rf)]

In [13]:
def fit_VR(learners, weights, X_train, y_train):
    vr = VotingRegressor(learners, weights=weights)
    return vr.fit(X_train, y_train)

## Predict `answer_score`

In [14]:
predictions_df = df.iloc[val_ix][['question_id', 'answer_id', 'answer_score']]

In [15]:
poss_weights = [('None', None), ('2:1:1', [1/2, 1/4, 1/4]), ('1:2:1', [1/4, 1/2, 1/4]), ('1:1:2', [1/4, 1/4, 1/2])]

In [16]:
for name, weights in poss_weights:
    predictions_df['vr({})_pred'.format(name)] = fit_VR(learners, weights, X_train, y_train).predict(X_val)
    
predictions_df['svr_pred'] = fit_SVR(X_train, y_train).predict(X_val)
predictions_df['lr_pred'] = fit_LR(X_train, y_train).predict(X_val)
predictions_df['rf_pred'] = fit_RF(X_train, y_train).predict(X_val)

In [17]:
predictions_df.head()

Unnamed: 0,question_id,answer_id,answer_score,vr(None)_pred,vr(2:1:1)_pred,vr(1:2:1)_pred,vr(1:1:2)_pred,svr_pred,lr_pred,rf_pred
90,22577,13023680,1,2.170639,2.10329,2.518148,1.890479,1.901243,3.560674,1.039
92,22577,23943,0,3.648978,3.225694,4.362006,3.359233,1.955843,6.50109,2.349
93,22577,22637,0,-1.430921,-0.890699,-2.493873,-0.908191,0.729966,-5.682729,0.812
116,22590,22653,1,0.905182,1.182178,-0.048018,1.581387,2.013164,-2.907617,3.46
117,22590,22641,0,3.781381,2.930164,4.602943,3.811036,0.376514,7.06763,3.301


## Convert predicted `answer_score` to `rank`

In [18]:
model_names = [name for name, model in learners] + ['vr({})'.format(name) for name, weights in poss_weights]

for name in model_names:
    predictions_df['{}_rank'.format(name)] = predictions_df.groupby('question_id')['{}_pred'.format(name)].rank(ascending=False)

predictions_df['actual_rank'] = predictions_df.groupby('question_id')['answer_score'].rank(ascending=False)

In [19]:
predictions_df.head()

Unnamed: 0,question_id,answer_id,answer_score,vr(None)_pred,vr(2:1:1)_pred,vr(1:2:1)_pred,vr(1:1:2)_pred,svr_pred,lr_pred,rf_pred,svr_rank,lr_rank,rf_rank,vr(None)_rank,vr(2:1:1)_rank,vr(1:2:1)_rank,vr(1:1:2)_rank,actual_rank
90,22577,13023680,1,2.170639,2.10329,2.518148,1.890479,1.901243,3.560674,1.039,2.0,2.0,2.0,2.0,2.0,2.0,2.0,1.0
92,22577,23943,0,3.648978,3.225694,4.362006,3.359233,1.955843,6.50109,2.349,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.5
93,22577,22637,0,-1.430921,-0.890699,-2.493873,-0.908191,0.729966,-5.682729,0.812,3.0,3.0,3.0,3.0,3.0,3.0,3.0,2.5
116,22590,22653,1,0.905182,1.182178,-0.048018,1.581387,2.013164,-2.907617,3.46,1.0,2.0,1.0,2.0,2.0,2.0,2.0,1.5
117,22590,22641,0,3.781381,2.930164,4.602943,3.811036,0.376514,7.06763,3.301,4.0,1.0,2.0,1.0,1.0,1.0,1.0,3.5


## Evaluate Performance

**Spearman r**

In [20]:
%%capture --no-stdout 
# ^ Throws warning when computing Correlation(predicted,actual) when obs=2

models_spearmanr = {name: [] for name in model_names}

def get_spearmanr(pred, actual):
    return mstats.spearmanr(pred, actual, use_ties=True).correlation

for model in models_spearmanr.keys():
    for question_id in predictions_df['question_id'].unique():
        mask = (predictions_df['question_id'] == question_id)
        if len(predictions_df[mask]) > 1:
            spearmanr = get_spearmanr(predictions_df[mask]['{}_pred'.format(model)], predictions_df[mask]['answer_score'])
            models_spearmanr[model].append(spearmanr)

models_spearmanr = {key: sum(val) / len(val) for key, val in models_spearmanr.items()}

# Must print() because of capture magic
print(pd.DataFrame.from_dict(models_spearmanr, orient='index', columns=['spearmanr'])) 

           spearmanr
svr         0.263366
lr          0.215870
rf          0.216942
vr(None)    0.241107
vr(2:1:1)   0.241749
vr(1:2:1)   0.233321
vr(1:1:2)   0.255546


**Of all best answers, what proportion did we predict correctly?**

In [21]:
mask = (predictions_df['actual_rank'] == 1) # Get answers where actual rank is 1
actual_best = set(predictions_df[mask].index)

models_best = {}
for name in model_names:
    mask = (predictions_df['{}_rank'.format(name)] == 1) # Get answers where predicted rank is 1
    predicted_best = set(predictions_df[mask].index)
    models_best[name] = len(actual_best.intersection(predicted_best)) / len(actual_best)

print(pd.DataFrame.from_dict(models_best, orient='index', columns=['proportion'])) 

# TODO @Chloe: Add lineplot

           proportion
svr          0.501629
lr           0.478827
rf           0.452769
vr(None)     0.491857
vr(2:1:1)    0.488599
vr(1:2:1)    0.495114
vr(1:1:2)    0.498371


**Of all best answers, what proportion were in our top 3?**

In [22]:
mask = (predictions_df['actual_rank'] == 1) # Get answers where actual rank is 1
actual_best = set(predictions_df[mask].index)

models_best = {}
for name in model_names:
    mask = (predictions_df['{}_rank'.format(name)] < 4) # Get answers where predicted rank is 1
    predicted_best = set(predictions_df[mask].index)
    models_best[name] = len(actual_best.intersection(predicted_best)) / len(actual_best)

print(pd.DataFrame.from_dict(models_best, orient='index', columns=['proportion'])) 

# TODO @Chloe: Add lineplot

           proportion
svr          0.840391
lr           0.820847
rf           0.824104
vr(None)     0.846906
vr(2:1:1)    0.843648
vr(1:2:1)    0.833876
vr(1:1:2)    0.850163


**MAP**

In [23]:
predictions_df['actual_map'] = predictions_df['actual_rank'].apply(lambda x: 1 if x <= 3 else 0)

models_map = {}
for name in model_names:
    predictions_df['{}_map'.format(name)] = predictions_df['{}_rank'.format(name)].apply(lambda x: 1 if x <= 3 else 0)
    models_map[name] = average_precision_score(predictions_df['actual_map'], predictions_df['{}_map'.format(name)])

print(pd.DataFrame.from_dict(models_map, orient='index', columns=['avg_precision_score']))

# TODO @Chloe: Add lineplot 

           avg_precision_score
svr                   0.643072
lr                    0.641817
rf                    0.648759
vr(None)              0.654496
vr(2:1:1)             0.651938
vr(1:2:1)             0.651938
vr(1:1:2)             0.651938
