# Models

## Import dependencies

In [1]:
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupShuffleSplit
from sklearn.linear_model import Ridge
from sklearn.inspection import permutation_importance

from scipy import stats
import random

## Import dataset

In [2]:
df = pd.read_csv("datasets/features_with_politeness.csv", encoding="ISO-8859-1").dropna()

## Explore dataset

In [3]:
print(df.shape)
df.head()

(10162, 48)


Unnamed: 0,question_id,answer_id,a_score_rel_q_score,answer_score,entities_matches,reputation,reply_by_author,len_answer_text,is_accepted,code_snippet_count,...,Subjectivity,Bare.Command,WH.Questions,YesNo.Questions,Gratitude,Apology,Truth.Intensifier,Affirmation,Adverb.Just,Conjunction.Start
0,22528,22594,-6,5,1,17497.0,False,628,False,0,...,0,0,0,0,0,0,0,0,0,0
1,22528,22587,2,13,0,17787.0,False,699,True,0,...,0,0,0,0,0,0,1,0,0,1
2,22528,22578,-11,0,1,21216.0,False,338,False,1,...,0,2,0,0,0,0,0,0,1,0
3,22528,22551,-10,1,2,480242.0,False,422,False,0,...,0,0,0,0,0,0,0,0,0,0
4,22528,22537,-11,0,0,10583.0,False,318,False,0,...,0,1,1,1,0,0,0,0,0,0


## Standardize features

In [4]:
df['q_score'] = df['answer_score'] - df['a_score_rel_q_score']
features = ['q_score', 'entities_matches', 'reputation', 'reply_by_author', 'len_answer_text', 'code_snippet_count', 'link_count', 'Hedges',
       'Positive.Emotion', 'Negative.Emotion',
       'Swearing', 'Negation', 'Informal.Title', 'Reasoning', 'Reassurance', 'Please', 'Agreement', 'Acknowledgement',
       'Subjectivity', 'Bare.Command', 'WH.Questions', 'YesNo.Questions',
       'Gratitude', 'Apology', 'Truth.Intensifier', 'Affirmation']

In [5]:
scaler = StandardScaler()
scaler.fit(df)
X = scaler.transform(df)
X = pd.DataFrame(X,columns=df.columns)
X

Unnamed: 0,question_id,answer_id,a_score_rel_q_score,answer_score,entities_matches,reputation,reply_by_author,len_answer_text,is_accepted,code_snippet_count,...,Bare.Command,WH.Questions,YesNo.Questions,Gratitude,Apology,Truth.Intensifier,Affirmation,Adverb.Just,Conjunction.Start,q_score
0,-0.146259,-0.385637,0.162401,-0.056518,0.708131,-0.316037,-0.185487,0.402484,-0.544218,-0.535847,...,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,-0.325376,-0.169873,-0.396733,-0.407037,-0.170743
1,-0.146259,-0.385638,0.179703,0.014073,-0.425816,-0.314292,-0.185487,0.548692,1.837498,-0.535847,...,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,1.706875,-0.169873,-0.396733,1.119841,-0.170743
2,-0.146259,-0.385638,0.151587,-0.100637,0.708131,-0.293650,-0.185487,-0.194703,-0.544218,0.540934,...,2.417250,-0.194408,-0.276662,-0.114404,-0.083912,-0.325376,-0.169873,1.412784,-0.407037,-0.170743
3,-0.146259,-0.385640,0.153750,-0.091813,1.842078,2.469529,-0.185487,-0.021724,-0.544218,-0.535847,...,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,-0.325376,-0.169873,-0.396733,-0.407037,-0.170743
4,-0.146259,-0.385641,0.151587,-0.100637,-0.425816,-0.357657,-0.185487,-0.235888,-0.544218,-0.535847,...,1.001615,3.265442,1.337252,-0.114404,-0.083912,-0.325376,-0.169873,-0.396733,-0.407037,-0.170743
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10157,-0.040083,-0.354761,-0.363156,0.075840,-0.425816,0.002162,-0.185487,-0.328555,-0.544218,-0.535847,...,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,1.706875,-0.169873,-0.396733,-0.407037,0.369809
10158,-0.040082,-0.354757,0.175378,-0.082989,0.708131,0.240396,-0.185487,1.065568,1.837498,-0.535847,...,-0.414020,-0.194408,1.337252,-0.114404,-0.083912,5.771376,-0.169873,-0.396733,2.646720,-0.189600
10159,-0.040082,-0.354759,0.173215,-0.091813,0.708131,7.138310,-0.185487,-0.202940,-0.544218,-0.535847,...,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,-0.325376,-0.169873,-0.396733,-0.407037,-0.189600
10160,-0.040073,-0.354755,0.166727,-0.030046,-0.425816,-0.071838,-0.185487,-0.182347,1.837498,0.540934,...,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,-0.325376,-0.169873,-0.396733,1.119841,-0.168648


## Train-test split

In [6]:
X = X[features]
y = df['answer_score']

gss = GroupShuffleSplit(n_splits=2, train_size=0.8, random_state=0)
train_ix, val_ix = next(gss.split(X, y, groups=df['question_id']))
    
X_train = X.iloc[train_ix]
y_train = y.iloc[train_ix]

X_val = X.iloc[val_ix]
y_val = y.iloc[val_ix]

## Train SVM

In [7]:
svr = SVR(kernel='linear', verbose=1)
svr.fit(X_train, y_train)
filename = 'marisa_model_' + str(len(X_train)) + '.sav'
joblib.dump(svr, filename)

[LibSVM]

['marisa_model_8122.sav']

## Feature Importance

In [8]:
r = permutation_importance(svr, X_val, y_val, n_repeats=20, random_state=0)

for i in r.importances_mean.argsort()[::-1]:
    print(f"{features[i]:<8}"
    f"{r.importances_mean[i]:.3f}")

q_score 0.016
len_answer_text0.002
code_snippet_count0.001
link_count0.000
Negation0.000
entities_matches0.000
reputation0.000
Reasoning0.000
Please  0.000
Subjectivity0.000
Agreement0.000
Reassurance0.000
Informal.Title0.000
Affirmation0.000
Apology 0.000
Truth.Intensifier-0.000
Gratitude-0.000
Swearing-0.000
Bare.Command-0.000
reply_by_author-0.000
Negative.Emotion-0.000
WH.Questions-0.000
Positive.Emotion-0.000
Acknowledgement-0.000
Hedges  -0.000
YesNo.Questions-0.000


## Predict `answer_score`/`answer_rank`

In [9]:
y_pred = svr.predict(X_val)
print(y_pred)

[2.07527232 2.12686179 0.67224426 ... 0.74221588 0.90244155 2.36891252]


In [10]:
pred_df = df.iloc[val_ix][['question_id', 'answer_id', 'answer_score']]
pred_df['pred_answer_score'] = y_pred
pred_df['answer_rank'] = pred_df.groupby('question_id')['answer_score'].rank(ascending=False)
pred_df['pred_rank'] = pred_df.groupby('question_id')['pred_answer_score'].rank(ascending=False)
pred_df

Unnamed: 0,question_id,answer_id,answer_score,pred_answer_score,answer_rank,pred_rank
90,22577,13023680,1,2.075272,1.0,2.0
92,22577,23943,0,2.126862,2.5,1.0
93,22577,22637,0,0.672244,2.5,3.0
116,22590,22653,1,2.493831,1.5,1.0
117,22590,22641,0,-0.170185,3.5,4.0
...,...,...,...,...,...,...
10460,421772,421859,0,0.898276,4.5,5.0
10466,421782,422087,0,2.428879,3.0,1.0
10467,421782,421824,0,0.742216,3.0,4.0
10468,421782,421815,0,0.902442,3.0,3.0


## Evaluate model

**Spearman Correlation**

In [11]:
stats.spearmanr(pred_df['answer_rank'], pred_df['pred_rank'])

SpearmanrResult(correlation=0.6670816767191241, pvalue=7.227936122678671e-263)

**Of all questions, what proportion of best answers did we predict correctly?**

In [12]:
answer_rank = pred_df[pred_df['answer_rank'] == 1]
pred_rank_exact = pred_df[pred_df['pred_rank'] == 1]

In [13]:
len(answer_rank.merge(pred_rank_exact, how='inner', on='answer_id')) / len(pred_df['question_id'].unique())

0.43804034582132567

**Of all questions, what proportion of best answers did we predict within top 3?**

In [14]:
pred_df['top_3'] = pred_df['pred_rank'].apply(lambda row: int(row < 4))
pred_rank_top3 = pred_df[pred_df['top_3'] == 1]

In [15]:
len(answer_rank.merge(pred_rank_top3, how='inner', on='answer_id')) / len(pred_df['question_id'].unique())

0.7204610951008645

**Of all questions, what proportion of best answers are predicted correctly if the ranks are randomly assigned?**

In [18]:
random.seed(0)

num_answers = pred_df['question_id'].value_counts()
pred_df['rand_rank'] = pred_df['question_id'].apply(lambda x: random.randint(1, num_answers[x])) 

rand_rank = pred_df[pred_df['rand_rank'] == 1]

In [19]:
len(answer_rank.merge(rand_rank, how='inner', on='answer_id')) / len(pred_df['question_id'].unique())

0.3170028818443804