# Ensemble Model

## Import dependencies

In [1]:
import pandas as pd
import numpy as np
import joblib

from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupShuffleSplit
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.inspection import permutation_importance

from scipy import stats
from scipy.stats import mstats

## Import dataset

In [2]:
df = pd.read_csv("datasets/features_with_politeness.csv", lineterminator='\n', encoding="ISO-8859-1").dropna()
df['q_score'] = df['answer_score'] - df['a_score_rel_q_score']
df.head()
print(len(df))

10162


In [3]:
features = [
       'entities_matches', 'reputation', 'reply_by_author', 'len_answer_text',
       'is_accepted', 'code_snippet_count', 'link_count', 'Hedges',
       'Positive.Emotion', 'Negative.Emotion', 'Impersonal.Pronoun',
       'Swearing', 'Negation', 'Filler.Pause', 'Informal.Title',
       'Formal.Title', 'Could.You', 'Can.You', 'By.The.Way', 'Let.Me.Know',
       'Goodbye', 'For.Me', 'For.You', 'Reasoning', 'Reassurance',
       'Ask.Agency', 'Give.Agency', 'Hello', 'Please', 'First.Person.Plural',
       'First.Person.Single', 'Second.Person', 'Agreement', 'Acknowledgement',
       'Subjectivity', 'Bare.Command', 'WH.Questions', 'YesNo.Questions',
       'Gratitude', 'Apology', 'Truth.Intensifier', 'Affirmation',
       'Adverb.Just', 'Conjunction.Start', 'q_score']

len(features)

45

In [4]:
scaler = StandardScaler()
scaler.fit(df[features])
X = scaler.transform(df[features])
X = pd.DataFrame(X,columns=features)
y = df['answer_score']

In [5]:
X

Unnamed: 0,entities_matches,reputation,reply_by_author,len_answer_text,is_accepted,code_snippet_count,link_count,Hedges,Positive.Emotion,Negative.Emotion,...,Bare.Command,WH.Questions,YesNo.Questions,Gratitude,Apology,Truth.Intensifier,Affirmation,Adverb.Just,Conjunction.Start,q_score
0,0.708131,-0.316037,-0.185487,0.402484,-0.544218,-0.535847,-0.462886,0.00201,-0.388082,1.185952,...,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,-0.325376,-0.169873,-0.396733,-0.407037,-0.170743
1,-0.425816,-0.314292,-0.185487,0.548692,1.837498,-0.535847,0.286970,-0.63629,0.551563,-0.228104,...,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,1.706875,-0.169873,-0.396733,1.119841,-0.170743
2,0.708131,-0.293650,-0.185487,-0.194703,-0.544218,0.540934,-0.462886,1.91691,0.864778,0.125410,...,2.417250,-0.194408,-0.276662,-0.114404,-0.083912,-0.325376,-0.169873,1.412784,-0.407037,-0.170743
3,1.842078,2.469529,-0.185487,-0.021724,-0.544218,-0.535847,-0.462886,0.00201,0.551563,-0.581618,...,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,-0.325376,-0.169873,-0.396733,-0.407037,-0.170743
4,-0.425816,-0.357657,-0.185487,-0.235888,-0.544218,-0.535847,-0.462886,-0.63629,-0.701297,-0.581618,...,1.001615,3.265442,1.337252,-0.114404,-0.083912,-0.325376,-0.169873,-0.396733,-0.407037,-0.170743
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10157,-0.425816,0.002162,-0.185487,-0.328555,-0.544218,-0.535847,-0.462886,-0.63629,-0.701297,-0.228104,...,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,1.706875,-0.169873,-0.396733,-0.407037,0.369809
10158,0.708131,0.240396,-0.185487,1.065568,1.837498,-0.535847,0.286970,0.64031,1.491207,0.478924,...,-0.414020,-0.194408,1.337252,-0.114404,-0.083912,5.771376,-0.169873,-0.396733,2.646720,-0.189600
10159,0.708131,7.138310,-0.185487,-0.202940,-0.544218,-0.535847,-0.462886,0.64031,-0.701297,0.832438,...,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,-0.325376,-0.169873,-0.396733,-0.407037,-0.189600
10160,-0.425816,-0.071838,-0.185487,-0.182347,1.837498,0.540934,0.286970,-0.63629,-0.388082,-0.581618,...,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,-0.325376,-0.169873,-0.396733,1.119841,-0.168648


In [6]:
y

0         5
1        13
2         0
3         1
4         0
         ..
10482    20
10483     2
10484     1
10485     8
10486     1
Name: answer_score, Length: 10162, dtype: int64

## Train-test split

In [7]:
gss = GroupShuffleSplit(n_splits=2, train_size=0.8, random_state=0)
train_ix, val_ix = next(gss.split(X, y, groups=df['question_id']))
    
X_train = X.iloc[train_ix]
y_train = y.iloc[train_ix]

X_val = X.iloc[val_ix]
y_val = y.iloc[val_ix]

## Model 1 (SVM)

In [8]:
svr = SVR(kernel='linear')
svr.fit(X_train, y_train)

filename = 'models/svm_model_' + str(len(X_train)) + '.sav'
joblib.dump(svr, filename)

svm_pred = svr.predict(X_val)
print(svm_pred)

[1.25798782 1.38968789 3.08437483 ... 0.27590428 0.79686186 4.60247526]


## Model 2 (Linear Regression)

In [9]:
reg = LinearRegression().fit(X_train, y_train)

filename = 'models/reg_model_' + str(len(X_train)) + '.sav'
joblib.dump(reg, filename)

reg_pred = reg.predict(X_val)
print(reg_pred)

[-1.52242113  1.69725951 11.05846245 ... -7.05887171 -9.51974177
 23.90525346]


## Model 3 (Random Forest)

In [10]:
rf = RandomForestClassifier(random_state=0, n_estimators=1000)
rf.fit(X_train, y_train)

filename = 'models/rf_model_' + str(len(X_train)) + '.sav'
joblib.dump(rf, filename)

rf_pred = rf.predict(X_val)
print(rf_pred)

[0 0 1 ... 0 0 1]


## Feature Importance

In [11]:
# # Model 1
# r = permutation_importance(svr, X_val, y_val, n_repeats=20, random_state=0)

# for i in r.importances_mean.argsort()[::-1]:
#     print(f"{features[i]:<8}"
#     f"{r.importances_mean[i]:.3f}")
    
# # Model 2
# r = permutation_importance(reg, X_val, y_val, n_repeats=20, random_state=0)

# for i in r.importances_mean.argsort()[::-1]:
#     print(f"{features[i]:<8}"
#     f"{r.importances_mean[i]:.3f}")
   
# # Model 3
# r = permutation_importance(reg, X_val, y_val, n_repeats=20, random_state=0)

# for i in r.importances_mean.argsort()[::-1]:
#     print(f"{features[i]:<8}"
#     f"{r.importances_mean[i]:.3f}")

## Evaluating Models

In [32]:
# Add predictions to pred_df
pred_df = df.iloc[val_ix][['question_id', 'answer_id', 'answer_score']]
pred_df['svm_pred_answer_score'] = svm_pred
pred_df['reg_pred_answer_score'] = reg_pred
pred_df['rf_pred_answer_score'] = rf_pred

# Convert predicted answer_score to predicted rankings
pred_df['answer_rank'] = pred_df.groupby('question_id')['answer_score'].rank(ascending=False)
pred_df['svm_pred_rank'] = pred_df.groupby('question_id')['svm_pred_answer_score'].rank(ascending=False)
pred_df['reg_pred_rank'] = pred_df.groupby('question_id')['reg_pred_answer_score'].rank(ascending=False)
pred_df['rf_pred_rank'] = pred_df.groupby('question_id')['rf_pred_answer_score'].rank(ascending=False)

In [134]:
%%capture --no-stdout

def get_metric(pred_answer_score, answer_score):
    return mstats.spearmanr(pred_answer_score, answer_score, use_ties=True).correlation

models = {'svm_pred_answer_score': [], 'reg_pred_answer_score': [], 'rf_pred_answer_score': []}
for model_answer_score in models.keys():
    for question_id in pred_df['question_id'].unique():
        mask = (pred_df['question_id'] == question_id)
        if len(pred_df[mask]) > 1:
            corr = mstats.spearmanr(pred_df[mask][model_answer_score].to_list(), pred_df[mask]['answer_score'].to_list(), use_ties=True).correlation
            models[model_answer_score].append(corr)
            
for key, val in models.items():
    print(key, "Avg Correlation Per Question:", sum(val) / len(val))

svm_pred_answer_score Avg Correlation Per Question: 0.4913708143219172
reg_pred_answer_score Avg Correlation Per Question: 0.4315530314889336
rf_pred_answer_score Avg Correlation Per Question: 0.4499857383469573


**Of all questions, what proportion of best answers did we predict correctly?**

In [133]:
answer_rank = pred_df[pred_df['answer_rank'] == 1]
svm_pred_rank_exact = pred_df[pred_df['svm_pred_rank'] == 1]
reg_pred_rank_exact = pred_df[pred_df['reg_pred_rank'] == 1]
rf_pred_rank_exact = pred_df[pred_df['rf_pred_rank'] == 1]
print('svm_pred_rank_exact', len(answer_rank.merge(svm_pred_rank_exact, how='inner', on='answer_id')) / len(pred_df['question_id'].unique()))
print('reg_pred_rank_exact', len(answer_rank.merge(reg_pred_rank_exact, how='inner', on='answer_id')) / len(pred_df['question_id'].unique()))
print('rf_pred_rank_exact', len(answer_rank.merge(rf_pred_rank_exact, how='inner', on='answer_id')) / len(pred_df['question_id'].unique()))

svm_pred_rank_exact 0.6484149855907781
reg_pred_rank_exact 0.6051873198847262
rf_pred_rank_exact 0.590778097982709


**Of all questions, what proportion of best answers did we predict within top 3?**

In [137]:
pred_df['svm_top_3'] = pred_df['svm_pred_rank'].apply(lambda row: int(row < 4))
pred_df['reg_top_3'] = pred_df['reg_pred_rank'].apply(lambda row: int(row < 4))
pred_df['rf_top_3'] = pred_df['rf_pred_rank'].apply(lambda row: int(row < 4))

print('svm_pred_rank_exact', len(answer_rank.merge(pred_df[pred_df['svm_top_3'] == 1], how='inner', on='answer_id')) / len(pred_df['question_id'].unique()))
print('reg_pred_rank_exact', len(answer_rank.merge(pred_df[pred_df['reg_top_3'] == 1], how='inner', on='answer_id')) / len(pred_df['question_id'].unique()))
print('rf_pred_rank_exact', len(answer_rank.merge(pred_df[pred_df['rf_top_3'] == 1], how='inner', on='answer_id')) / len(pred_df['question_id'].unique()))

svm_pred_rank_exact 0.8357348703170029
reg_pred_rank_exact 0.8357348703170029
rf_pred_rank_exact 0.8155619596541787
