# Models

## Import dependencies

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from scipy import stats
from sklearn.inspection import permutation_importance
from sklearn.model_selection import GroupShuffleSplit

## Import dataset

In [13]:
df = pd.read_csv("datasets/features_with_politeness.csv", encoding="ISO-8859-1").dropna()

## Explore dataset

In [14]:
print(df.shape)
df.head()

(10162, 48)


Unnamed: 0,question_id,answer_id,a_score_rel_q_score,answer_score,entities_matches,reputation,reply_by_author,len_answer_text,is_accepted,code_snippet_count,...,Subjectivity,Bare.Command,WH.Questions,YesNo.Questions,Gratitude,Apology,Truth.Intensifier,Affirmation,Adverb.Just,Conjunction.Start
0,22528,22594,-6,5,1,17497.0,False,628,False,0,...,0,0,0,0,0,0,0,0,0,0
1,22528,22587,2,13,0,17787.0,False,699,True,0,...,0,0,0,0,0,0,1,0,0,1
2,22528,22578,-11,0,1,21216.0,False,338,False,1,...,0,2,0,0,0,0,0,0,1,0
3,22528,22551,-10,1,2,480242.0,False,422,False,0,...,0,0,0,0,0,0,0,0,0,0
4,22528,22537,-11,0,0,10583.0,False,318,False,0,...,0,1,1,1,0,0,0,0,0,0


## Split train-test

In [15]:
# What is answer_score?

features = ['a_score_rel_q_score', 'answer_score',
       'entities_matches', 'reputation', 'reply_by_author', 'len_answer_text', 'code_snippet_count', 'link_count', 'Hedges',
       'Positive.Emotion', 'Negative.Emotion',
       'Swearing', 'Negation', 'Informal.Title', 'Reasoning', 'Reassurance', 'Please', 'Agreement', 'Acknowledgement',
       'Subjectivity', 'Bare.Command', 'WH.Questions', 'YesNo.Questions',
       'Gratitude', 'Apology', 'Truth.Intensifier', 'Affirmation']

In [16]:
scaler = StandardScaler()
scaler.fit(df)
X = scaler.transform(df)
X = pd.DataFrame(X,columns=df.columns)
X

Unnamed: 0,question_id,answer_id,a_score_rel_q_score,answer_score,entities_matches,reputation,reply_by_author,len_answer_text,is_accepted,code_snippet_count,...,Subjectivity,Bare.Command,WH.Questions,YesNo.Questions,Gratitude,Apology,Truth.Intensifier,Affirmation,Adverb.Just,Conjunction.Start
0,-0.146259,-0.385637,0.162401,-0.056518,0.708131,-0.316037,-0.185487,0.402484,-0.544218,-0.535847,...,-0.320186,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,-0.325376,-0.169873,-0.396733,-0.407037
1,-0.146259,-0.385638,0.179703,0.014073,-0.425816,-0.314292,-0.185487,0.548692,1.837498,-0.535847,...,-0.320186,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,1.706875,-0.169873,-0.396733,1.119841
2,-0.146259,-0.385638,0.151587,-0.100637,0.708131,-0.293650,-0.185487,-0.194703,-0.544218,0.540934,...,-0.320186,2.417250,-0.194408,-0.276662,-0.114404,-0.083912,-0.325376,-0.169873,1.412784,-0.407037
3,-0.146259,-0.385640,0.153750,-0.091813,1.842078,2.469529,-0.185487,-0.021724,-0.544218,-0.535847,...,-0.320186,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,-0.325376,-0.169873,-0.396733,-0.407037
4,-0.146259,-0.385641,0.151587,-0.100637,-0.425816,-0.357657,-0.185487,-0.235888,-0.544218,-0.535847,...,-0.320186,1.001615,3.265442,1.337252,-0.114404,-0.083912,-0.325376,-0.169873,-0.396733,-0.407037
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10157,-0.040083,-0.354761,-0.363156,0.075840,-0.425816,0.002162,-0.185487,-0.328555,-0.544218,-0.535847,...,2.126227,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,1.706875,-0.169873,-0.396733,-0.407037
10158,-0.040082,-0.354757,0.175378,-0.082989,0.708131,0.240396,-0.185487,1.065568,1.837498,-0.535847,...,4.572639,-0.414020,-0.194408,1.337252,-0.114404,-0.083912,5.771376,-0.169873,-0.396733,2.646720
10159,-0.040082,-0.354759,0.173215,-0.091813,0.708131,7.138310,-0.185487,-0.202940,-0.544218,-0.535847,...,2.126227,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,-0.325376,-0.169873,-0.396733,-0.407037
10160,-0.040073,-0.354755,0.166727,-0.030046,-0.425816,-0.071838,-0.185487,-0.182347,1.837498,0.540934,...,-0.320186,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,-0.325376,-0.169873,-0.396733,1.119841


In [17]:
X = X[features]
y = df['answer_score']

gss = GroupShuffleSplit(n_splits=2, train_size=0.8, random_state=0)
train_ix, val_ix = next(gss.split(X, y, groups=df['question_id']))
    
X_train = X.iloc[train_ix]
y_train = y.iloc[train_ix]

X_val = X.iloc[val_ix]
y_val = y.iloc[val_ix]

## Train Random Forest

In [18]:
clf = RandomForestClassifier(random_state=0, n_estimators=1000)
clf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=1000, random_state=0)

## Feature Importance

In [20]:
r = permutation_importance(clf, X_val, y_val, n_repeats=20, random_state=0)

for i in r.importances_mean.argsort()[::-1]:
    print(f"{features[i]:<8}"
    f"{r.importances_mean[i]:.3f}")

KeyboardInterrupt: 

## Predict is_accepted with Random Forest

In [22]:
y_pred = clf.predict(X_val)
print(y_pred)

[1 0 0 ... 0 0 7]


In [30]:
pred_df = df.iloc[val_ix][['question_id', 'answer_id', 'answer_score']]
pred_df['pred_answer_score'] = y_pred
pred_df['answer_rank'] = pred_df.groupby('question_id')['answer_score'].rank(ascending=False)
pred_df['pred_rank'] = pred_df.groupby('question_id')['pred_answer_score'].rank(ascending=False)
pred_df.head(25)

Unnamed: 0,question_id,answer_id,answer_score,pred_answer_score,answer_rank,pred_rank
90,22577,13023680,1,1,1.0,1.0
92,22577,23943,0,0,2.5,2.5
93,22577,22637,0,0,2.5,2.5
116,22590,22653,1,1,1.5,1.5
117,22590,22641,0,0,3.5,3.5
118,22590,22596,0,0,3.5,3.5
119,22590,22593,1,1,1.5,1.5
130,24644,24781,0,0,2.5,2.5
131,24644,24664,0,0,2.5,2.5
132,24644,24671,5,5,1.0,1.0


## Evaulate Random Forest Model

In [24]:
stats.spearmanr(pred_df['answer_rank'], pred_df['pred_rank'])

SpearmanrResult(correlation=0.9423676818704739, pvalue=0.0)

## Best answer in top 1

In [25]:
answer_rank = pred_df[pred_df['answer_rank'] == 1]
pred_rank_exact = pred_df[pred_df['pred_rank'] == 1]

In [26]:
len(answer_rank.merge(pred_rank_exact, how='inner', on='answer_id')) / len(pred_df['question_id'].unique())

0.7492795389048992

## Best answer in top 3 

In [27]:
pred_df['top_3'] = pred_df['pred_rank'].apply(lambda row: int(row < 4))
pred_rank_top3 = pred_df[pred_df['top_3'] == 1]

In [28]:
len(answer_rank.merge(pred_rank_top3, how='inner', on='answer_id')) / len(pred_df['question_id'].unique())

0.8530259365994236