# Models

## Import dependencies

In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from scipy import stats
from sklearn.inspection import permutation_importance
from sklearn.model_selection import GroupShuffleSplit

## Import dataset

In [7]:
df = pd.read_csv("datasets/features_with_politeness.csv", encoding="ISO-8859-1").dropna()

## Explore dataset

In [8]:
print(df.shape)
df.head()

(10162, 48)


Unnamed: 0,question_id,answer_id,a_score_rel_q_score,answer_score,entities_matches,reputation,reply_by_author,len_answer_text,is_accepted,code_snippet_count,...,Subjectivity,Bare.Command,WH.Questions,YesNo.Questions,Gratitude,Apology,Truth.Intensifier,Affirmation,Adverb.Just,Conjunction.Start
0,22528,22594,-6,5,1,17497.0,False,628,False,0,...,0,0,0,0,0,0,0,0,0,0
1,22528,22587,2,13,0,17787.0,False,699,True,0,...,0,0,0,0,0,0,1,0,0,1
2,22528,22578,-11,0,1,21216.0,False,338,False,1,...,0,2,0,0,0,0,0,0,1,0
3,22528,22551,-10,1,2,480242.0,False,422,False,0,...,0,0,0,0,0,0,0,0,0,0
4,22528,22537,-11,0,0,10583.0,False,318,False,0,...,0,1,1,1,0,0,0,0,0,0


## Split train-test

In [9]:
# What is answer_score?

features = ['a_score_rel_q_score', 'answer_score',
       'entities_matches', 'reputation', 'reply_by_author', 'len_answer_text', 'code_snippet_count', 'link_count', 'Hedges',
       'Positive.Emotion', 'Negative.Emotion',
       'Swearing', 'Negation', 'Informal.Title', 'Reasoning', 'Reassurance', 'Please', 'Agreement', 'Acknowledgement',
       'Subjectivity', 'Bare.Command', 'WH.Questions', 'YesNo.Questions',
       'Gratitude', 'Apology', 'Truth.Intensifier', 'Affirmation']

In [10]:
scaler = StandardScaler()
scaler.fit(df)
X = scaler.transform(df)
X = pd.DataFrame(X,columns=df.columns)
X

Unnamed: 0,question_id,answer_id,a_score_rel_q_score,answer_score,entities_matches,reputation,reply_by_author,len_answer_text,is_accepted,code_snippet_count,...,Subjectivity,Bare.Command,WH.Questions,YesNo.Questions,Gratitude,Apology,Truth.Intensifier,Affirmation,Adverb.Just,Conjunction.Start
0,-0.146259,-0.385637,0.162401,-0.056518,0.708131,-0.316037,-0.185487,0.402484,-0.544218,-0.535847,...,-0.320186,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,-0.325376,-0.169873,-0.396733,-0.407037
1,-0.146259,-0.385638,0.179703,0.014073,-0.425816,-0.314292,-0.185487,0.548692,1.837498,-0.535847,...,-0.320186,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,1.706875,-0.169873,-0.396733,1.119841
2,-0.146259,-0.385638,0.151587,-0.100637,0.708131,-0.293650,-0.185487,-0.194703,-0.544218,0.540934,...,-0.320186,2.417250,-0.194408,-0.276662,-0.114404,-0.083912,-0.325376,-0.169873,1.412784,-0.407037
3,-0.146259,-0.385640,0.153750,-0.091813,1.842078,2.469529,-0.185487,-0.021724,-0.544218,-0.535847,...,-0.320186,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,-0.325376,-0.169873,-0.396733,-0.407037
4,-0.146259,-0.385641,0.151587,-0.100637,-0.425816,-0.357657,-0.185487,-0.235888,-0.544218,-0.535847,...,-0.320186,1.001615,3.265442,1.337252,-0.114404,-0.083912,-0.325376,-0.169873,-0.396733,-0.407037
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10157,-0.040083,-0.354761,-0.363156,0.075840,-0.425816,0.002162,-0.185487,-0.328555,-0.544218,-0.535847,...,2.126227,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,1.706875,-0.169873,-0.396733,-0.407037
10158,-0.040082,-0.354757,0.175378,-0.082989,0.708131,0.240396,-0.185487,1.065568,1.837498,-0.535847,...,4.572639,-0.414020,-0.194408,1.337252,-0.114404,-0.083912,5.771376,-0.169873,-0.396733,2.646720
10159,-0.040082,-0.354759,0.173215,-0.091813,0.708131,7.138310,-0.185487,-0.202940,-0.544218,-0.535847,...,2.126227,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,-0.325376,-0.169873,-0.396733,-0.407037
10160,-0.040073,-0.354755,0.166727,-0.030046,-0.425816,-0.071838,-0.185487,-0.182347,1.837498,0.540934,...,-0.320186,-0.414020,-0.194408,-0.276662,-0.114404,-0.083912,-0.325376,-0.169873,-0.396733,1.119841


In [11]:
X = X[features]
y = df['answer_score']

gss = GroupShuffleSplit(n_splits=2, train_size=0.8, random_state=0)
train_ix, val_ix = next(gss.split(X, y, groups=df['question_id']))
    
X_train = X.iloc[train_ix]
y_train = y.iloc[train_ix]

X_val = X.iloc[val_ix]
y_val = y.iloc[val_ix]

NameError: name 'GroupShuffleSplit' is not defined

## Train Random Forest

In [12]:
clf = RandomForestClassifier(random_state=0, n_estimators=1000)
clf.fit(X_train, y_train)

RandomForestClassifier(n_estimators=1000, random_state=0)

## Feature Importance

In [None]:
r = permutation_importance(svr, X_val, y_val, n_repeats=20, random_state=0)

for i in r.importances_mean.argsort()[::-1]:
    print(f"{features[i]:<8}"
    f"{r.importances_mean[i]:.3f}")

## Predict is_accepted with Random Forest

In [13]:
y_pred2 = clf.predict(rfX_test)
print(y_pred2)

[3. 6. 2. ... 5. 2. 2.]
1365    5.0
2384    7.0
9301    3.0
4420    7.0
5166    1.0
       ... 
1887    3.0
5482    2.0
5737    7.0
4705    3.0
5085    3.0
Name: rank, Length: 2033, dtype: float64


In [None]:
pred_df = df.iloc[val_ix][['question_id', 'answer_id', 'answer_score']]
pred_df['pred_answer_score'] = y_pred
pred_df['answer_rank'] = pred_df.groupby('question_id')['answer_score'].rank(ascending=False)
pred_df['pred_rank'] = pred_df.groupby('question_id')['pred_answer_score'].rank(ascending=False)
pred_df

## Evaulate Random Forest Model

In [14]:
print(classification_report(list(rfy_test), list(y_pred2)))

              precision    recall  f1-score   support

         1.0       0.74      0.82      0.78       523
         2.0       0.52      0.62      0.56       505
         3.0       0.38      0.40      0.39       305
         4.0       0.25      0.22      0.24       212
         5.0       0.23      0.18      0.20       130
         6.0       0.22      0.16      0.19        85
         7.0       0.16      0.11      0.13        61
         8.0       0.13      0.10      0.11        40
         9.0       0.19      0.16      0.17        37
        10.0       0.34      0.31      0.33        35
        11.0       0.24      0.18      0.20        28
        12.0       0.00      0.00      0.00        18
        13.0       0.12      0.10      0.11        10
        14.0       0.00      0.00      0.00         5
        15.0       1.00      0.07      0.13        14
        16.0       0.00      0.00      0.00         7
        17.0       1.00      0.20      0.33         5
        18.0       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
