In [None]:
from collections import Counter

import numpy as np
import pandas as pd
import spacy
from scipy.stats import ttest_ind
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [None]:
# spaCy pipeline used to parse every essay further down.
nlp = spacy.load('en_core_web_sm')

# Label columns of the essays file; each holds 'y' / 'n' in the raw CSV.
traits = ['cEXT', 'cNEU', 'cAGR', 'cCON', 'cOPN']

# Numeric encoding applied to every label column.
transformation = { 'n': 0, 'y': 1 }

# Read the essays and recode the label columns from 'y'/'n' to 1/0.
dataset = pd.read_csv('datasets/essays.csv', encoding='latin-1').replace(
    {trait_column: transformation for trait_column in traits}
)

In [None]:
# Per-column count of essays labelled positive for both cOPN and cAGR.
dataset.loc[dataset['cOPN'].eq(1) & dataset['cAGR'].eq(1)].count()

#AUTHID    686
TEXT       686
cEXT       686
cNEU       686
cAGR       686
cCON       686
cOPN       686
dtype: int64

In [None]:
# Class balance: how many essays carry each label value, per trait.
total_number = len(dataset)
print(f'Number of essays {total_number}')

per_label_split = []

for trait in traits:
  # Tally label values once instead of scanning the column per value.
  label_counts = dataset[trait].value_counts()
  negative_sum = int(label_counts.get(0, 0))
  positive_sum = int(label_counts.get(1, 0))

  negative_cell = f'{negative_sum} ({100 * negative_sum / total_number:.2f}%)'
  positive_cell = f'{positive_sum} ({100 * positive_sum / total_number:.2f}%)'
  per_label_split.append([trait, negative_cell, positive_cell])

display(pd.DataFrame(per_label_split, columns=['Trait', 'Negative examples', 'Positive examples']))

Number of essays 2467


Unnamed: 0,Trait,Negative examples,Positive examples
0,cEXT,1191 (48.28%),1276 (51.72%)
1,cNEU,1234 (50.02%),1233 (49.98%)
2,cAGR,1157 (46.90%),1310 (53.10%)
3,cCON,1214 (49.21%),1253 (50.79%)
4,cOPN,1196 (48.48%),1271 (51.52%)


In [None]:
# Parse every essay once; all feature extraction below reuses these parsed documents.
docs = list(map(nlp, dataset['TEXT']))

In [None]:
def extract_pos_tags_counts(doc, simple_tags=True):
  """Count the part-of-speech tags of the tokens in `doc`.

  Reads each token's `pos_` attribute when `simple_tags` is truthy and its
  `tag_` attribute otherwise. Returns a Counter mapping tag -> occurrences.
  """
  attribute = 'pos_' if simple_tags else 'tag_'
  return Counter(getattr(token, attribute) for token in doc)

def extract_named_entity_counts(doc):
  """Count the entity labels (`label_`) found in `doc.ents`.

  Returns a Counter mapping entity label -> occurrences.
  """
  labels = [entity.label_ for entity in doc.ents]
  return Counter(labels)

def extract_lemma_counts(doc):
  """Count the lemmas (`lemma_`) of the tokens in `doc`.

  Returns a Counter mapping lemma -> occurrences.
  """
  lemma_counts = Counter()
  for token in doc:
    lemma_counts[token.lemma_] += 1
  return lemma_counts

def normalize_by_sum(features):
  """Scale every value in `features` by the sum of all values.

  Returns a new dict with the same keys; the input is left untouched.
  An empty mapping yields an empty dict.
  """
  total_sum = sum(features.values())

  normalized = {}
  for key, value in features.items():
    normalized[key] = value / total_sum
  return normalized

def normalize_by_constant(features, constant):
  """Divide every value in `features` by `constant`.

  Returns a new dict with the same keys; the input is left untouched.
  """
  scaled = {}
  for key, value in features.items():
    scaled[key] = value / constant
  return scaled

def prefix_keys(dictionary, prefix):
  """Return a copy of `dictionary` whose keys are `prefix` followed by the original key.

  Keys are rendered through an f-string, so non-string keys become strings.
  """
  prefixed = {}
  for key, value in dictionary.items():
    prefixed[f'{prefix}{key}'] = value
  return prefixed

def find_mean_counts():
  """Print, per trait, the mean named-entity type counts of its positive essays.

  For every trait in `traits`, the essays labelled 1 for that trait are parsed
  with `nlp`, their entity-label counts are summed, and each total is divided
  by the number of essays in that positive subset. The trait name and the
  resulting {entity label: mean count per essay} dict are printed.

  Note: the essays are re-parsed on every call, which is slow for large subsets.
  """
  for t in traits:
    d_temp = dataset.loc[dataset[t] == 1]
    docs = [nlp(text) for text in d_temp['TEXT']]

    # Accumulate in place; repeated `Counter + Counter` copies the running total each time.
    cnt = Counter()
    for d in docs:
      cnt.update(extract_named_entity_counts(d))

    # The mean is taken over the essays that were actually counted (the positive
    # subset), not over the whole dataset, so traits with different numbers of
    # positive essays stay comparable.
    n_docs = len(docs)
    means = {k: total / n_docs for k, total in cnt.items()} if n_docs else {}

    print(t)
    print(means)

def t_test(pos_type):
  """Print pairwise Welch t-tests of one PoS tag's per-essay count between traits.

  For every trait in `traits`, the essays labelled 1 for that trait are parsed
  with `nlp` and the count of `pos_type` (a simple `pos_` tag) is taken per
  essay. Every pair of traits is then compared with `ttest_ind(...,
  equal_var=False)` and the result is printed.

  `ttest_ind` comes from `scipy.stats` and must be imported in the notebook's
  import cell.

  NOTE(review): an essay can be positive for several traits at once, so the
  compared samples overlap and are not strictly independent; read the
  p-values with that in mind.
  """
  distributions = []
  for t in traits:
    d_temp = dataset.loc[dataset[t] == 1]
    docs = [nlp(text) for text in d_temp['TEXT']]

    # A Counter returns 0 for tags that never occur in an essay.
    distributions.append([extract_pos_tags_counts(d)[pos_type] for d in docs])

  print(pos_type)
  for i in range(len(distributions)):
    for j in range(i + 1, len(distributions)):
      # Welch's test accepts samples of different sizes, so use every essay
      # rather than truncating both samples to the shorter one.
      res = ttest_ind(distributions[i], distributions[j], equal_var=False)
      print(f'{traits[i]} vs {traits[j]}')
      print(res)

In [None]:
def evaluate_features_and_model(features, model_generator):
  """Train and score a fresh model per trait on one feature representation.

  Parameters:
    features: one feature entry per essay, aligned with the rows of `dataset`.
    model_generator: zero-argument callable returning an unfitted estimator
      exposing `fit` and `predict`.

  For each trait in `traits` the data is split 75/25 (stratified on the label,
  random_state=42), a new model is fitted on the training part and scored on
  the held-out part.

  Returns:
    A flat list: the accuracy for every trait (in `traits` order) followed by
    the binary F1 score for every trait (same order).
  """
  accuracies = []
  fscores = []

  for trait in traits:
    labels = dataset[trait]
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.25, random_state=42, stratify=labels)

    model = model_generator()
    model.fit(X_train, y_train)
    pred = model.predict(X_test)

    _, _, fscore, _ = precision_recall_fscore_support(y_test, pred, average='binary')
    accuracies.append(float(accuracy_score(y_test, pred)))
    fscores.append(float(fscore))

  return accuracies + fscores


In [None]:
def model_generator():
  """Build the default unfitted pipeline used for the dict-based feature sets.

  Steps: vectorize the feature dicts densely, keep the 10 best-scoring
  columns, standardize them, and classify with an SVC (gamma='auto').
  """
  return make_pipeline(
      DictVectorizer(sparse=False),
      SelectKBest(k=10),
      StandardScaler(),
      SVC(gamma='auto'),
  )

In [None]:
def _build_feature_sets(parsed_docs):
  """Build every hand-crafted feature representation, keyed by its display name.

  Each value is a list with one feature dict per document, in document order.
  Raw counts are extracted once per document and reused for the normalized
  and combined variants.
  """
  lemma_counts = [dict(extract_lemma_counts(doc)) for doc in parsed_docs]
  pos_basic = [dict(extract_pos_tags_counts(doc)) for doc in parsed_docs]
  pos_detailed = [dict(extract_pos_tags_counts(doc, False)) for doc in parsed_docs]
  entities = [dict(extract_named_entity_counts(doc)) for doc in parsed_docs]
  sentence_counts = [len(list(doc.sents)) for doc in parsed_docs]

  def by_sum(counts):
    # Share of each key within its own document.
    return [normalize_by_sum(document_counts) for document_counts in counts]

  def by_sentences(counts):
    # Mean occurrences per sentence of the document.
    return [normalize_by_constant(document_counts, n_sentences)
            for document_counts, n_sentences in zip(counts, sentence_counts)]

  def merged(left, right):
    # Per-document union of two feature dicts (right wins on a key clash).
    return [{**first, **second} for first, second in zip(left, right)]

  return {
      'Bag of words': lemma_counts,
      'PoS tag counts (simple, non-normalized)': pos_basic,
      'PoS tag counts (detailed, non-normalized)': pos_detailed,
      'Named entity type counts (non-normalized)': entities,
      'PoS + NE type counts (simple, non-normalized)': merged(pos_basic, entities),
      'PoS + NE type counts (detailed, non-normalized)': merged(pos_detailed, entities),
      'PoS (basic, normalized by sum)': by_sum(pos_basic),
      'PoS (detailed, normalized by sum)': by_sum(pos_detailed),
      'NE (normalized by sum)': by_sum(entities),
      'PoS + NE (basic, normalized by sum)': merged(by_sum(pos_basic), by_sum(entities)),
      'PoS + NE (detailed, normalized by sum)': merged(by_sum(pos_detailed), by_sum(entities)),
      'PoS (basic, normalized by sentence count)': by_sentences(pos_basic),
      'PoS (detailed, normalized by sentence count)': by_sentences(pos_detailed),
      'NE (normalized by sentence count)': by_sentences(entities),
      'PoS + NE (basic, normalized by sentence count)': merged(by_sentences(pos_basic), by_sentences(entities)),
      'PoS + NE (detailed, normalized by sentence count)': merged(by_sentences(pos_detailed), by_sentences(entities)),
  }

features = _build_feature_sets(docs)
# Two reference rows first: a constant "always positive" classifier and an SVC
# on spaCy's document vectors. Both are fed the same document vectors.
doc_vectors = [doc.vector for doc in docs]

results = [
    ["Always true baseline", *evaluate_features_and_model(doc_vectors, lambda: DummyClassifier(strategy='constant', constant=1))],
    ["Document vector", *evaluate_features_and_model(doc_vectors, lambda: make_pipeline(StandardScaler(), SVC(gamma='auto')))],
]

# Then one row per hand-crafted feature set, all scored with the default pipeline.
for name, feature in features.items():
  scores_for_feature = evaluate_features_and_model(feature, model_generator)
  results.append([name, *scores_for_feature])

# Column order mirrors evaluate_features_and_model: all accuracies, then all F1 scores.
metric_columns = [f'{t}-acc' for t in traits] + [f'{t}-f1' for t in traits]
score = pd.DataFrame(results, columns=['feature'] + metric_columns)

print('# Accuracy and F1 scores for different features')
display(score)

# For every trait, pick the row of `score` with the highest accuracy
# (the first such row on ties) and report its feature name, F1 and accuracy.
best_features = []

for trait in traits:
  accuracy_column = f'{trait}-acc'
  max_index = score[accuracy_column].idxmax()

  best_features.append([
      trait,
      score.at[max_index, 'feature'],
      score.at[max_index, f'{trait}-f1'],
      score.at[max_index, accuracy_column],
  ])

best_features_frame = pd.DataFrame(best_features, columns=['trait', 'feature', 'f1', 'acc'])

print('# Features with greatest accuracy for each trait')
display(best_features_frame)

# Accuracy and F1 scores for different features


Unnamed: 0,feature,cEXT-acc,cNEU-acc,cAGR-acc,cCON-acc,cOPN-acc,cEXT-f1,cNEU-f1,cAGR-f1,cCON-f1,cOPN-f1
0,Always true baseline,0.517018,0.49919,0.531605,0.507293,0.515397,0.681624,0.665946,0.69418,0.673118,0.680214
1,Document vector,0.534846,0.520259,0.512156,0.546191,0.615883,0.565809,0.514754,0.582524,0.574468,0.638168
2,Bag of words,0.536467,0.539708,0.507293,0.589951,0.594814,0.592593,0.455939,0.598945,0.570458,0.645892
3,"PoS tag counts (simple, non-normalized)",0.539708,0.513776,0.523501,0.515397,0.567261,0.62533,0.462366,0.641463,0.558346,0.575517
4,"PoS tag counts (detailed, non-normalized)",0.538088,0.529984,0.515397,0.507293,0.554295,0.614344,0.474638,0.639324,0.518987,0.518389
5,Named entity type counts (non-normalized),0.538088,0.502431,0.505673,0.528363,0.549433,0.624506,0.518053,0.644107,0.541732,0.618132
6,"PoS + NE type counts (simple, non-normalized)",0.555916,0.495948,0.491086,0.531605,0.606159,0.642298,0.461005,0.619855,0.544882,0.644217
7,"PoS + NE type counts (detailed, non-normalized)",0.52188,0.529984,0.523501,0.567261,0.570502,0.608234,0.474638,0.633416,0.588598,0.557596
8,"PoS (basic, normalized by sum)",0.546191,0.497569,0.529984,0.539708,0.567261,0.593023,0.498382,0.643735,0.606648,0.611354
9,"PoS (detailed, normalized by sum)",0.551053,0.523501,0.512156,0.487844,0.560778,0.632138,0.533333,0.576653,0.540698,0.557912


# Features with greatest accuracy for each trait


Unnamed: 0,trait,feature,f1,acc
0,cEXT,"PoS + NE (detailed, normalized by sum)",0.635488,0.570502
1,cNEU,Bag of words,0.455939,0.539708
2,cAGR,Always true baseline,0.69418,0.531605
3,cCON,Bag of words,0.570458,0.589951
4,cOPN,Document vector,0.638168,0.615883


In [None]:
# Render the per-trait winners as a LaTeX tabular; as the cell's last
# expression, the generated string is shown as the cell output.
best_features_frame.to_latex()

'\\begin{tabular}{lllrr}\n\\toprule\n{} & trait &                                 feature &        f1 &       acc \\\\\n\\midrule\n0 &  cEXT &  PoS + NE (detailed, normalized by sum) &  0.635488 &  0.570502 \\\\\n1 &  cNEU &                            Bag of words &  0.455939 &  0.539708 \\\\\n2 &  cAGR &                    Always true baseline &  0.694180 &  0.531605 \\\\\n3 &  cCON &                            Bag of words &  0.570458 &  0.589951 \\\\\n4 &  cOPN &                         Document vector &  0.638168 &  0.615883 \\\\\n\\bottomrule\n\\end{tabular}\n'

In [None]:
def grid_search_model_generator():
  """Build an unfitted grid search over the dict-feature SVC pipeline.

  The pipeline vectorizes feature dicts densely, selects the k best columns,
  standardizes them and classifies with an SVC. The grid varies the number of
  selected columns and the SVC's C, kernel and gamma.

  Returns:
    A GridSearchCV instance (all cores, verbose=3) ready to be fitted.
  """
  param_grid = dict(
      selectkbest__k=[5, 10],
      svc__C=np.logspace(-1, 1, 3),
      svc__kernel=['linear', 'poly', 'rbf'],
      svc__gamma=['auto', 'scale'],
  )

  estimator = make_pipeline(DictVectorizer(sparse=False), SelectKBest(), StandardScaler(), SVC())

  return GridSearchCV(estimator, param_grid, n_jobs=-1, verbose=3)

def _combined_features(doc):
  """Merge lemma counts with three prefixed variants of the detailed PoS + entity counts.

  The variants are the raw counts ('unnormalized_'), the counts normalized by
  their sum ('percentage_') and the counts divided by the document's sentence
  count ('mean_per_sentence_').
  """
  pos_counts = dict(extract_pos_tags_counts(doc, False))
  entity_counts = dict(extract_named_entity_counts(doc))
  sentence_count = len(list(doc.sents))

  combined = dict(extract_lemma_counts(doc))
  combined.update(prefix_keys(
      {**pos_counts, **entity_counts},
      'unnormalized_'))
  combined.update(prefix_keys(
      {**normalize_by_sum(pos_counts), **normalize_by_sum(entity_counts)},
      'percentage_'))
  combined.update(prefix_keys(
      {**normalize_by_constant(pos_counts, sentence_count), **normalize_by_constant(entity_counts, sentence_count)},
      'mean_per_sentence_'))
  return combined

# One combined feature dict per essay.
feature = [_combined_features(doc) for doc in docs]

# The vectorizer does not use the labels, so its vocabulary and matrix are the
# same for every trait: build them once instead of once per trait.
dict2vec = DictVectorizer().fit(feature)
feature_matrix = dict2vec.transform(feature)
names = dict2vec.feature_names_

# NOTE(review): the ranking below is fitted on all essays, including those that
# later end up in the test split, so the scores reported for these selected
# features are optimistic. Consider ranking on the training split only.
new_features = []
for trait in traits:
  kbest = SelectKBest().fit(feature_matrix, dataset[trait])

  scores = kbest.scores_
  pvalues = kbest.pvalues_

  # Rank features from most to least discriminative for this trait
  # (rows with a NaN score are sorted last).
  ranking = pd.DataFrame({
      'name': names,
      'score': scores,
      'pvalue': pvalues
  }).sort_values(by=['score', 'pvalue'], ascending=[False, True])

  # Keep the 20 best-ranked features. `names` is in the vectorizer's own
  # (alphabetical) order, so slicing it directly would pick arbitrary features
  # rather than the best ones shown in the table below.
  top_names = ranking['name'].head(20).tolist()
  new_features.append([{k: f[k] for k in top_names if k in f} for f in feature])

  print(trait)
  display(ranking.head(20))

# Score the default pipeline on each trait's own selected feature set,
# using the same 75/25 stratified split as evaluate_features_and_model.
results = []
# Distinct loop names: reusing `feature` here would overwrite the notebook-level
# combined `feature` list and leave later code working on the wrong data.
for trait_name, trait_features in zip(traits, new_features):
  labels = dataset[trait_name]
  X_train, X_test, y_train, y_test = train_test_split(trait_features, labels, test_size=0.25, random_state=42, stratify=labels)

  model = model_generator()
  model.fit(X_train, y_train)

  pred = model.predict(X_test)
  acc = accuracy_score(y_test, pred)
  prf = precision_recall_fscore_support(y_test, pred, average='binary')
  # prf is (precision, recall, fscore, support); only the F1 score is reported.
  results.append([trait_name, acc, prf[2]])

# NOTE(review): the first column is labelled 'feature' but holds the trait
# name; the label is kept as-is in case later cells read score['feature'].
score = pd.DataFrame(results, columns=['feature', 'acc', 'f1'])

print('# Accuracy and F1 scores for best features')
display(score)

#display(evaluate_features_and_model(
#    feature, 
#    grid_search_model_generator
#))

cEXT


Unnamed: 0,name,score,pvalue
26086,sorority,23.19031,2e-06
15512,fun,19.663663,1e-05
22361,perhaps,19.204119,1.2e-05
10017,boyfriend,16.167508,6e-05
19862,mean_per_sentence_PRP,15.927449,6.8e-05
19877,mean_per_sentence_VBP,13.982336,0.000189
25857,so,13.937567,0.000193
22276,percentage_.,12.484425,0.000418
8205,all,12.408623,0.000435
15678,generally,12.312552,0.000458


cNEU


Unnamed: 0,name,score,pvalue
21083,not,22.697085,2e-06
9410,beat,21.763796,3e-06
19832,mean_per_sentence_DT,20.41829,7e-06
24873,scared,20.393448,7e-06
17127,hurt,18.216863,2e-05
28899,unnormalized_VBP,17.85653,2.5e-05
14340,everything,16.608041,4.7e-05
22284,percentage_DT,16.477806,5.1e-05
16449,hate,16.157224,6e-05
29505,want,16.107376,6.2e-05


cAGR


Unnamed: 0,name,score,pvalue
14700,family,17.925416,2.4e-05
26755,stupid,15.150077,0.000102
30140,worried,14.65419,0.000132
20997,no,13.635758,0.000227
21100,nothing,12.694646,0.000374
29559,waste,12.568196,0.0004
26843,suck,11.874212,0.000579
15483,fucking,11.7892,0.000606
15177,fool,11.416183,0.000739
15479,fuck,11.27956,0.000796


cCON


Unnamed: 0,name,score,pvalue
28853,unnormalized_DATE,30.151966,4.403754e-08
19831,mean_per_sentence_DATE,24.36989,8.481386e-07
22283,percentage_DATE,23.691959,1.201977e-06
29638,week,17.971624,2.32479e-05
26721,student,17.334195,3.242611e-05
28047,tonight,16.27193,5.653954e-05
7650,able,15.83704,7.103108e-05
27975,today,15.001619,0.0001102155
10767,chance,14.144622,0.0001732208
22323,percentage_TO,13.709888,0.0002180088


cOPN


Unnamed: 0,name,score,pvalue
19831,mean_per_sentence_DATE,45.286764,2.109144e-11
15854,go,40.850357,1.958773e-10
22323,percentage_TO,39.779853,3.358289e-10
28853,unnormalized_DATE,38.105557,7.812635e-10
16834,home,35.550844,2.841205e-09
11053,class,35.023902,3.709766e-09
16862,homework,30.515209,3.660027e-08
22311,percentage_PERSON,30.161947,4.381422e-08
12091,create,28.816479,8.700379e-08
22274,percentage_-LRB-,28.16888,1.210981e-07


# Accuracy and F1 scores for best features


Unnamed: 0,feature,acc,f1
0,cEXT,0.515397,0.678149
1,cNEU,0.508914,0.183288
2,cAGR,0.534846,0.690399
3,cCON,0.513776,0.630542
4,cOPN,0.520259,0.680346
