In [None]:
#!apt install -qq enchant
#!pip install pyenchant
#!pip install pyspellchecker
#!pip install transformers
#!pip install conllu
#!pip install ufal.udpipe

import html
import os
import pickle
import re
from math import ceil

import matplotlib.pyplot as plt
import nltk
#nltk.download('punkt')
import numpy as np
import pandas as pd
import torch
from gensim.models import KeyedVectors
from nltk import word_tokenize
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.manifold import TSNE
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm_notebook
from tqdm.notebook import tqdm
from transformers import BertModel, BertTokenizer, GPT2Model, GPT2Tokenizer

from google.colab import drive

In [None]:
!pip uninstall scikit-learn
!pip install scikit-learn==0.21.2

Found existing installation: scikit-learn 0.22.2.post1
Uninstalling scikit-learn-0.22.2.post1:
  Would remove:
    /usr/local/lib/python3.7/dist-packages/scikit_learn-0.22.2.post1.dist-info/*
    /usr/local/lib/python3.7/dist-packages/sklearn/*
Proceed (y/n)? y
  Successfully uninstalled scikit-learn-0.22.2.post1
Collecting scikit-learn==0.21.2
  Downloading scikit_learn-0.21.2-cp37-cp37m-manylinux1_x86_64.whl (6.7 MB)
[K     |████████████████████████████████| 6.7 MB 2.7 MB/s 
Installing collected packages: scikit-learn
Successfully installed scikit-learn-0.21.2


In [None]:
!git clone https://github.com/lcl-hse/lab_inspector.git

Cloning into 'lab_inspector'...
remote: Enumerating objects: 21918, done.[K
remote: Counting objects: 100% (34/34), done.[K
remote: Compressing objects: 100% (31/31), done.[K
remote: Total 21918 (delta 5), reused 27 (delta 2), pack-reused 21884[K
Receiving objects: 100% (21918/21918), 93.60 MiB | 23.07 MiB/s, done.
Resolving deltas: 100% (5494/5494), done.
Checking out files: 100% (26120/26120), done.


In [None]:
# Show the top-level entries of the cloned repository.
os.listdir('lab_inspector')

['R', 'Inspector', 'README.md', '.git']

In [None]:
class MyBatchIterator:
  """Iterate over a sliceable sequence in consecutive batches.

  Every batch is the slice ``texts[start:start + batch_size]``; the last batch
  may be shorter. ``len()`` returns the number of batches.

  Args:
    texts: a sequence supporting ``len()`` and slicing (e.g. a list of strings).
    batch_size: positive number of items per batch.

  Raises:
    ValueError: if ``batch_size`` is not positive (it would otherwise never
      advance, or divide by zero in ``len()``).
  """
  def __init__(self, texts, batch_size):
    if batch_size <= 0:
      raise ValueError(f"batch_size must be positive, got {batch_size!r}")
    self.texts = texts
    self.batch_size = batch_size
    # Set here as well so that next() works even before iter() was called.
    self.start = 0

  def __iter__(self):
    # Rewind, so the same object can be iterated several times in a row.
    self.start = 0
    return self

  def __next__(self):
    if self.start >= len(self.texts):
      raise StopIteration
    batch = self.texts[self.start:self.start+self.batch_size]
    self.start += self.batch_size
    return batch

  def __len__(self):
    # Integer ceil division: exact for any size, no float round-trip.
    return -(-len(self.texts) // self.batch_size)

class BERTEmbedder:
  """Turn texts into one vector per text with a pretrained BERT model.

  Args:
    model_name: name or path passed to ``BertTokenizer.from_pretrained`` and
      ``BertModel.from_pretrained``.
  """
  def __init__(self, model_name):
    self.tokenizer = BertTokenizer.from_pretrained(model_name)
    self.model = BertModel.from_pretrained(model_name)

  def process(self, texts, flatten_method='pooler'):
    """Embed one batch of texts.

    Args:
      texts: list of strings; they are padded to a common length and truncated
        to at most 128 tokens.
      flatten_method: ``'pooler'`` returns the model's ``pooler_output``;
        ``'average'`` returns the mean of ``last_hidden_state`` over axis 1.

    Returns:
      A numpy array with one row per text.

    Raises:
      ValueError: if ``flatten_method`` is not one of the two supported values
        (previously this silently returned ``None``).
    """
    if flatten_method not in ('average', 'pooler'):
      raise ValueError(
        f"unknown flatten_method {flatten_method!r}; expected 'average' or 'pooler'"
      )

    tokenized = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=128)
    ids = tokenized['input_ids']
    mask = tokenized['attention_mask']
    # Inference only: skip building the autograd graph to save memory.
    with torch.no_grad():
      processed = self.model(input_ids=ids, attention_mask=mask)

    if flatten_method == 'average':
      # NOTE(review): this plain mean also covers padded positions, so a row can
      # depend on the other texts in its batch. Left unchanged on purpose --
      # presumably the downstream models were fitted on vectors computed this
      # way; confirm before switching to a mask-weighted mean.
      return processed['last_hidden_state'].detach().numpy().mean(axis=1)
    return processed['pooler_output'].detach().numpy()

  def process_sample(self, texts, batch_size=4, flatten_method='pooler'):
    """Embed all ``texts`` batch by batch and stack the results.

    Args:
      texts: sliceable sequence of strings.
      batch_size: number of texts sent through the model at once.
      flatten_method: forwarded to ``process``.

    Returns:
      The per-batch arrays concatenated along axis 0. ``np.concatenate`` raises
      ``ValueError`` when ``texts`` is empty.
    """
    text_iter = MyBatchIterator(texts, batch_size=batch_size)
    batches = [
      self.process(batch, flatten_method=flatten_method)
      for batch in tqdm(text_iter, total=len(text_iter))
    ]
    return np.concatenate(batches, axis=0)

class GPT2Embedder(BERTEmbedder):
  """BERTEmbedder counterpart backed by GPT-2.

  Only ``flatten_method='average'`` is usable: ``GPT2Model`` does not return a
  ``pooler_output``, so the inherited default ``'pooler'`` cannot work here.

  Args:
    model_name: name or path passed to ``GPT2Tokenizer.from_pretrained`` and
      ``GPT2Model.from_pretrained``.
  """
  def __init__(self, model_name):
    self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    # Reuse the EOS token as the padding token so that padding=True works below.
    self.tokenizer.pad_token = self.tokenizer.eos_token
    self.model = GPT2Model.from_pretrained(model_name)

  def process(self, texts, flatten_method='pooler'):
    """Embed one batch of texts as the mean of GPT-2's last hidden state.

    Args:
      texts: list of strings; padded to a common length and truncated to at
        most 128 tokens.
      flatten_method: must be ``'average'``. The default is kept for signature
        compatibility with ``BERTEmbedder.process``.

    Returns:
      A numpy array with one row per text (mean of ``last_hidden_state`` over
      axis 1).

    Raises:
      ValueError: for ``'pooler'`` (previously a ``KeyError`` raised only after
        the forward pass) and for any other unsupported value (previously a
        silent ``None``).
    """
    if flatten_method == 'pooler':
      raise ValueError(
        "GPT2Model has no 'pooler_output'; call with flatten_method='average'"
      )
    if flatten_method != 'average':
      raise ValueError(
        f"unknown flatten_method {flatten_method!r}; expected 'average'"
      )

    tokenized = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=128)
    ids = tokenized['input_ids']
    mask = tokenized['attention_mask']
    # Inference only: skip building the autograd graph to save memory.
    with torch.no_grad():
      processed = self.model(input_ids=ids, attention_mask=mask)

    # NOTE(review): the mean includes padded positions, as in the original
    # code; kept so the vectors stay comparable with those used before.
    return processed['last_hidden_state'].detach().numpy().mean(axis=1)

In [None]:
# Mount Google Drive at /content/drive (works only inside Google Colab).
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the ZeroLevel.csv table from the mounted Drive folder.
zero_level = pd.read_csv("/content/drive/My Drive/QuestionLevelPrediction/ZeroLevel.csv")

In [None]:
# Display the loaded table (pandas truncates long frames in the rendered output).
zero_level

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,Sentence (original),Sentence (changed),Right answer,Error type,Error type+,Level (Hand),path_in_REALEC,Sentence,Error span,LevelAuto
0,4,6,50904,If in North Africa and South Asia there <b>is...,If in North Africa and South Asia there <b>is...,were,Agreement_errors //Tense_choice,Agreement_errors,3,https://realec.org/index.xhtml#/exam/Exam2017/...,If in North Africa and South Asia there <b>is...,is,0
1,16,19,39505,"Overall, the chart gives information that aro...","Overall, the chart gives information that aro...",a bigger proportion,Comparison_degree,Comparison_degree,3,https://realec.org/index.xhtml#/exam/Exam2016/...,"Overall, the chart gives information that aro...",the biggest proportion,0
2,18,21,17497,The situation in Africa <b>is decreased</b> a...,The situation in Africa <b>was decreased</b> ...,worsened,Voice //lex_item_choice,Voice,3,https://realec.org/index.xhtml#/exam/Exam2016/...,The situation in Africa <b>was decreased</b> ...,was decreased,0
3,20,23,69626,"But on the other hand, there are a lot of <b>...",,people who,Relative_clause,Relative_clause,3,https://realec.org/index.xhtml#/exam/Exam2017/...,"But on the other hand, there are a lot of <b>...","people, who",0
4,25,29,56370,<b>Exploration the space</b> can lead to the ...,<b>Exploration the space</b> can lead to new ...,Space exploration,Word_order //Article,Word_order,3,https://realec.org/index.xhtml#/exam/Exam2017/...,<b>Exploration the space</b> can lead to new ...,Exploration the space,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
108,718,813,55758,The most significant difference took place fo...,The most significant difference was registere...,three times as often as,Comparative_constr,Tense_choice,3,https://realec.org/index.xhtml#/exam/Exam2017/...,The most significant difference was registere...,three times more often than,0
109,730,826,4617,<b>USA's growth of elderly people percentage<...,,The growth in the percentage of elderly people...,Structure_confusion,Word_order,3,https://realec.org/index.xhtml#/exam/Exam2014/...,<b>USA's growth of elderly people percentage<...,USA's growth of elderly people percentage,0
110,737,834,79712,The given pie charts present the major reasons...,The given pie charts present the major reasons...,a ten-year period,Structure_confusion //Article,Possessive,3,https://realec.org/index.xhtml#/exam/Exam2019/...,The given pie charts present the major reasons...,the ten year's period,0
111,741,838,80835,"Firstly, I believe that space could <b>wait</...","Firstly, I believe that space could wait <b>a...",until,Linking_device OR Conjunctions,Prepositions,3,https://realec.org/index.xhtml#/exam/Exam2017/...,"Firstly, I believe that space could wait <b>a...","a time, when",0


In [None]:
# Absolute location of the pickled artifacts. A cwd-relative path stops working
# once a later cell calls os.chdir, which made this cell non-re-runnable.
MODEL_DIR = "/content/drive/My Drive/QuestionLevelPrediction"

def load_pickle(file_name):
  """Return the object unpickled from ``MODEL_DIR/file_name``.

  SECURITY: pickle.load can execute arbitrary code stored in the file, so only
  load artifacts that come from a trusted source.
  """
  with open(os.path.join(MODEL_DIR, file_name), 'rb') as inp:
    return pickle.load(inp)

# NOTE(review): the two embedders are presumably instances of the
# BERTEmbedder / GPT2Embedder classes defined above, in which case that cell
# has to be run before this one for unpickling to succeed -- confirm.
bert_embedder = load_pickle('BertEmbedder.pickle')
gpt2_embedder = load_pickle('GPT2Embedder.pickle')
bernoulli_bert = load_pickle('BernoulliBERT.pickle')
bernoulli_gpt2 = load_pickle('BernoulliGPT2.pickle')

In [None]:
# Absolute path: the relative one breaks after the later os.chdir cell.
# SECURITY: pickle.load can execute arbitrary code; load trusted files only.
with open('/content/drive/My Drive/QuestionLevelPrediction/InspectorTree.pickle','rb') as inp:
  inspector_tree = pickle.load(inp)

In [None]:
# Inputs: the sentences as a plain Python list.
X = list(zero_level["Sentence"])
# Targets: the hand-assigned level rendered as the label string "adpq_<level>".
y = zero_level["Level (Hand)"].map(lambda level: f"adpq_{level}")

In [None]:
# Embed every sentence with both embedders; "average" takes the mean of the
# last hidden state over the token axis (see BERTEmbedder.process).
X_bert = bert_embedder.process_sample(X, flatten_method="average")
X_gpt2 = gpt2_embedder.process_sample(X, flatten_method="average")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/29 [00:00<?, ?it/s]

  0%|          | 0/29 [00:00<?, ?it/s]

In [None]:
# Predict a label for every sentence with the unpickled models, each fed the
# embeddings of its matching embedder.
y_pred_bert = bernoulli_bert.predict(X_bert)
y_pred_gpt2 = bernoulli_gpt2.predict(X_gpt2)

In [None]:
# Per-class precision / recall / F1 of the BERT-based predictions against the
# hand-assigned labels.
print(classification_report(y, y_pred_bert))

              precision    recall  f1-score   support

      adpq_1       0.00      0.00      0.00         1
      adpq_2       0.20      0.45      0.28        22
      adpq_3       0.80      0.48      0.60        90

    accuracy                           0.47       113
   macro avg       0.33      0.31      0.29       113
weighted avg       0.67      0.47      0.53       113



In [None]:
# Same report for the GPT-2-based predictions.
print(classification_report(y, y_pred_gpt2))

              precision    recall  f1-score   support

      adpq_1       0.00      0.00      0.00         1
      adpq_2       0.16      0.27      0.20        22
      adpq_3       0.71      0.41      0.52        90

    accuracy                           0.38       113
   macro avg       0.29      0.23      0.24       113
weighted avg       0.60      0.38      0.45       113



In [None]:
# Absolute path so the cell can be re-run: a relative chdir fails the second
# time because the working directory has already moved. /content is the
# directory the earlier relative Drive paths and the git clone resolved against.
os.chdir("/content/lab_inspector/Inspector")

In [None]:
# Resolved relative to the working directory set by the previous cell --
# presumably lab_inspector/Inspector/main.py; TODO confirm.
# NOTE(review): the alias `inspect` shadows the name of the stdlib `inspect` module.
from main import main as inspect

In [None]:
# Run the Inspector on every sentence and collect the results into one frame
# aligned with zero_level's index. Built in a single expression so that
# X_inspector is never a plain list first, and with tqdm.notebook.tqdm instead
# of the deprecated tqdm_notebook.
X_inspector = pd.DataFrame(
  [inspect(question) for question in tqdm(X, total=len(X))],
  index=zero_level.index,
)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


  0%|          | 0/113 [00:00<?, ?it/s]

In [None]:
# Preview the first rows of the Inspector output.
X_inspector.head()

Unnamed: 0,av_depth,max_depth,min_depth,num_acl,num_rel_cl,num_advcl,num_sent,num_tok,av_tok_before_root,av_len_sent,num_cl,num_tu,num_compl_tu,num_coord,num_poss,num_prep,num_adj_noun,num_part_noun,num_noun_inf,pos_sim_nei,lemma_sim_nei,pos_sim_all,lemma_sim_all,density,ls,vs,corrected_vs,squared_vs,lfp_1000,lfp_2000,lfp_uwl,lfp_rest,ndw,ttr,corrected_ttr,root_ttr,log_ttr,uber_ttr,d,lv,vvi,squared_vv,corrected_vv,vvii,nv,adjv,advv,modv,der_level3,der_level4,der_level5,der_level6,mci,freq_finite_forms,freq_aux,num_inf,num_gerunds,num_pres_sing,num_pres_plur,num_past_part,num_past_simple,num_linkings,num_4grams,num_func_ngrams,num_shell_noun,num_misspelled_tokens,punct_mistakes_pp,punct_mistakes_because,punct_mistakes_but,punct_mistakes_compare,million_mistake,side_mistake
0,8.0,8,8,0,0,0,1,30,28.0,30.0,1,1,0,2,2,0,1,0,0,0.0,0.0,0.0,0.0,0.533333,0.5,0.0,0.0,0.0,0.566667,0.1,0.0,0.333333,25,0.833333,3.227486,4.564355,0.946395,0.0,0.0,0.833333,0.0,0.0,0.0,0.0,0.266667,0.0625,0.125,0.1875,0.0,0.25,0.0,0.0,-1.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0
1,8.0,8,8,3,1,0,1,35,4.0,35.0,4,3,1,0,1,2,2,1,1,0.0,0.0,0.0,0.0,0.457143,0.1875,0.0,0.0,0.0,0.657143,0.085714,0.028571,0.228571,33,0.942857,3.944254,5.578018,0.98345,0.0,0.0,0.942857,1.0,4.0,1.414214,0.25,0.285714,0.125,0.0,0.125,0.0,0.25,0.0,0.125,1.5,0.5,0.5,1,0,1,0,1,0,1,0,0,0,3,0,0,0,0,0,0
2,5.0,5,5,0,0,0,1,22,14.0,22.0,2,2,0,0,4,0,2,0,0,0.0,0.0,0.0,0.0,0.545455,0.333333,0.0,0.0,0.0,0.590909,0.136364,0.045455,0.227273,21,0.954545,3.165869,4.477215,0.98495,199.250081,0.0,0.954545,1.0,1.0,0.707107,0.083333,0.363636,0.166667,0.0,0.166667,0.0,0.333333,0.0,0.333333,-1.0,1.0,1.0,0,0,0,0,0,2,0,0,0,0,3,0,0,0,0,0,0
3,5.0,5,5,0,0,0,1,26,18.0,26.0,3,3,0,2,1,1,2,0,0,0.0,0.0,0.0,0.0,0.423077,0.272727,0.0,0.0,0.0,0.730769,0.038462,0.0,0.230769,23,0.884615,3.189526,4.510671,0.96237,231.001246,0.0,0.884615,1.0,2.0,1.0,0.181818,0.230769,0.181818,0.090909,0.272727,0.0,0.0,0.0,0.0,0.0,0.5,1.0,1,0,1,0,0,0,1,12,3,0,3,0,0,0,0,0,0
4,4.0,4,4,0,0,0,1,11,6.0,11.0,1,1,0,1,0,1,1,0,0,0.0,0.0,0.0,0.0,0.545455,0.5,0.0,0.0,0.0,0.545455,0.090909,0.090909,0.272727,11,1.0,2.345208,3.316625,1.0,0.0,0.0,1.0,1.0,1.0,0.707107,0.166667,0.363636,0.166667,0.0,0.166667,0.0,0.5,0.0,0.0,-1.0,0.0,1.0,1,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0


In [None]:
# Predict labels from the Inspector features with the unpickled model.
y_pred_inspector = inspector_tree.predict(X_inspector)

In [None]:
# Per-class report for the Inspector-feature predictions.
print(classification_report(y, y_pred_inspector))

              precision    recall  f1-score   support

      adpq_1       0.00      0.00      0.00         1
      adpq_2       0.20      0.45      0.28        22
      adpq_3       0.81      0.43      0.57        90

    accuracy                           0.43       113
   macro avg       0.34      0.30      0.28       113
weighted avg       0.69      0.43      0.51       113



In [None]:
# Rewrite the hand-assigned level as "adpq_<level>" in place. Values that
# already carry the prefix are left alone, so re-running this cell no longer
# produces "adpq_adpq_<level>".
zero_level["Level (Hand)"] = zero_level["Level (Hand)"].apply(
  lambda x: x if str(x).startswith("adpq_") else f"adpq_{x}"
)

In [None]:
# Store the BERT-based predictions next to the hand-assigned labels.
zero_level["Level (ML)"] = y_pred_bert

In [None]:
# Rows where the predicted label equals the hand-assigned one.
zero_level[zero_level["Level (Hand)"] == zero_level["Level (ML)"]]

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,id,Sentence (original),Sentence (changed),Right answer,Error type,Error type+,Level (Hand),path_in_REALEC,Sentence,Error span,LevelAuto,Level (ML)
0,4,6,50904,If in North Africa and South Asia there <b>is...,If in North Africa and South Asia there <b>is...,were,Agreement_errors //Tense_choice,Agreement_errors,adpq_3,https://realec.org/index.xhtml#/exam/Exam2017/...,If in North Africa and South Asia there <b>is...,is,0,adpq_3
1,16,19,39505,"Overall, the chart gives information that aro...","Overall, the chart gives information that aro...",a bigger proportion,Comparison_degree,Comparison_degree,adpq_3,https://realec.org/index.xhtml#/exam/Exam2016/...,"Overall, the chart gives information that aro...",the biggest proportion,0,adpq_3
3,20,23,69626,"But on the other hand, there are a lot of <b>...",,people who,Relative_clause,Relative_clause,adpq_3,https://realec.org/index.xhtml#/exam/Exam2017/...,"But on the other hand, there are a lot of <b>...","people, who",0,adpq_3
10,51,61,60856,<b>Unemployment rate</b> in Africa was almost...,The unemployment rate in Africa was almost </...,4 times as high as,Comparative_constr,Articles,adpq_3,https://realec.org/index.xhtml#/exam/Exam2017/...,The unemployment rate in Africa was almost <b...,4 times higher than,0,adpq_3
11,52,62,55642,"Someone could treat me as an idealist, but I ...","Someone could treat me as an idealist, but I ...",volunteering,Spelling //Category_confusion,Spelling,adpq_3,https://realec.org/index.xhtml#/2012-2014/esl_...,"Someone could treat me as an idealist, but I ...",volontier,0,adpq_3
14,57,66,24347,There is no shame if men are good at law and ...,,at,Lack_par_constr,Lack_par_constr,adpq_3,https://realec.org/index.xhtml#/exam/Exam2014/...,There is no shame if men are good at law and ...,in,0,adpq_3
15,66,76,74718,"For example, an athlete can use drugs, which ...","For example, an athlete can use drugs which <...",are not,lex_item_choice OR Redundant_comp_sent,Voice,adpq_3,https://realec.org/index.xhtml#/exam/Exam2017/...,"For example, an athlete can use drugs which <...",are get not,0,adpq_3
18,79,90,16427,"Secondly, mowement of a business in developin...","Secondly, <b>mowement of a business<b> to dev...",moving a business,Spelling //Category_confusion,Tense_choice,adpq_3,https://realec.org/index.xhtml#/exam/Exam2016/...,"Secondly, <b>mowement of a business</b> to de...",mowement of a business,0,adpq_3
23,136,154,10120,"To summarize, the forecast for these two coun...","To summarize, the forecast for these two coun...",the population of Yemen,Structure_confusion,Articles,adpq_3,https://realec.org/index.xhtml#/exam/Exam2014/...,"To summarize, the forecast for these two coun...",Yemen population,0,adpq_3
25,147,166,4906,"However, personally, I suppose that it is bet...","However, personally, I suppose that it is bet...",as early as,Comparative_constr,Adjectives,adpq_2,https://realec.org/index.xhtml#/exam/Exam2014/...,"However, personally, I suppose that it is bet...",as earlier as,0,adpq_2


In [None]:
# Number of rows whose predicted label equals the hand-assigned one.
int((zero_level["Level (Hand)"] == zero_level["Level (ML)"]).sum())

53

In [None]:
# NOTE(review): 53 is the match count shown by the previous cell. 388 and 581
# are not computed anywhere in this notebook -- presumably the correct and
# total counts once another data set is included. TODO confirm and derive
# them in code instead of hard-coding.
(388+53)/581

0.7590361445783133

In [None]:
import sklearn

In [None]:
# Absolute path: by this point an earlier cell has changed the working
# directory to lab_inspector/Inspector, so the relative 'drive/...' path no
# longer resolves.
# NOTE(review): this cell uses "MyDrive" while the earlier cells use
# "My Drive" -- confirm both names exist on the mount and pick one.
# SECURITY: pickle.load can execute arbitrary code; load trusted files only.
with open('/content/drive/MyDrive/QuestionLevelPrediction/TreeBERT.pickle','rb') as inp:
  bert_tree = pickle.load(inp)

In [None]:
# Predict labels from the BERT embeddings with the second unpickled model.
y_pred_bert_tree = bert_tree.predict(X_bert)

In [None]:
# Per-class report for the predictions of bert_tree.
print(classification_report(y, y_pred_bert_tree))

              precision    recall  f1-score   support

      adpq_1       0.00      0.00      0.00         1
      adpq_2       0.22      0.41      0.29        22
      adpq_3       0.82      0.56      0.66        90

    accuracy                           0.52       113
   macro avg       0.35      0.32      0.32       113
weighted avg       0.70      0.52      0.58       113



In [None]:
# Overwrites the "Level (ML)" column set earlier with the bert_tree predictions;
# tables displayed above this cell still show the previous values.
zero_level["Level (ML)"] = y_pred_bert_tree

In [None]:
# Number of rows whose predicted label equals the hand-assigned one.
int((zero_level["Level (Hand)"] == zero_level["Level (ML)"]).sum())

59

In [None]:
# NOTE(review): 59 is the match count shown by the previous cell. 388 and 581
# are not computed anywhere in this notebook -- presumably the correct and
# total counts once another data set is included. TODO confirm and derive
# them in code instead of hard-coding.
(388+59)/581

0.7693631669535284