In [2]:
#!apt install -qq enchant
#!pip install pyenchant
#!pip install pyspellchecker
#!pip install transformers
#!pip install conllu
#!pip install ufal.udpipe

import html
import os
import pickle
import re
from math import ceil

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import torch
from gensim.models import KeyedVectors
from nltk import word_tokenize
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.manifold import TSNE
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tqdm import tqdm_notebook
from tqdm.notebook import tqdm
from transformers import BertModel, BertTokenizer, GPT2Model, GPT2Tokenizer

#nltk.download('punkt')

#from google.colab import drive

In [None]:
!pip uninstall scikit-learn
!pip install scikit-learn==0.21.2

In [None]:
# Fetch the lab_inspector repository into ./lab_inspector; a later cell changes
# into lab_inspector/Inspector and imports its `main` module.
!git clone https://github.com/lcl-hse/lab_inspector.git

In [None]:
# Show the top-level entries of the cloned repository.
os.listdir('lab_inspector')

In [3]:
class MyBatchIterator:
  """Iterate over ``texts`` in consecutive slices of at most ``batch_size`` items.

  The object is its own iterator: ``iter()`` rewinds the cursor to the first
  batch and returns the same instance, so two loops over one instance share a
  single cursor. ``len()`` is the number of batches; the last batch may be
  shorter than ``batch_size``.
  """

  def __init__(self, texts, batch_size):
    """Remember the sequence to slice and the batch size.

    Args:
      texts: sequence supporting ``len()`` and slicing.
      batch_size: maximum number of items per batch; must be positive.

    Raises:
      ValueError: if ``batch_size`` is zero or negative (that would make
        ``__next__`` loop forever or ``__len__`` divide by zero).
    """
    if batch_size <= 0:
      raise ValueError(f"batch_size must be positive, got {batch_size!r}")
    self.texts = texts
    self.batch_size = batch_size
    # Initialise the cursor here so next() also works before iter() is called.
    self.start = 0

  def __iter__(self):
    """Rewind to the first batch and return this same object."""
    self.start = 0
    return self

  def __next__(self):
    """Return the next slice of ``texts``; raise StopIteration when exhausted."""
    if self.start >= len(self.texts):
      raise StopIteration
    batch = self.texts[self.start:self.start+self.batch_size]
    self.start += self.batch_size
    return batch

  def __len__(self):
    """Number of batches a full pass yields (0 for empty ``texts``)."""
    return ceil(len(self.texts)/self.batch_size)

class BERTEmbedder:
  """Turn texts into fixed-size vectors with a pretrained BERT model.

  ``process`` embeds a single batch; ``process_sample`` splits a longer list
  into batches, embeds each and stacks the results.
  """

  # Values accepted for `flatten_method`.
  FLATTEN_METHODS = ('average', 'masked_average', 'pooler')

  def __init__(self, model_name):
    """Load tokenizer and model for ``model_name`` through ``from_pretrained``."""
    self.tokenizer = BertTokenizer.from_pretrained(model_name)
    self.model = BertModel.from_pretrained(model_name)

  def process(self, texts, flatten_method='pooler'):
    """Embed one batch of texts and return a 2-D numpy array (one row per text).

    Args:
      texts: list of strings; tokenized with padding to the longest text in
        the batch and truncation to 128 tokens.
      flatten_method: how the per-token output is reduced to one vector:
        'pooler'         - the model's ``pooler_output``;
        'average'        - plain mean of ``last_hidden_state`` over every
                           token position, padding included (original
                           behaviour, kept unchanged);
        'masked_average' - mean over the positions where the attention mask
                           is 1, so the result does not depend on how much
                           padding the batch introduced.

    Raises:
      ValueError: if ``flatten_method`` is not one of ``FLATTEN_METHODS``
        (previously the method silently returned None).
    """
    if flatten_method not in self.FLATTEN_METHODS:
      raise ValueError(
        f"Unknown flatten_method {flatten_method!r}; expected one of {self.FLATTEN_METHODS}"
      )

    tokenized = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=128)
    ids = tokenized['input_ids']
    mask = tokenized['attention_mask']
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
      processed = self.model(input_ids=ids, attention_mask=mask)

    if flatten_method == 'pooler':
      return processed['pooler_output'].detach().numpy()

    hidden = processed['last_hidden_state'].detach().numpy()
    if flatten_method == 'average':
      # NOTE(review): padded positions take part in this mean. Classifiers
      # fitted on these vectors presumably expect exactly this — confirm
      # before switching callers to 'masked_average'.
      return hidden.mean(axis=1)

    # 'masked_average': weight each position by its attention-mask value.
    weights = mask.detach().numpy()[:, :, None].astype(hidden.dtype)
    counts = np.maximum(weights.sum(axis=1), 1)  # guard against an all-padding row
    return (hidden * weights).sum(axis=1) / counts

  def process_sample(self, texts, batch_size=4, flatten_method='pooler'):
    """Embed ``texts`` batch by batch and return all rows stacked in input order.

    Args:
      texts: list of strings to embed.
      batch_size: number of texts passed to ``process`` at a time.
      flatten_method: forwarded to ``process``.

    Raises:
      ValueError: if ``texts`` is empty (there is nothing to concatenate) or
        ``flatten_method`` is unknown.
    """
    text_iter = MyBatchIterator(texts, batch_size=batch_size)
    if len(text_iter) == 0:
      raise ValueError("texts is empty; nothing to embed")

    batches = []
    # tqdm.notebook.tqdm replaces the deprecated tqdm.tqdm_notebook alias.
    for batch in tqdm(text_iter, total=len(text_iter)):
      batches.append(self.process(batch, flatten_method=flatten_method))

    return np.concatenate(batches, axis=0)

class GPT2Embedder(BERTEmbedder):
  """GPT-2 counterpart of BERTEmbedder; batching comes from the inherited ``process_sample``."""

  def __init__(self, model_name):
    """Load the GPT-2 tokenizer and model for ``model_name`` through ``from_pretrained``."""
    self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    # Reuse the end-of-sequence token as padding token so padding=True can be used below.
    self.tokenizer.pad_token = self.tokenizer.eos_token
    self.model = GPT2Model.from_pretrained(model_name)

  def process(self, texts, flatten_method='pooler'):
    """Embed one batch of texts and return a 2-D numpy array (one row per text).

    Args:
      texts: list of strings; tokenized with padding to the longest text in
        the batch and truncation to 128 tokens.
      flatten_method: how the per-token output is reduced to one vector:
        'pooler'         - hidden state of the last non-padding token of each
                           text. GPT2Model returns no ``pooler_output`` (the
                           old lookup raised KeyError), so the last real
                           token stands in for it.
        'average'        - plain mean of ``last_hidden_state`` over every
                           token position, padding included (original
                           behaviour, kept unchanged);
        'masked_average' - mean over the positions where the attention mask
                           is 1.

    Raises:
      ValueError: if ``flatten_method`` is none of the values above
        (previously the method silently returned None).
    """
    allowed = ('average', 'masked_average', 'pooler')
    if flatten_method not in allowed:
      raise ValueError(
        f"Unknown flatten_method {flatten_method!r}; expected one of {allowed}"
      )

    tokenized = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=128)
    ids = tokenized['input_ids']
    mask = tokenized['attention_mask']
    # Inference only: skip building the autograd graph.
    with torch.no_grad():
      processed = self.model(input_ids=ids, attention_mask=mask)

    hidden = processed['last_hidden_state'].detach().numpy()
    if flatten_method == 'average':
      return hidden.mean(axis=1)

    mask_np = mask.detach().numpy()
    if flatten_method == 'masked_average':
      weights = mask_np[:, :, None].astype(hidden.dtype)
      counts = np.maximum(weights.sum(axis=1), 1)  # guard against an all-padding row
      return (hidden * weights).sum(axis=1) / counts

    # 'pooler': index of the last position whose mask is 1, found by scanning
    # the mask from the right so it works for left- and right-padded batches.
    seq_len = mask_np.shape[1]
    last_real = seq_len - 1 - np.argmax(mask_np[:, ::-1], axis=1)
    return hidden[np.arange(hidden.shape[0]), last_real]

In [None]:
# Mount Google Drive (Colab only). The import lives in this cell because the
# module-level `from google.colab import drive` is commented out, which left
# `drive` undefined here, and google.colab cannot be imported outside Colab.
from google.colab import drive

drive.mount("/content/drive")

In [4]:
# Load the evaluation table; later cells read its "Sentence" and "Level (Hand)" columns.
zero_level = pd.read_csv("ZeroLevel.csv")

In [None]:
# Display the loaded table.
zero_level

In [10]:
# Load the pickled embedder (presumably a BERTEmbedder instance — its
# process_sample, model and tokenizer attributes are used further down).
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# pickles that come from a trusted source.
with open('BertEmbedder.pickle','rb') as inp:
  bert_embedder = pickle.load(inp)

# The loads below are disabled, so gpt2_embedder, bernoulli_bert and
# bernoulli_gpt2 are NOT defined by this cell.
# with open('GPT2Embedder.pickle','rb') as inp:
#   gpt2_embedder = pickle.load(inp)

# with open('BernoulliBERT.pickle','rb') as inp:
#   bernoulli_bert = pickle.load(inp)

# with open('BernoulliGPT2.pickle','rb') as inp:
#   bernoulli_gpt2 = pickle.load(inp)

FileNotFoundError: [Errno 2] No such file or directory: 'BernoulliBERT.pickle'

In [None]:
# Load the pickled classifier that is later applied to the lab_inspector features.
# NOTE(review): the path is relative although Drive is mounted at /content/drive,
# so it presumably only resolves while the working directory is /content — confirm.
# pickle.load can execute arbitrary code; only load trusted files.
with open('drive/My Drive/QuestionLevelPrediction/InspectorTree.pickle','rb') as inp:
  inspector_tree = pickle.load(inp)

In [7]:
# Inputs: the raw sentences as a plain list.
X = zero_level["Sentence"].tolist()
# Targets: hand-assigned levels rendered as "adpq_<level>" strings.
y = zero_level["Level (Hand)"].map("adpq_{}".format)

In [16]:
# Number of sentences to classify.
len(X)

113

In [8]:
# Embed every sentence with the loaded embedder; "average" takes the mean of the
# last hidden state over all token positions (default batch size of process_sample).
X_bert = bert_embedder.process_sample(X, flatten_method="average")
# GPT-2 embeddings are disabled, so X_gpt2 is not defined.
#X_gpt2 = gpt2_embedder.process_sample(X, flatten_method="average")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=29.0), HTML(value='')))




In [9]:
# Predict levels from the BERT embeddings with the Bernoulli classifier.
# NOTE(review): bernoulli_bert is never defined in this notebook (its pickle
# load is commented out), so this cell raises NameError on a fresh run and
# y_pred_bert stays undefined for the cells that use it.
y_pred_bert = bernoulli_bert.predict(X_bert)
#y_pred_gpt2 = bernoulli_gpt2.predict(X_gpt2)

NameError: name 'bernoulli_bert' is not defined

In [None]:
# Per-class precision/recall/F1 of the Bernoulli-on-BERT predictions against the hand labels.
# NOTE(review): needs y_pred_bert, whose producing cell currently fails.
print(classification_report(y, y_pred_bert))

In [None]:
# Per-class precision/recall/F1 of the GPT-2 based predictions against the hand labels.
# NOTE(review): y_pred_gpt2 is never assigned — its only assignment is commented
# out — so this cell raises NameError.
print(classification_report(y, y_pred_gpt2))

In [None]:
# Switch the working directory to the cloned Inspector package so that `main`
# can be imported by the next cell.
# NOTE(review): this is not idempotent (a second run fails because the relative
# path no longer exists) and every later relative path — e.g. 'TreeBERT.pickle',
# 'BertModel.pickle' — is resolved inside lab_inspector/Inspector afterwards.
os.chdir("lab_inspector/Inspector")

In [None]:
# Import the Inspector entry point from the current working directory.
# Note: the alias `inspect` is also the name of a standard-library module.
from main import main as inspect

In [None]:
# Run the Inspector on every question (with a progress bar) and collect the
# per-question results as rows of a frame aligned with zero_level's index.
X_inspector = pd.DataFrame(
  [inspect(question) for question in tqdm_notebook(X, total=len(X))],
  index=zero_level.index,
)

In [None]:
# Peek at the first rows of the Inspector feature table.
X_inspector.head()

In [None]:
# Predict levels from the Inspector features with the unpickled inspector_tree model.
y_pred_inspector = inspector_tree.predict(X_inspector)

In [None]:
# Per-class precision/recall/F1 of the Inspector-based predictions against the hand labels.
print(classification_report(y, y_pred_inspector))

In [None]:
# Rewrite the hand-assigned levels as "adpq_<level>" strings so they can be
# compared with the predicted labels stored in "Level (ML)".
# The startswith guard keeps the cell idempotent: re-running it no longer
# produces "adpq_adpq_..." values.
zero_level["Level (Hand)"] = zero_level["Level (Hand)"].apply(
  lambda x: x if str(x).startswith("adpq_") else f"adpq_{x}"
)

In [None]:
# Store the Bernoulli-on-BERT predictions next to the hand labels.
# NOTE(review): needs y_pred_bert, whose producing cell currently fails.
zero_level["Level (ML)"] = y_pred_bert

In [None]:
# Rows where the predicted level equals the hand-assigned level.
zero_level[zero_level["Level (Hand)"] == zero_level["Level (ML)"]]

In [None]:
# Number of rows where prediction and hand label agree.
len(zero_level[zero_level["Level (Hand)"] == zero_level["Level (ML)"]])

In [None]:
# NOTE(review): hard-coded figures whose origin is not shown in this notebook —
# presumably (correct elsewhere + correct here) / total items; confirm and
# replace with computed values.
(388+53)/581

In [None]:
import sklearn

In [11]:
# Load the pickled classifier that is applied to the BERT embeddings below.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('TreeBERT.pickle','rb') as inp:
  bert_tree = pickle.load(inp)



In [12]:
# Predict levels from the averaged BERT embeddings with the unpickled bert_tree model.
y_pred_bert_tree = bert_tree.predict(X_bert)

In [13]:
# Per-class precision/recall/F1 of the bert_tree predictions against the hand labels.
print(classification_report(y, y_pred_bert_tree))

              precision    recall  f1-score   support

      adpq_1       0.00      0.00      0.00         1
      adpq_2       0.22      0.41      0.29        22
      adpq_3       0.82      0.56      0.66        90

    accuracy                           0.52       113
   macro avg       0.35      0.32      0.32       113
weighted avg       0.70      0.52      0.58       113



In [15]:
from sklearn.metrics import accuracy_score

In [18]:
# normalize=False makes accuracy_score return the number of correct predictions
# instead of the fraction.
print(accuracy_score(y, y_pred_bert_tree, normalize=False))

59


In [None]:
# Store the bert_tree predictions next to the hand labels (overwrites any earlier "Level (ML)").
zero_level["Level (ML)"] = y_pred_bert_tree

In [None]:
# Number of rows where prediction and hand label agree.
# NOTE(review): only meaningful once "Level (Hand)" carries the same "adpq_"
# prefix as the predictions.
len(zero_level[zero_level["Level (Hand)"] == zero_level["Level (ML)"]])

In [None]:
# 59 is the count of correct bert_tree predictions printed by accuracy_score above.
# NOTE(review): 388 and 581 are hard-coded and their origin is not shown in this
# notebook — presumably results for other items; confirm.
(388+59)/581

In [23]:
# Persist the embedder's model and tokenizer as two separate pickle files
# (written relative to the current working directory).
# NOTE(review): consider the objects' save_pretrained() methods instead of
# pickle — presumably more robust across library versions; confirm.
with open("BertModel.pickle", 'wb') as outp1:
    pickle.dump(bert_embedder.model, outp1)

with open("BertTokenizer.pickle", 'wb') as outp2:
    pickle.dump(bert_embedder.tokenizer, outp2)

In [22]:
# Interactive lookup of pickle.dump's signature; has no effect on the analysis.
help(pickle.dump)

Help on built-in function dump in module _pickle:

dump(obj, file, protocol=None, *, fix_imports=True)
    Write a pickled representation of obj to the open file object file.
    
    This is equivalent to ``Pickler(file, protocol).dump(obj)``, but may
    be more efficient.
    
    The optional *protocol* argument tells the pickler to use the given
    protocol supported protocols are 0, 1, 2, 3 and 4.  The default
    protocol is 3; a backward-incompatible protocol designed for Python 3.
    
    Specifying a negative protocol version selects the highest protocol
    version supported.  The higher the protocol used, the more recent the
    version of Python needed to read the pickle produced.
    
    The *file* argument must have a write() method that accepts a single
    bytes argument.  It can thus be a file object opened for binary
    writing, an io.BytesIO instance, or any other custom object that meets
    this interface.
    
    If *fix_imports* is True and protocol is less