In [None]:
!apt install -qq enchant
!pip install pyenchant
!pip install pyspellchecker
!pip install transformers
!pip install conllu
!pip install ufal.udpipe

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from gensim.models import KeyedVectors
from transformers import BertModel, BertTokenizer, GPT2Model, GPT2Tokenizer
from tqdm import tqdm_notebook
from math import ceil

import re
import os
import html
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import nltk
nltk.download('punkt')
from nltk import word_tokenize

enchant is already the newest version (1.6.0-11.1).
0 upgraded, 0 newly installed, 0 to remove and 40 not upgraded.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
import pickle

In [None]:
from google.colab import files

In [None]:
!pip uninstall scikit-learn
!pip install scikit-learn==0.21.2

Found existing installation: scikit-learn 0.22.2.post1
Uninstalling scikit-learn-0.22.2.post1:
  Would remove:
    /usr/local/lib/python3.7/dist-packages/scikit_learn-0.22.2.post1.dist-info/*
    /usr/local/lib/python3.7/dist-packages/sklearn/*
Proceed (y/n)? y
  Successfully uninstalled scikit-learn-0.22.2.post1
Collecting scikit-learn==0.21.2
  Downloading scikit_learn-0.21.2-cp37-cp37m-manylinux1_x86_64.whl (6.7 MB)
[K     |████████████████████████████████| 6.7 MB 15.7 MB/s 
Installing collected packages: scikit-learn
Successfully installed scikit-learn-0.21.2


Установим Инспектор:

In [None]:
!git clone https://github.com/lcl-hse/lab_inspector.git

Cloning into 'lab_inspector'...
remote: Enumerating objects: 21918, done.[K
remote: Counting objects: 100% (34/34), done.[K
remote: Compressing objects: 100% (31/31), done.[K
remote: Total 21918 (delta 5), reused 27 (delta 2), pack-reused 21884[K
Receiving objects: 100% (21918/21918), 93.60 MiB | 23.88 MiB/s, done.
Resolving deltas: 100% (5494/5494), done.
Checking out files: 100% (26120/26120), done.


In [None]:
os.listdir('lab_inspector')

['R', 'Inspector', 'README.md', '.git']

Установим модель Word2Vec:

In [None]:
!wget http://vectors.nlpl.eu/repository/20/40.zip

--2021-09-06 10:46:38--  http://vectors.nlpl.eu/repository/20/40.zip
Resolving vectors.nlpl.eu (vectors.nlpl.eu)... 129.240.189.181
Connecting to vectors.nlpl.eu (vectors.nlpl.eu)|129.240.189.181|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3033545506 (2.8G) [application/zip]
Saving to: ‘40.zip’


2021-09-06 10:48:25 (27.1 MB/s) - ‘40.zip’ saved [3033545506/3033545506]



In [None]:
!unzip 40.zip

Archive:  40.zip
  inflating: LIST                    
  inflating: meta.json               
  inflating: model.bin               
  inflating: model.txt               
  inflating: README                  


## Load data

In [None]:
from google.colab import drive
drive.mount('/content/drive')
questions = pd.read_excel('drive/My Drive/QuestionLevelPrediction/questions_final.xlsx', index_col='Unnamed: 0')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
def clean_question_text(qtext):
  quest_text = re.sub('<.*?>','',qtext)
  quest_text = html.unescape(quest_text)
  return quest_text

In [None]:
questions

Unnamed: 0,question,category,Unnamed: 3,collection
0,The given diagram <b>demonstrate</b> the propo...,1,,Placement test 2017
1,After a stable period we can see a huge increa...,adpq_2,2,Placement test 2017
2,"Thus, the percentage of people <b>65 and plus ...",adpq_3,3,Placement test 2017
3,And it comes after many years of their efforts...,adpq_2,15,Placement test 2017
4,It is clear that Kyoto and Los Angeles undergr...,adpq_2,8,Placement test 2017
...,...,...,...,...
732,But their richest part of the population <b>us...,adpq_1,,Current version of test
733,From 1960s it gradually decreased to 5 per cen...,adpq_3,Comparative_constr,Current version of test
734,And it is not pointless making children who la...,pointless doing? or making to do?,,Current version of test
735,Even if <b>the child</b> doesn&#8217;t have tr...,adpq_1,Art_choice,Current version of test


In [None]:
questions = questions.dropna(axis=0, subset=['category']).dropna(axis=0, subset=['question'])
questions['question'] = questions['question'].apply(clean_question_text)

In [None]:
len(questions)

703

In [None]:
questions['category'].unique()

array([1, 'adpq_2', 'adpq_3', 'adpq_2 or 3', 'adpq_1', 'adpq_3 ',
       'adpq_2 ', 'adpq_?', '??',
       "adpq_2 (если считать, что 'produce' - noun, то часть речи не меняется)",
       'область ошибки-?', 'correct?', 'adpq_2?', 'to be removed',
       'to be removed?', 'adpq_', 'adpq_1 or to be removed',
       'layer? adpq_2', 'current the? adpq_1', 'adpq_1 (the same)',
       'the numbers? adpq_2', 'pointless doing? or making to do?'],
      dtype=object)

In [None]:
questions = questions.loc[questions['category'].apply(lambda x: x in (1,'adpq_1','adpq_2', 'adpq_3'))]

In [None]:
questions['category'] = questions['category'].apply(lambda x: f"adpq_{x}" if type(x)==int else x)

In [None]:
len(questions)

640

## Baseline: TF-IDF

In [None]:
X = questions['question']
y = questions['category']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
clfs = [
        LogisticRegression, RidgeClassifier,
        RandomForestClassifier, SVC,
        DecisionTreeClassifier, BernoulliNB
]

In [None]:
vec = TfidfVectorizer().fit(X_train)

In [None]:
X_train_ = vec.transform(X_train)
X_test_ = vec.transform(X_test)

In [None]:
len(X_train)

480

In [None]:
reducer = TruncatedSVD(n_components=450, random_state=42).fit(X_train_)

In [None]:
reducer.explained_variance_ratio_.sum()

0.9950345494100687

In [None]:
X_train_, X_test_ = reducer.transform(X_train_), reducer.transform(X_test_)

In [None]:
clfs

[sklearn.linear_model.logistic.LogisticRegression,
 sklearn.linear_model.ridge.RidgeClassifier,
 sklearn.ensemble.forest.RandomForestClassifier,
 sklearn.svm.classes.SVC,
 sklearn.tree.tree.DecisionTreeClassifier,
 sklearn.naive_bayes.BernoulliNB]

In [None]:
result = []

for clf in clfs:
  try:
    scorer = clf(random_state=42).fit(X_train_, y_train)
  except:
    scorer = clf().fit(X_train_, y_train)
  predicted = scorer.predict(X_test_)
  report = classification_report(y_test, predicted, output_dict=True)
  entry = ({
      'algorithm': clf.__name__,
  })

  for key, val in report.items():
    if type(val) == dict:
      for k, v in val.items():
        entry[f"{key}_{k}"] = v
    else:
      entry[key] = val
  
  result.append(entry)
result = pd.DataFrame(result)

  'precision', 'predicted', average, warn_for)


In [None]:
result

Unnamed: 0,algorithm,adpq_1_precision,adpq_1_recall,adpq_1_f1-score,adpq_1_support,adpq_2_precision,adpq_2_recall,adpq_2_f1-score,adpq_2_support,adpq_3_precision,adpq_3_recall,adpq_3_f1-score,adpq_3_support,accuracy,macro avg_precision,macro avg_recall,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support
0,LogisticRegression,0.0,0.0,0.0,15,0.46281,0.746667,0.571429,75,0.447368,0.242857,0.314815,70,0.45625,0.303393,0.329841,0.295414,160,0.412666,0.45625,0.405589,160
1,RidgeClassifier,0.25,0.066667,0.105263,15,0.479167,0.613333,0.538012,75,0.483333,0.414286,0.446154,70,0.475,0.404167,0.364762,0.363143,160,0.459505,0.475,0.457254,160
2,RandomForestClassifier,1.0,0.066667,0.125,15,0.479675,0.786667,0.59596,75,0.5,0.257143,0.339623,70,0.4875,0.659892,0.370159,0.353527,160,0.537348,0.4875,0.43966,160
3,SVC,0.0,0.0,0.0,15,0.46875,1.0,0.638298,75,0.0,0.0,0.0,70,0.46875,0.15625,0.333333,0.212766,160,0.219727,0.46875,0.299202,160
4,DecisionTreeClassifier,0.111111,0.066667,0.083333,15,0.424242,0.56,0.482759,75,0.365385,0.271429,0.311475,70,0.3875,0.300246,0.299365,0.292522,160,0.369136,0.3875,0.370376,160
5,BernoulliNB,0.222222,0.133333,0.166667,15,0.506173,0.546667,0.525641,75,0.457143,0.457143,0.457143,70,0.46875,0.395179,0.379048,0.38315,160,0.458102,0.46875,0.462019,160


In [None]:
y_train.value_counts()/len(y_train)

adpq_2    0.495833
adpq_3    0.395833
adpq_1    0.108333
Name: category, dtype: float64

## Word2Vec

In [None]:
class W2VEmbedder:
  def __init__(self, model_path):
    self.model = KeyedVectors.load_word2vec_format(model_path, binary=True)
  
  def process(self, text):
    return np.array([self.model[word.lower()] for word in word_tokenize(text, language="english") if word in self.model.vocab]).mean(axis=0)
  
  def process_texts(self, texts):
    return np.array([self.process(text) for text in texts])

In [None]:
embedder = W2VEmbedder('model.bin')

In [None]:
%%time
X = embedder.process_texts(questions['question'])

CPU times: user 181 ms, sys: 1.87 ms, total: 183 ms
Wall time: 203 ms


In [None]:
X.shape

(640, 100)

In [None]:
y = questions["category"]

In [None]:
%%time
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clfs = [
        LogisticRegression, RidgeClassifier,
        RandomForestClassifier, SVC,
        DecisionTreeClassifier, BernoulliNB
]

result = []

for clf in clfs:
  try:
    scorer = clf(random_state=42).fit(X_train, y_train)
  except:
    scorer = clf().fit(X_train, y_train)
  predicted = scorer.predict(X_test)
  report = classification_report(y_test, predicted, output_dict=True)
  entry = ({
      'algorithm': clf.__name__,
  })

  for key, val in report.items():
    if type(val) == dict:
      for k, v in val.items():
        entry[f"{key}_{k}"] = v
    else:
      entry[key] = val
  
  result.append(entry)
result = pd.DataFrame(result)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


CPU times: user 263 ms, sys: 151 ms, total: 414 ms
Wall time: 292 ms


  'precision', 'predicted', average, warn_for)


In [None]:
result

Unnamed: 0,algorithm,adpq_1_precision,adpq_1_recall,adpq_1_f1-score,adpq_1_support,adpq_2_precision,adpq_2_recall,adpq_2_f1-score,adpq_2_support,adpq_3_precision,adpq_3_recall,adpq_3_f1-score,adpq_3_support,accuracy,macro avg_precision,macro avg_recall,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support
0,LogisticRegression,0.0,0.0,0.0,15,0.475806,0.786667,0.592965,75,0.444444,0.228571,0.301887,70,0.46875,0.30675,0.338413,0.298284,160,0.417479,0.46875,0.410028,160
1,RidgeClassifier,0.0,0.0,0.0,15,0.490196,0.666667,0.564972,75,0.465517,0.385714,0.421875,70,0.48125,0.318571,0.350794,0.328949,160,0.433443,0.48125,0.449401,160
2,RandomForestClassifier,0.166667,0.133333,0.148148,15,0.48,0.64,0.548571,75,0.4375,0.3,0.355932,70,0.44375,0.361389,0.357778,0.350884,160,0.432031,0.44375,0.426752,160
3,SVC,0.0,0.0,0.0,15,0.46875,1.0,0.638298,75,0.0,0.0,0.0,70,0.46875,0.15625,0.333333,0.212766,160,0.219727,0.46875,0.299202,160
4,DecisionTreeClassifier,0.190476,0.266667,0.222222,15,0.5,0.56,0.528302,75,0.381818,0.3,0.336,70,0.41875,0.357431,0.375556,0.362175,160,0.419278,0.41875,0.415475,160
5,BernoulliNB,0.166667,0.066667,0.095238,15,0.48913,0.6,0.538922,75,0.483871,0.428571,0.454545,70,0.475,0.379889,0.365079,0.362902,160,0.456598,0.475,0.460412,160


## BERT

In [None]:
class MyBatchIterator:
  def __init__(self, texts, batch_size):
    self.texts = texts
    self.batch_size = batch_size
  
  def __iter__(self):
    self.start = 0
    return self
  
  def __next__(self):
    if self.start >= len(self.texts):
      raise StopIteration
    batch = self.texts[self.start:self.start+self.batch_size]
    self.start += self.batch_size
    return batch
  
  def __len__(self):
    return ceil(len(self.texts)/self.batch_size)

class BERTEmbedder:
  def __init__(self, model_name):
    self.tokenizer = BertTokenizer.from_pretrained(model_name)
    self.model = BertModel.from_pretrained(model_name)
  
  def process(self, texts, flatten_method='pooler'):
    tokenized = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=128)
    ids = tokenized['input_ids']
    mask = tokenized['attention_mask']
    processed = self.model(input_ids=ids, attention_mask=mask)

    if flatten_method == 'average':
      return processed['last_hidden_state'].detach().numpy().mean(axis=1)
    elif flatten_method == 'pooler':
      return processed['pooler_output'].detach().numpy()
  
  def process_sample(self, texts, batch_size=4, flatten_method='pooler'):
    text_iter = MyBatchIterator(texts, batch_size=batch_size)
    batches = []

    for batch in tqdm_notebook(text_iter, total=len(text_iter)):
      batches.append(self.process(batch, flatten_method=flatten_method))
    
    return np.concatenate(batches, axis=0)

In [None]:
bert_embedder = BERTEmbedder('bert-base-cased')

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
%%time
X = bert_embedder.process_sample(questions['question'].tolist())

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/160 [00:00<?, ?it/s]

CPU times: user 1min 34s, sys: 2.46 s, total: 1min 37s
Wall time: 1min 37s


In [None]:
%%time
y = questions["category"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clfs = [
        RidgeClassifier,
        RandomForestClassifier, SVC,
        DecisionTreeClassifier, BernoulliNB
]

result = []

for clf in clfs:
  try:
    scorer = clf(random_state=42).fit(X_train, y_train)
  except:
    scorer = clf().fit(X_train, y_train)
  predicted = scorer.predict(X_test)
  report = classification_report(y_test, predicted, output_dict=True)
  entry = ({
      'algorithm': clf.__name__,
  })

  for key, val in report.items():
    if type(val) == dict:
      for k, v in val.items():
        entry[f"{key}_{k}"] = v
    else:
      entry[key] = val
  
  result.append(entry)
result = pd.DataFrame(result)

  'precision', 'predicted', average, warn_for)


CPU times: user 969 ms, sys: 184 ms, total: 1.15 s
Wall time: 1 s


In [None]:
result

Unnamed: 0,algorithm,adpq_1_precision,adpq_1_recall,adpq_1_f1-score,adpq_1_support,adpq_2_precision,adpq_2_recall,adpq_2_f1-score,adpq_2_support,adpq_3_precision,adpq_3_recall,adpq_3_f1-score,adpq_3_support,accuracy,macro avg_precision,macro avg_recall,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support
0,RidgeClassifier,0.0,0.0,0.0,15,0.447619,0.626667,0.522222,75,0.396226,0.3,0.341463,70,0.425,0.281282,0.308889,0.287895,160,0.38317,0.425,0.394182,160
1,RandomForestClassifier,0.0,0.0,0.0,15,0.46875,0.6,0.526316,75,0.466667,0.4,0.430769,70,0.45625,0.311806,0.333333,0.319028,160,0.423893,0.45625,0.435172,160
2,SVC,0.0,0.0,0.0,15,0.46875,1.0,0.638298,75,0.0,0.0,0.0,70,0.46875,0.15625,0.333333,0.212766,160,0.219727,0.46875,0.299202,160
3,DecisionTreeClassifier,0.055556,0.066667,0.060606,15,0.409091,0.48,0.441718,75,0.351852,0.271429,0.306452,70,0.35,0.272166,0.272698,0.269592,160,0.350905,0.35,0.34681,160
4,BernoulliNB,0.333333,0.066667,0.111111,15,0.419048,0.586667,0.488889,75,0.384615,0.285714,0.327869,70,0.40625,0.378999,0.313016,0.30929,160,0.395948,0.40625,0.383026,160


Now let's try averaging BERT vectors

In [None]:
%%time
X = bert_embedder.process_sample(questions['question'].tolist(),
                                 flatten_method='average')

y = questions["category"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clfs = [
        LogisticRegression, RidgeClassifier,
        RandomForestClassifier, SVC,
        DecisionTreeClassifier, BernoulliNB
]

result = []

for clf in clfs:
  try:
    scorer = clf(random_state=42).fit(X_train, y_train)
  except:
    scorer = clf().fit(X_train, y_train)
  predicted = scorer.predict(X_test)
  report = classification_report(y_test, predicted, output_dict=True)
  entry = ({
      'algorithm': clf.__name__,
  })

  for key, val in report.items():
    if type(val) == dict:
      for k, v in val.items():
        entry[f"{key}_{k}"] = v
    else:
      entry[key] = val
  
  result.append(entry)
result = pd.DataFrame(result)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/160 [00:00<?, ?it/s]

  'precision', 'predicted', average, warn_for)


CPU times: user 1min 34s, sys: 12 s, total: 1min 46s
Wall time: 1min 46s


In [None]:
result

Unnamed: 0,algorithm,adpq_1_precision,adpq_1_recall,adpq_1_f1-score,adpq_1_support,adpq_2_precision,adpq_2_recall,adpq_2_f1-score,adpq_2_support,adpq_3_precision,adpq_3_recall,adpq_3_f1-score,adpq_3_support,accuracy,macro avg_precision,macro avg_recall,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support
0,LogisticRegression,0.285714,0.133333,0.181818,15,0.485149,0.653333,0.556818,75,0.480769,0.357143,0.409836,70,0.475,0.417211,0.38127,0.382824,160,0.464536,0.475,0.457357,160
1,RidgeClassifier,0.3,0.2,0.24,15,0.452632,0.573333,0.505882,75,0.381818,0.3,0.336,70,0.41875,0.37815,0.357778,0.360627,160,0.407342,0.41875,0.406632,160
2,RandomForestClassifier,0.0,0.0,0.0,15,0.491379,0.76,0.596859,75,0.512821,0.285714,0.366972,70,0.48125,0.334733,0.348571,0.321277,160,0.454693,0.48125,0.440328,160
3,SVC,0.0,0.0,0.0,15,0.46875,1.0,0.638298,75,0.0,0.0,0.0,70,0.46875,0.15625,0.333333,0.212766,160,0.219727,0.46875,0.299202,160
4,DecisionTreeClassifier,0.083333,0.066667,0.074074,15,0.5,0.573333,0.534161,75,0.532258,0.471429,0.5,70,0.48125,0.371864,0.370476,0.369412,160,0.47505,0.48125,0.476083,160
5,BernoulliNB,0.090909,0.133333,0.108108,15,0.486842,0.493333,0.490066,75,0.435484,0.385714,0.409091,70,0.4125,0.337745,0.33746,0.335755,160,0.427254,0.4125,0.418831,160


Let's try with BERT large:

In [None]:
%%time
embedder = BERTEmbedder('bert-large-cased')
X = embedder.process_sample(questions['question'].tolist(),
                                 flatten_method='average')

y = questions["category"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clfs = [
        LogisticRegression, RidgeClassifier,
        RandomForestClassifier, SVC,
        DecisionTreeClassifier, BernoulliNB
]

result = []

for clf in clfs:
  try:
    scorer = clf(random_state=42).fit(X_train, y_train)
  except:
    scorer = clf().fit(X_train, y_train)
  predicted = scorer.predict(X_test)
  report = classification_report(y_test, predicted, output_dict=True)
  entry = ({
      'algorithm': clf.__name__,
  })

  for key, val in report.items():
    if type(val) == dict:
      for k, v in val.items():
        entry[f"{key}_{k}"] = v
    else:
      entry[key] = val
  
  result.append(entry)
result = pd.DataFrame(result)

Some weights of the model checkpoint at bert-large-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/160 [00:00<?, ?it/s]

  'precision', 'predicted', average, warn_for)


CPU times: user 5min 29s, sys: 12.6 s, total: 5min 42s
Wall time: 5min 41s


In [None]:
result

Unnamed: 0,algorithm,adpq_1_precision,adpq_1_recall,adpq_1_f1-score,adpq_1_support,adpq_2_precision,adpq_2_recall,adpq_2_f1-score,adpq_2_support,adpq_3_precision,adpq_3_recall,adpq_3_f1-score,adpq_3_support,accuracy,macro avg_precision,macro avg_recall,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support
0,LogisticRegression,0.333333,0.133333,0.190476,15,0.489796,0.64,0.554913,75,0.5,0.4,0.444444,70,0.4875,0.441043,0.391111,0.396611,160,0.479592,0.4875,0.472417,160
1,RidgeClassifier,0.125,0.133333,0.129032,15,0.480519,0.493333,0.486842,75,0.477612,0.457143,0.467153,70,0.44375,0.361044,0.36127,0.361009,160,0.445917,0.44375,0.444684,160
2,RandomForestClassifier,0.0,0.0,0.0,15,0.482143,0.72,0.57754,75,0.545455,0.342857,0.421053,70,0.4875,0.342532,0.354286,0.332864,160,0.464641,0.4875,0.454932,160
3,SVC,0.0,0.0,0.0,15,0.46875,1.0,0.638298,75,0.0,0.0,0.0,70,0.46875,0.15625,0.333333,0.212766,160,0.219727,0.46875,0.299202,160
4,DecisionTreeClassifier,0.1,0.133333,0.114286,15,0.488889,0.586667,0.533333,75,0.4,0.285714,0.333333,70,0.4125,0.32963,0.335238,0.326984,160,0.413542,0.4125,0.406548,160
5,BernoulliNB,0.090909,0.133333,0.108108,15,0.4625,0.493333,0.477419,75,0.482759,0.4,0.4375,70,0.41875,0.345389,0.342222,0.341009,160,0.436526,0.41875,0.425332,160


Попробуем с GPT-2:

In [None]:
class GPT2Embedder(BERTEmbedder):
  def __init__(self, model_name):
    self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    self.tokenizer.pad_token = self.tokenizer.eos_token
    self.model = GPT2Model.from_pretrained(model_name)
  
  def process(self, texts, flatten_method='pooler'):
    tokenized = self.tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=128)
    ids = tokenized['input_ids']
    mask = tokenized['attention_mask']
    processed = self.model(input_ids=ids, attention_mask=mask)

    if flatten_method == 'average':
      return processed['last_hidden_state'].detach().numpy().mean(axis=1)
    elif flatten_method == 'pooler':
      return processed['pooler_output'].detach().numpy()

In [None]:
%%time
embedder = GPT2Embedder("gpt2")
X = embedder.process_sample(questions['question'].tolist(),
                                 flatten_method='average')

y = questions["category"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clfs = [
        LogisticRegression, RidgeClassifier,
        RandomForestClassifier, SVC,
        DecisionTreeClassifier, BernoulliNB
]

result = []

for clf in clfs:
  try:
    scorer = clf(random_state=42).fit(X_train, y_train)
  except:
    scorer = clf().fit(X_train, y_train)
  predicted = scorer.predict(X_test)
  report = classification_report(y_test, predicted, output_dict=True)
  entry = ({
      'algorithm': clf.__name__,
  })

  for key, val in report.items():
    if type(val) == dict:
      for k, v in val.items():
        entry[f"{key}_{k}"] = v
    else:
      entry[key] = val
  
  result.append(entry)
result = pd.DataFrame(result)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/160 [00:00<?, ?it/s]

  'precision', 'predicted', average, warn_for)


CPU times: user 1min 39s, sys: 18.6 s, total: 1min 58s
Wall time: 1min 57s


In [None]:
result

Unnamed: 0,algorithm,adpq_1_precision,adpq_1_recall,adpq_1_f1-score,adpq_1_support,adpq_2_precision,adpq_2_recall,adpq_2_f1-score,adpq_2_support,adpq_3_precision,adpq_3_recall,adpq_3_f1-score,adpq_3_support,accuracy,macro avg_precision,macro avg_recall,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support
0,LogisticRegression,0.0,0.0,0.0,15,0.47,0.626667,0.537143,75,0.518519,0.4,0.451613,70,0.46875,0.329506,0.342222,0.329585,160,0.447164,0.46875,0.449366,160
1,RidgeClassifier,0.1875,0.2,0.193548,15,0.483871,0.6,0.535714,75,0.509804,0.371429,0.429752,70,0.4625,0.393725,0.390476,0.386338,160,0.467432,0.4625,0.457278,160
2,RandomForestClassifier,0.0,0.0,0.0,15,0.465517,0.72,0.565445,75,0.4,0.228571,0.290909,70,0.4375,0.288506,0.31619,0.285451,160,0.393211,0.4375,0.392325,160
3,SVC,0.0,0.0,0.0,15,0.468254,0.786667,0.587065,75,0.470588,0.228571,0.307692,70,0.46875,0.312947,0.338413,0.298252,160,0.425376,0.46875,0.409802,160
4,DecisionTreeClassifier,0.24,0.4,0.3,15,0.5,0.56,0.528302,75,0.509804,0.371429,0.429752,70,0.4625,0.416601,0.44381,0.419351,160,0.479914,0.4625,0.463783,160
5,BernoulliNB,0.181818,0.266667,0.216216,15,0.547619,0.613333,0.578616,75,0.555556,0.428571,0.483871,70,0.5,0.428331,0.43619,0.426235,160,0.516797,0.5,0.50319,160


In [None]:
gpt2_embedder = embedder

## Inspector

In [None]:
os.chdir('lab_inspector/Inspector')

In [None]:
from main import main



In [None]:
inspector_df = []

for index, row in tqdm_notebook(questions.iterrows(), total=len(questions)):
  inspector_df.append(main(row['question']))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


  0%|          | 0/640 [00:00<?, ?it/s]

In [None]:
inspector_df = pd.DataFrame(inspector_df,
                            index=questions.index)

In [None]:
inspector_df.shape

(640, 72)

In [None]:
X, y = inspector_df, questions['category']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clfs = [
        LogisticRegression, RidgeClassifier,
        RandomForestClassifier, SVC,
        DecisionTreeClassifier, BernoulliNB
]

result = []

for clf in clfs:
  try:
    scorer = clf(random_state=42).fit(X_train, y_train)
  except:
    scorer = clf().fit(X_train, y_train)
  predicted = scorer.predict(X_test)
  report = classification_report(y_test, predicted, output_dict=True)
  entry = ({
      'algorithm': clf.__name__,
  })

  for key, val in report.items():
    if type(val) == dict:
      for k, v in val.items():
        entry[f"{key}_{k}"] = v
    else:
      entry[key] = val
  
  result.append(entry)
result = pd.DataFrame(result)

  'precision', 'predicted', average, warn_for)


In [None]:
result

Unnamed: 0,algorithm,adpq_1_precision,adpq_1_recall,adpq_1_f1-score,adpq_1_support,adpq_2_precision,adpq_2_recall,adpq_2_f1-score,adpq_2_support,adpq_3_precision,adpq_3_recall,adpq_3_f1-score,adpq_3_support,accuracy,macro avg_precision,macro avg_recall,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support
0,LogisticRegression,0.0,0.0,0.0,15,0.435185,0.626667,0.513661,75,0.384615,0.285714,0.327869,70,0.41875,0.273267,0.304127,0.28051,160,0.372262,0.41875,0.384221,160
1,RidgeClassifier,0.0,0.0,0.0,15,0.448598,0.64,0.527473,75,0.365385,0.271429,0.311475,70,0.41875,0.271328,0.30381,0.279649,160,0.370136,0.41875,0.383523,160
2,RandomForestClassifier,0.0,0.0,0.0,15,0.443396,0.626667,0.519337,75,0.42,0.3,0.35,70,0.425,0.287799,0.308889,0.289779,160,0.391592,0.425,0.396564,160
3,SVC,0.0,0.0,0.0,15,0.463768,0.853333,0.600939,75,0.333333,0.1,0.153846,70,0.44375,0.2657,0.317778,0.251595,160,0.363225,0.44375,0.348998,160
4,DecisionTreeClassifier,0.05,0.066667,0.057143,15,0.415584,0.426667,0.421053,75,0.365079,0.328571,0.345865,70,0.35,0.276888,0.273968,0.274687,160,0.359215,0.35,0.354041,160
5,BernoulliNB,0.333333,0.2,0.25,15,0.516854,0.613333,0.560976,75,0.483871,0.428571,0.454545,70,0.49375,0.444686,0.413968,0.42184,160,0.485219,0.49375,0.485258,160


### BERT + Inspector

In [None]:
bert_embedder = BERTEmbedder('bert-base-cased')
X2 = bert_embedder.process_sample(questions['question'].tolist(),
                                 flatten_method='average')

X = np.concatenate((X, X2), axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

clfs = [
        LogisticRegression, RidgeClassifier,
        RandomForestClassifier, SVC,
        DecisionTreeClassifier, BernoulliNB
]

result = []

for clf in clfs:
  try:
    scorer = clf(random_state=42).fit(X_train, y_train)
  except:
    scorer = clf().fit(X_train, y_train)
  predicted = scorer.predict(X_test)
  report = classification_report(y_test, predicted, output_dict=True)
  entry = ({
      'algorithm': clf.__name__,
  })

  for key, val in report.items():
    if type(val) == dict:
      for k, v in val.items():
        entry[f"{key}_{k}"] = v
    else:
      entry[key] = val
  
  result.append(entry)
result = pd.DataFrame(result)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/160 [00:00<?, ?it/s]

  'precision', 'predicted', average, warn_for)


In [None]:
result

Unnamed: 0,algorithm,adpq_1_precision,adpq_1_recall,adpq_1_f1-score,adpq_1_support,adpq_2_precision,adpq_2_recall,adpq_2_f1-score,adpq_2_support,adpq_3_precision,adpq_3_recall,adpq_3_f1-score,adpq_3_support,accuracy,macro avg_precision,macro avg_recall,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support
0,LogisticRegression,0.333333,0.133333,0.190476,15,0.479592,0.626667,0.543353,75,0.464286,0.371429,0.412698,70,0.46875,0.425737,0.377143,0.382176,160,0.459184,0.46875,0.453109,160
1,RidgeClassifier,0.181818,0.133333,0.153846,15,0.433333,0.52,0.472727,75,0.40678,0.342857,0.372093,70,0.40625,0.340644,0.332063,0.332889,160,0.398137,0.40625,0.398805,160
2,RandomForestClassifier,0.0,0.0,0.0,15,0.453704,0.653333,0.535519,75,0.510638,0.342857,0.410256,70,0.45625,0.321447,0.332063,0.315259,160,0.436078,0.45625,0.430512,160
3,SVC,0.0,0.0,0.0,15,0.485915,0.92,0.635945,75,0.666667,0.171429,0.272727,70,0.50625,0.384194,0.36381,0.302891,160,0.51944,0.50625,0.417417,160
4,DecisionTreeClassifier,0.090909,0.066667,0.076923,15,0.473118,0.586667,0.52381,75,0.464286,0.371429,0.412698,70,0.44375,0.342771,0.341587,0.33781,160,0.433422,0.44375,0.433303,160
5,BernoulliNB,0.130435,0.2,0.157895,15,0.486111,0.466667,0.47619,75,0.476923,0.442857,0.459259,70,0.43125,0.36449,0.369841,0.364448,160,0.448747,0.43125,0.438943,160


### BERT + TFIDF + Inspector

In [None]:
print(train_test_split(['a','b','c','d','e','f','g','h','i','j'], random_state=42))
print(train_test_split(list(range(10)), random_state=42))

[['a', 'h', 'c', 'j', 'e', 'd', 'g'], ['i', 'b', 'f']]
[[0, 7, 2, 9, 4, 3, 6], [8, 1, 5]]


In [None]:
text_train, text_test = train_test_split(questions['question'], random_state=42)

In [None]:
vec = TfidfVectorizer().fit(text_train)
X_train3 = vec.transform(text_train)
X_test3 = vec.transform(text_test)
svd = TruncatedSVD(n_components=500).fit(X_train3)
X_train3 = svd.transform(X_train3)
X_test3 = svd.transform(X_test3)

In [None]:
X_train = np.concatenate((X_train, X_train3), axis=1)
X_test = np.concatenate((X_test, X_test3), axis=1)

In [None]:
svd2 = TruncatedSVD(n_components=300).fit(X_train)
svd2.explained_variance_ratio_.sum()

0.9999780270940863

In [None]:
X_train = svd2.transform(X_train)
X_test = svd2.transform(X_test)

In [None]:
clfs = [
        LogisticRegression, RidgeClassifier,
        RandomForestClassifier, SVC,
        DecisionTreeClassifier, BernoulliNB
]

result = []

for clf in clfs:
  try:
    scorer = clf(random_state=42).fit(X_train, y_train)
  except:
    scorer = clf().fit(X_train, y_train)
  predicted = scorer.predict(X_test)
  report = classification_report(y_test, predicted, output_dict=True)
  entry = ({
      'algorithm': clf.__name__,
  })

  for key, val in report.items():
    if type(val) == dict:
      for k, v in val.items():
        entry[f"{key}_{k}"] = v
    else:
      entry[key] = val
  
  result.append(entry)
result = pd.DataFrame(result)



In [None]:
result

Unnamed: 0,algorithm,adpq_1_precision,adpq_1_recall,adpq_1_f1-score,adpq_1_support,adpq_2_precision,adpq_2_recall,adpq_2_f1-score,adpq_2_support,adpq_3_precision,adpq_3_recall,adpq_3_f1-score,adpq_3_support,accuracy,macro avg_precision,macro avg_recall,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support
0,LogisticRegression,0.428571,0.2,0.272727,15,0.469388,0.613333,0.531792,75,0.454545,0.357143,0.4,70,0.4625,0.450835,0.390159,0.401506,160,0.459068,0.4625,0.449846,160
1,RidgeClassifier,0.181818,0.133333,0.153846,15,0.471264,0.546667,0.506173,75,0.451613,0.4,0.424242,70,0.44375,0.368232,0.36,0.36142,160,0.435531,0.44375,0.437298,160
2,RandomForestClassifier,0.0,0.0,0.0,15,0.471698,0.666667,0.552486,75,0.461538,0.342857,0.393443,70,0.4625,0.311079,0.336508,0.31531,160,0.423032,0.4625,0.431109,160
3,SVC,0.0,0.0,0.0,15,0.482517,0.92,0.633028,75,0.625,0.142857,0.232558,70,0.49375,0.369172,0.354286,0.288529,160,0.499618,0.49375,0.398476,160
4,DecisionTreeClassifier,0.125,0.133333,0.129032,15,0.47619,0.533333,0.503145,75,0.4,0.342857,0.369231,70,0.4125,0.33373,0.336508,0.333803,160,0.409933,0.4125,0.409484,160
5,BernoulliNB,0.285714,0.133333,0.181818,15,0.417582,0.506667,0.457831,75,0.354839,0.314286,0.333333,70,0.3875,0.352712,0.318095,0.324328,160,0.377769,0.3875,0.377487,160


Пока что лучший результат - Inspector

### Inspector + TF-IDF

In [None]:
X_inspector = inspector_df

x_text = questions["question"]
text_train, text_test = train_test_split(x_text, random_state=42)

vec = TfidfVectorizer().fit(text_train)

X_tfidf_train = vec.transform(text_train)
X_tfidf_test = vec.transform(text_test)

SVD = TruncatedSVD(n_components=408).fit(X_tfidf_train)

X_tfidf_train = SVD.transform(X_tfidf_train)
X_tfidf_test = SVD.transform(X_tfidf_test)

X_inspector_train, X_inspector_test = train_test_split(X_inspector, random_state=42)

X_train = np.concatenate((X_tfidf_train, X_inspector_train), axis=1)
X_test = np.concatenate((X_tfidf_test, X_inspector_test), axis=1)

y = questions["category"]

y_train, y_test = train_test_split(y, random_state=42)

In [None]:
SVD.explained_variance_ratio_.sum()

0.9747853828975686

In [None]:
clfs = [
        LogisticRegression, RidgeClassifier,
        RandomForestClassifier, SVC,
        DecisionTreeClassifier, BernoulliNB
]

result = []

for clf in clfs:
  try:
    scorer = clf(random_state=42).fit(X_train, y_train)
  except:
    scorer = clf().fit(X_train, y_train)
  predicted = scorer.predict(X_test)
  report = classification_report(y_test, predicted, output_dict=True)
  entry = ({
      'algorithm': clf.__name__,
  })

  for key, val in report.items():
    if type(val) == dict:
      for k, v in val.items():
        entry[f"{key}_{k}"] = v
    else:
      entry[key] = val
  
  result.append(entry)
result = pd.DataFrame(result)

  'precision', 'predicted', average, warn_for)


In [None]:
result

Unnamed: 0,algorithm,adpq_1_precision,adpq_1_recall,adpq_1_f1-score,adpq_1_support,adpq_2_precision,adpq_2_recall,adpq_2_f1-score,adpq_2_support,adpq_3_precision,adpq_3_recall,adpq_3_f1-score,adpq_3_support,accuracy,macro avg_precision,macro avg_recall,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support
0,LogisticRegression,0.0,0.0,0.0,15,0.451923,0.626667,0.52514,75,0.418182,0.328571,0.368,70,0.4375,0.290035,0.318413,0.297713,160,0.394793,0.4375,0.407159,160
1,RidgeClassifier,0.333333,0.066667,0.111111,15,0.447917,0.573333,0.502924,75,0.442623,0.385714,0.412214,70,0.44375,0.407958,0.341905,0.342083,160,0.434858,0.44375,0.426506,160
2,RandomForestClassifier,0.0,0.0,0.0,15,0.464912,0.706667,0.560847,75,0.409091,0.257143,0.315789,70,0.44375,0.291334,0.32127,0.292212,160,0.396905,0.44375,0.401055,160
3,SVC,0.0,0.0,0.0,15,0.482517,0.92,0.633028,75,0.647059,0.157143,0.252874,70,0.5,0.376525,0.359048,0.2953,160,0.509268,0.5,0.407364,160
4,DecisionTreeClassifier,0.095238,0.133333,0.111111,15,0.452381,0.506667,0.477987,75,0.436364,0.342857,0.384,70,0.4,0.327994,0.327619,0.324366,160,0.411891,0.4,0.402473,160
5,BernoulliNB,0.166667,0.133333,0.148148,15,0.512821,0.533333,0.522876,75,0.457143,0.457143,0.457143,70,0.4625,0.378877,0.374603,0.376056,160,0.45601,0.4625,0.458987,160


## Saving Best Models

In [None]:
os.chdir('..')

In [None]:
os.chdir('..')

In [None]:
y = questions["category"]

In [None]:
os.listdir()

['.config',
 'LIST',
 'meta.json',
 'README',
 'drive',
 'model.bin',
 'model.txt',
 'lab_inspector',
 '40.zip',
 'sample_data']

In [None]:
with open('drive/My Drive/QuestionLevelPrediction/BertEmbedder.pickle','wb') as outp:
  pickle.dump(bert_embedder, outp)

In [None]:
with open('drive/My Drive/QuestionLevelPrediction/GPT2Embedder.pickle','wb') as outp:
  pickle.dump(gpt2_embedder, outp)

In [None]:
X_bert = bert_embedder.process_sample(questions['question'].tolist(), flatten_method='average')

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/160 [00:00<?, ?it/s]

In [None]:
X_gpt2 = gpt2_embedder.process_sample(questions['question'].tolist(), flatten_method='average')

In [None]:
bernoulli_bert = BernoulliNB().fit(X_bert, y)
bernoulli_gpt2 = BernoulliNB().fit(X_gpt2, y)

In [None]:
bert_tree = DecisionTreeClassifier(random_state=42).fit(X_bert, y)

In [None]:
with open('drive/My Drive/QuestionLevelPrediction/BernoulliBERT.pickle','wb') as outp:
  pickle.dump(bernoulli_bert, outp)

with open('drive/My Drive/QuestionLevelPrediction/BernoulliGPT2.pickle','wb') as outp:
  pickle.dump(bernoulli_gpt2, outp)

In [None]:
with open('drive/MyDrive/QuestionLevelPrediction/TreeBERT.pickle','wb') as outp:
  pickle.dump(bert_tree, outp)

In [None]:
inspector_tree = DecisionTreeClassifier(random_state=42).fit(inspector_df, y)

In [None]:
with open('drive/My Drive/QuestionLevelPrediction/InspectorTree.pickle','wb') as outp:
  pickle.dump(inspector_tree, outp)

## Derived features