In [1]:
!pip install sentence-transformers pylatexenc

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.7/79.7 KB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
[?25hCollecting pylatexenc
  Downloading pylatexenc-2.10.tar.gz (162 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m162.6/162.6 KB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l- done
Building wheels for collected packages: sentence-transformers, pylatexenc
  Building wheel for sentence-transformers (setup.py) ... [?25l- \ | done
[?25h  Created wheel for sentence-transformers: filename=sentence_transformers-2.2.0-py3-none-any.whl size=120747 sha256=e8c9bb724c640774da4881567e7abeed837909c34428152ac47996ffd29f59ef
  Stored in directory: /root/.cache/pip/wheels/83/c0/df/b6873ab7aac3f2465aa9144b6b4c41c4391cfecc027c8b07e7
  Building wheel for

In [2]:
from transformers import pipeline
import numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import nltk
import string
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import torch

from pylatexenc.latex2text import LatexNodes2Text

In [3]:
tqdm.pandas()
docs_df = pd.read_csv('../input/task-2-data/to_use.csv')
df = docs_df.sample(5000, random_state=42)


def clean_abstract(txt, stp_wrds):
    txt = txt.lower()
    try:
        txt = LatexNodes2Text().latex_to_text(txt)
    except:
        txt = txt.lower()
    sup = word_tokenize(txt, language="english")
    lemmatizer = WordNetLemmatizer()
    sup = [lemmatizer.lemmatize(word) for word in sup]
    ret = []
    for el in sup:
        el.replace('\\', '')
        if el not in stp_wrds and el not in string.punctuation:
            ret.append(el)
    return ' '.join(ret)


stop_words = nltk.corpus.stopwords.words("english")
stop_words += ['we', 'paper', 'new', 'article', "''", "``", "”", 'et', 'al', 'study']
cleaned_abstract = df['abstract'].progress_apply(clean_abstract, args=[stop_words])
df['abstract_uncleaned'] = df['abstract'].copy()
df['abstract'] = cleaned_abstract

  exec(code_obj, self.user_global_ns, self.user_ns)


  0%|          | 0/5000 [00:00<?, ?it/s]

In [4]:
hyp = df['categories'].unique().tolist()
sentences = df['abstract'].tolist()[:100]
torch.cuda.empty_cache()

In [5]:
normal_cats = {'math': 'math',
               'cond-mat': 'condensed matter',
               'math-ph' : 'mathematical physics',
               'astro-ph': 'Astrophysics',
               'cs': 'computer science',
               'econ': 'econometrics',
               'alg-geom' : 'math',
               'funct-an': 'math',
               'eess': 'electrical engineering and systems science',
               'gr-qc': 'general relativity and quantum cosmology',
               'hep-ex': 'high energy physics',
               'hep-lat': 'high energy physics',
               'hep-ph': 'high energy physics',
               'hep-th': 'high energy physics',
               'nlin': 'nonlinear sciences',
               'nucl-ex': 'nuclear experiment',
               'nucl-th': 'nuclear theory',
               'physics': 'physics',
               'q-bio': 'quantitative biology',
               'q-fin': 'quantitative finance',
               'quant-ph': 'quantum physics',
               'stat': 'statistics',
               'q-alg': 'quantitative algebra'}

hyp_ = [normal_cats[el] for el in hyp]

In [6]:
class My_ZH:
    def __init__(self, mdl):
        self.pipe = pipeline("zero-shot-classification", model=mdl)

    @torch.no_grad()
    def predict(self, sent, hyp):
        res = self.pipe(sent, hyp)
        return hyp[np.argmax(np.array(res['scores']))]


    def run_predict(self, sents, hyp):
        ret = []
        for sent in tqdm(sents, desc='Classifying abstracts'):
            ret.append(self.predict(sent, hyp))
        return ret

In [7]:
model = My_ZH('cross-encoder/nli-distilroberta-base')

Downloading:   0%|          | 0.00/701 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/313M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

In [8]:
y_pred = model.run_predict(sentences, hyp)

Classifying abstracts:   0%|          | 0/100 [00:00<?, ?it/s]

In [9]:
y_true = df['categories'].tolist()[:100]

In [10]:
from sklearn.metrics import accuracy_score
accuracy_score(y_true, y_pred)

0.26

In [11]:
for tr, pr in list(zip(y_true, y_pred)):
    print(tr, pr)

math math
math math
math math
q-bio math
cond-mat math
gr-qc math
astro-ph math
physics math
astro-ph math
math math
cs math
nucl-th math
cs math
cs math
hep-ph math
quant-ph math
hep-ph math
cs math
math math
stat math
cs math
cs math
cond-mat math
nlin math
cond-mat math
math math
cs math
stat math
cond-mat math
cs math
cs math
math math
nucl-th math
hep-ph math
cond-mat math
cs math
math math
cs math
cs math
nlin math
astro-ph math
cond-mat math
cs math
cs math
astro-ph math
cs math
physics math
cs math
math math
astro-ph math
cs math
quant-ph math
cs math
physics math
cs math
math math
math math
hep-th math
quant-ph math
math math
cs math
cs math
cs math
physics math
math math
math math
math math
cs math
cond-mat math
gr-qc math
math math
math math
astro-ph math
cs math
cs math
hep-th math
cond-mat math
cs math
cond-mat math
math math
physics math
cond-mat math
cs math
cs math
math math
cs math
cs math
math math
cs math
physics math
cs math
math math
math math
math math
math math
m