<a href="https://colab.research.google.com/github/oserikov/few-shots-exeperiments/blob/master/few_shot_experiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# experiments with the [paper](https://arxiv.org/pdf/1804.02063.pdf)

paper title: Few-Shot Text Classification with Pre-Trained Word Embeddings and a Human in the Loop

[paper code](https://github.com/katbailey/few-shot-text-classification/blob/master/Few-Shot-Text-Classification.ipynb)

Cells that require user input are marked with a gear ⚙️.

## preliminary set-up
install and import modules, download some data

⏳ took ~5 minutes for me

### install prerequisites

In [1]:
# Install gluonnlp, mxnet and bert-embedding (mxnet and bert_embedding are imported in the imports cell).
# The grep filter only drops pip's "Building wheel ... done" lines from the cell output.
!pip install --progress-bar off --quiet gluonnlp | grep -v -P "\s*Building wheel\.+done\s*"
# --no-dependencies: install just the named package, without the requirements it declares.
!pip install --progress-bar off --quiet --no-dependencies mxnet
!pip install --progress-bar off --quiet --no-dependencies bert-embedding

[?25l
[?25h

In [2]:
# Clone the paper's code repository and fetch its git submodules.
!git clone --quiet https://github.com/katbailey/few-shot-text-classification.git
%cd -q few-shot-text-classification
!git submodule --quiet init
!git submodule --quiet update
# Install the requirements listed in SIF/requirements.txt, then step back to the repository root.
%cd -q SIF/
!pip install --progress-bar off --quiet -r requirements.txt | grep -v -P "\s*Building wheel\.+done\s*"
%cd ../
# Patch Python 2 idioms in the SIF sources in place: dict.iteritems() -> items(), xrange -> range.
# NOTE: the kernel's working directory stays inside the cloned repository after this cell,
# so later relative paths such as "SIF/auxiliary_data/..." are resolved from there.
!sed -i -e "s|iteritems()|items()|g" SIF/src/data_io.py
!sed -i -e "s|xrange|range|g" SIF/src/data_io.py
!sed -i -e "s|xrange|range|g" SIF/src/SIF_embedding.py

/content/few-shot-text-classification


### imports
(+ logging setup)

In [0]:
import os
import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import seaborn as sns
import mxnet as mx
from bert_embedding import BertEmbedding
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from sklearn import preprocessing

import en_core_web_sm
# nlp = en_core_web_sm.load()
import spacy
from spacy.lang.en import English
from spacy import displacy

from IPython.display import HTML
import logging

from sklearn.datasets import fetch_20newsgroups
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from operator import itemgetter
from itertools import cycle, islice
import pandas as pd
import numpy as np
import itertools



import utils
import sif_embedding_wrapper

In [0]:
# Limit TensorFlow logging to the ERROR verbosity level.
tf.logging.set_verbosity(tf.logging.ERROR)

### download some pretrained models

In [0]:
# fastText English vectors; W2V_ENCODER later loads "wiki-news-300d-1M.vec"
# (presumably the file contained in this archive - confirm after unzipping).
!wget --quiet https://dl.fbaipublicfiles.com/fasttext/vectors-english/wiki-news-300d-1M.vec.zip
!unzip -qq  wiki-news-300d-1M.vec.zip

# GloVe 6B vectors; GLOVE_ENCODER later loads "glove.6B.300d.txt"
# (presumably one of the files contained in this archive - confirm after unzipping).
!wget --quiet http://nlp.stanford.edu/data/glove.6B.zip
!unzip -qq glove.6B.zip

### download the paper-reproduction-data
(downloader func definition)

In [0]:
def create_dataset_for_newsgroup_pair(category_pair):
    """Build the text table, ground truth and label list for the given newsgroups.

    The training split of 20 Newsgroups is fetched for ``category_pair`` with
    headers, footers and quotes removed.  Documents get 1-based string ids in
    fetch order, and the returned rows are ordered by those ids *as strings*
    ("1", "10", "100", "2", ...).

    Labels are the last dot-separated component of the newsgroup name, or the
    second-to-last one when the last component is "misc".

    Returns a tuple ``(text_df, truth_dict, categories)``:
      * text_df    - DataFrame with the columns "doc_id" and "text"
      * truth_dict - dict mapping doc_id to its label
      * categories - distinct labels in order of first appearance in text_df
    """
    newsgroups = fetch_20newsgroups(
        subset='train',
        categories=category_pair,
        remove=('headers', 'footers', 'quotes'),
    )

    def short_label(category_index):
        # e.g. "rec.autos" -> "autos", "talk.politics.misc" -> "politics"
        name_parts = newsgroups.target_names[category_index].split(".")
        if name_parts[-1] == "misc":
            return name_parts[-2]
        return name_parts[-1]

    rows = [
        (
            str(position + 1),
            raw_text.strip().strip('"'),
            short_label(newsgroups.target[position]),
        )
        for position, raw_text in enumerate(newsgroups.data)
    ]
    # Ids are unique, so ordering by the id string alone fully determines the row order.
    rows.sort(key=lambda row: row[0])

    text_df = pd.DataFrame({
        "doc_id": [doc_id for doc_id, _, _ in rows],
        "text": [text for _, text, _ in rows],
    })
    truth_dict = {doc_id: label for doc_id, _, label in rows}
    # dict.fromkeys keeps the first occurrence of each label, in row order.
    categories = list(dict.fromkeys(label for _, _, label in rows))
    return text_df, truth_dict, categories


## our experiments logic

### ⚙️upload our dataset

In [7]:
from google.colab import files
# Ask the user to upload the dataset file; the result is used below as a mapping keyed by file name.
uploaded = files.upload()

# The first uploaded file is taken as the dataset, e.g. "clf_df.tsv".
# NOTE(review): indexing [0] fails with IndexError when nothing was uploaded.
DATASET_FN = list(uploaded.keys())[0]  #"clf_df.tsv"

Saving clf_df.tsv to clf_df.tsv


### various encoders definitions
so that they can be compared

In [0]:
class UNIVERSAL_ENCODER:
    """Sentence encoder backed by the TF-Hub Universal Sentence Encoder module.

    The hub module is applied to a string placeholder exactly once, in
    ``__init__``.  ``embed_sentences`` then only feeds that placeholder.
    Applying ``self.embed(...)`` on every request (as was done before) adds a
    new copy of the module's ops to the TensorFlow graph with each call.

    Attributes set by ``__init__``:
      * embed       - the loaded ``hub.Module``
      * text_input  - 1-D string placeholder the sentences are fed into
      * embeddings  - output tensor of ``embed`` applied to ``text_input``
      * session     - ``tf.Session`` with variables and tables initialised
    """
    module_url = "https://tfhub.dev/google/universal-sentence-encoder/2"

    # "https://tfhub.dev/google/universal-sentence-encoder-large/3"
    def __init__(self):
        print("BEFORE __init__")
        self.embed = hub.Module(self.module_url)
        # Build the inference ops once; they are reused by every embed_sentences call.
        self.text_input = tf.placeholder(dtype=tf.string, shape=[None])
        self.embeddings = self.embed(self.text_input)
        self.session = tf.Session()
        self.session.run([tf.global_variables_initializer(), tf.tables_initializer()])
        print("AFTER __init__")

    def embed_sentences(self, sentences_list):
        """Return one embedding (a list of floats) per sentence.

        ``sentences_list`` may be any iterable of strings.  An empty input
        yields an empty list without running the session.
        """
        sentences = list(sentences_list)
        print("BEFORE prediction")
        if sentences:
            sentences_embeddings_ndarray = self.session.run(
                self.embeddings, feed_dict={self.text_input: sentences})
        else:
            sentences_embeddings_ndarray = []
        print("AFTER prediciton")
        return np.asarray(sentences_embeddings_ndarray).tolist()

    def embed_words(self, words_list):
        """Embed single words; each word is encoded as a one-word sentence."""
        return self.embed_sentences(words_list)


class BERT_ENCODER:
    """Sentence encoder that mean-pools the output of ``BertEmbedding``."""

    def __init__(self):
        print("BEFORE __init__")
        # Runs on the CPU context; pass mx.gpu(0) instead to use a GPU.
        self.bert_embedding = BertEmbedding(ctx=mx.cpu())
        print("AFTER __init__")

    def embed_sentences(self, sentences_list):
        """Return one vector (a list of floats) per sentence.

        Each item returned by ``self.bert_embedding`` is indexed at position 1
        (presumably the per-token vectors of a (tokens, vectors) pair - confirm
        against the bert_embedding package) and averaged over its first axis.
        """
        print("BEFORE prediction")
        per_sentence_output = self.bert_embedding(sentences_list)

        print("AFTER prediciton")
        pooled_vectors = []
        for entry in per_sentence_output:
            token_vectors = entry[1]
            pooled_vectors.append(np.mean(token_vectors, axis=0).tolist())
        return pooled_vectors


class ELMO_ENCODER:
    """Sentence encoder that averages the "elmo" output of the TF-Hub ELMo module.

    The hub module is applied to a string placeholder exactly once, in
    ``__init__``.  ``embed_sentences`` then only feeds that placeholder.
    Applying ``self.embed(...)`` on every request (as was done before) adds a
    new copy of the module's ops to the TensorFlow graph with each call.

    Attributes set by ``__init__``:
      * embed            - the loaded ``hub.Module``
      * text_input       - 1-D string placeholder the sentences are fed into
      * token_embeddings - the "elmo" entry of the module's "default" signature output
      * session          - ``tf.Session`` with variables and tables initialised
    """
    module_url = "https://tfhub.dev/google/elmo/2"

    def __init__(self):
        print("BEFORE __init__")
        self.embed = hub.Module(self.module_url)
        # Build the inference ops once; they are reused by every embed_sentences call.
        self.text_input = tf.placeholder(dtype=tf.string, shape=[None])
        self.token_embeddings = self.embed(self.text_input,
                                           signature="default",
                                           as_dict=True)["elmo"]
        self.session = tf.Session()
        self.session.run([tf.global_variables_initializer(), tf.tables_initializer()])
        print("AFTER __init__")

    def embed_sentences(self, sentences_list):
        """Return one vector (a list of floats) per sentence.

        ``sentences_list`` may be any iterable of strings.  An empty input
        yields an empty list without running the session.
        """
        sentences = list(sentences_list)
        print("BEFORE prediction")
        if sentences:
            sentences_embeddings_ndarray = self.session.run(
                self.token_embeddings, feed_dict={self.text_input: sentences})
        else:
            sentences_embeddings_ndarray = []

        print("AFTER prediciton")

        # Average each sentence's array over its first axis.
        # NOTE(review): presumably that axis is the token axis, padded to the longest
        # sentence in the batch, so padded positions would enter the mean - confirm
        # against the ELMo module documentation.
        return [np.mean(elem, axis=0).tolist() for elem in sentences_embeddings_ndarray]


class GLOVE_ENCODER:
    """Sentence encoder that delegates to ``sif_embedding_wrapper`` with GloVe vectors."""

    def __init__(self):
        print("BEFORE __init__")
        embeddings_path = "glove.6B.300d.txt"
        aux_vocab_path = "SIF/auxiliary_data/enwiki_vocab_min200.txt"
        loaded = sif_embedding_wrapper.load_embeddings(embeddings_path, aux_vocab_path)
        # load_embeddings yields three values, kept for sentences2vecs below.
        self.words, self.embs, self.weight4ind = loaded
        print("AFTER __init__")

    def embed_sentences(self, sentences_list):
        """Return one vector per sentence, as the ``.tolist()`` form of sentences2vecs' result."""
        print("BEFORE prediction")
        sentence_vectors = sif_embedding_wrapper.sentences2vecs(
            sentences_list, self.embs, self.words, self.weight4ind)

        print("AFTER prediciton")
        # TODO (left unspecified by the original author)
        return sentence_vectors.tolist()

    
class W2V_ENCODER:
    """Sentence encoder that delegates to ``sif_embedding_wrapper`` with the wiki-news vectors."""

    def __init__(self):
        print("BEFORE __init__")
        embeddings_path = "wiki-news-300d-1M.vec"
        aux_vocab_path = "SIF/auxiliary_data/enwiki_vocab_min200.txt"
        # word2vec=True is the only difference from how GLOVE_ENCODER loads its vectors.
        loaded = sif_embedding_wrapper.load_embeddings(embeddings_path, aux_vocab_path,
                                                       word2vec=True)
        self.words, self.embs, self.weight4ind = loaded
        print("AFTER __init__")

    def embed_sentences(self, sentences_list):
        """Return one vector per sentence, as the ``.tolist()`` form of sentences2vecs' result."""
        print("BEFORE prediction")
        sentence_vectors = sif_embedding_wrapper.sentences2vecs(
            sentences_list, self.embs, self.words, self.weight4ind)

        print("AFTER prediciton")
        # TODO (left unspecified by the original author)
        return sentence_vectors.tolist()

### ⚙️choose the encoder

In [29]:
#@title choose the encoder
#@markdown defaults to paper-default GloVe averaged over sentence tokens

# NOTE(review): the form text above mentions GloVe, but the value currently selected below is W2V_ENCODER().
# The chosen expression is evaluated as-is, so this line constructs the encoder object.
encoder = W2V_ENCODER() #@param ["UNIVERSAL_ENCODER()", "BERT_ENCODER()","ELMO_ENCODER()","GLOVE_ENCODER()","W2V_ENCODER()"]{type:"raw"}


BEFORE __init__


  word_embedding_df = pd.read_table(wordfile, delim_whitespace=True, index_col=0, header=None, quoting=csv.QUOTE_NONE, skiprows=1)


AFTER __init__


### initialize variables with our data

In [0]:
# Read the uploaded tab-separated dataset; its first row holds the column names.
df = pd.read_csv(DATASET_FN, sep='\t', header=0)
# Give every row a string id "0", "1", ... following the row order.
df["doc_id"] = [str(row_number) for row_number in range(len(df))]

# Two narrow views of the data: id -> text, and id -> ground-truth label (renamed to "gt").
text_df = df[["doc_id", "text"]].copy()
truth_df = df[["doc_id", "label"]].rename(columns={"label": "gt"})
truth_dict = {str(rec["doc_id"]): rec["gt"] for rec in truth_df.to_dict(orient="records")}

# Rows flagged as gold serve as the per-label representatives: label -> list of doc_ids.
gold_df = df.loc[df["is_gold"] == True]
gold_dict = gold_df.groupby('label')['doc_id'].agg(list).to_dict()


# Plain list of all texts, in row order, for the encoder.
sentences = list(df["text"])

### ⚙️calculate embeddings

In [30]:
# RERUN THIS EVERY TIME YOU CHANGE THE ENCODER

# Embed all texts with the currently selected encoder.
sentences_embedded = encoder.embed_sentences(sentences)

# The encoder must return exactly one embedding per input sentence.
assert len(sentences) == len(sentences_embedded)

# Store the embeddings in df's "vector" column (created, or overwritten on a rerun).
# NOTE(review): the new Series gets a default 0..n-1 index and is aligned with df's index
# on assignment, so this presumably relies on df still having its default index - confirm.
df["vector"] = pd.Series(list(sentences_embedded)) 

BEFORE prediction
AFTER prediciton


### initialize variables with paper-reproduction-needed values
(uncomment everything in the cell below to use it)

In [0]:
# PAPER DATA USED TO REPRODUCE
# df, truth_dict, categories = create_dataset_for_newsgroup_pair(["rec.autos","rec.sport.baseball"]) # TODO
# sentences = df["text"]
# sentences_embedded = encoder.embed_sentences(sentences)

# assert len(sentences) == len(sentences_embedded)
# gold_dict =  {"autos": ["351"], "baseball": ["171"]}
# df["vector"] = pd.Series(list(sentences_embedded))  # this is simply a list of per-sentence embedding lists

### classification & evaluation logic
classifier and accuracy functions defined

(TODO: this should be rewritten — too much code for how little it does)

In [0]:
def auto_classify(docs, category_representators, min_text_length=80):
    """Assign every eligible document to the category with the most similar centroid.

    A category's centroid is the mean of the vectors of its representative
    documents; similarity is cosine similarity between a document's vector
    and each centroid.  Ties go to the category listed first.

    Parameters
    ----------
    docs : pandas.DataFrame
        Must provide the columns "doc_id", "text" and "vector"
        (one embedding per row).
    category_representators : dict
        Maps a category name to the doc_ids of its manually labelled documents.
    min_text_length : int, optional
        Documents whose text is shorter than this many characters are skipped.

    Returns
    -------
    dict
        doc_id -> predicted category, for every document that is neither
        too short nor one of the representatives.

    Raises
    ------
    ValueError
        If there are documents to classify but no categories, or a category
        has no representative rows in ``docs``.
    """
    # Exclude docs deemed too short to classify.  The filter is computed on the
    # `docs` argument itself, not on a notebook-level `df`.
    too_short = docs["text"].map(len) < min_text_length
    skip_ids = set(docs.loc[too_short, "doc_id"])

    # No need to predict manually labeled docs.
    categories = list(category_representators)
    for category in categories:
        skip_ids.update(category_representators[category])

    candidates = docs.loc[~docs["doc_id"].isin(skip_ids)]
    if candidates.empty:
        return {}
    if not categories:
        raise ValueError("auto_classify needs at least one category to choose from")

    centroids = []
    for category in categories:
        is_representative = docs["doc_id"].isin(category_representators[category])
        representative_vectors = list(docs.loc[is_representative, "vector"])
        if not representative_vectors:
            raise ValueError(
                "category %r has no representative documents in docs" % (category,))
        # category vector is the mean of the category representing vectors
        centroids.append(np.mean(np.asarray(representative_vectors), axis=0))

    # One pairwise call gives a (documents x categories) similarity matrix.
    similarities = cosine_similarity(np.asarray(list(candidates["vector"])),
                                     np.asarray(centroids))
    # argmax returns the first maximum, i.e. the earliest category on ties.
    winners = similarities.argmax(axis=1)

    return {doc_id: categories[winner]
            for doc_id, winner in zip(candidates["doc_id"].tolist(), winners)}

def get_accuracy_score(predictions, truth_dict):
    """Return the fraction of predictions that equal their ground-truth label.

    ``predictions`` maps doc_id to a predicted label and ``truth_dict`` maps
    doc_id to the true label.  With no predictions the score is 0.0.  A
    predicted doc_id missing from ``truth_dict`` raises KeyError.
    """
    total = len(predictions)
    if total == 0:
        return 0.0
    hits = sum(
        1 for doc_id, predicted in predictions.items()
        if predicted == truth_dict[doc_id]
    )
    return hits / float(total)



### ⚙️classify then evaluate

In [31]:
# Predict a label for every document that has at least 10 characters of text
# and is not one of the gold representatives listed in gold_dict.
preds = auto_classify(df,gold_dict, min_text_length=10)
# Share of those predictions that agree with the labels in truth_dict.
accuracy = get_accuracy_score(preds, truth_dict)
print(f"accuracy: {accuracy}")

accuracy: 0.8777777777777778
