In [None]:
!pip install pytorch-pretrained-bert

In [1]:
import logging
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

In [2]:
import torch
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM

import matplotlib.pyplot as plt
%matplotlib inline

BERT_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

11/07/2019 12:11:51 - INFO - pytorch_pretrained_bert.modeling -   Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


vocab_file:  https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt


11/07/2019 12:11:52 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /home/sam/.pytorch_pretrained_bert/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084


In [3]:
import json
import pandas as pd
from os.path import join, exists
from collections import OrderedDict

datafile = "~/Downloads/smerp_labeled_validation.csv"

def load_json(filename: str,filepath: str = '',date_time_tag: str = '',ext: str = ".json",
              show_path: bool = False) -> OrderedDict:
    file_loc = join(filepath,date_time_tag + filename + ext)
    if show_path:
        print("Reading JSON file: [{}]".format(file_loc))
    if exists(join(filepath,date_time_tag + filename + ext)):
        try:
            with open(file_loc, encoding="utf-8") as file:
                json_dict = json.load(file)
                json_dict = OrderedDict(json_dict)
                # json_dict = OrderedDict(json.load(file))
            file.close()
            return json_dict
        except Exception as e:
            print("Could not open file as JSON: [{}]. \n Reason:[{}]".format(file_loc,e))
            with open(file_loc, encoding="utf-8") as file:
                json_dict = str(file)
                json_dict = json.loads(json_dict)
                # json_dict = OrderedDict(json_dict)
            return json_dict
    else:
        print("File does not exist at: [{}]".format(file_loc))
        return False

def json2df(filename = "smerp_labeled_validation",dataset_dir=""):
    catid2cattxt_map = load_json(filename=filename,filepath=dataset_dir)
    idxs, tweets, labels = [], [], []
    for idx in catid2cattxt_map.keys():
        idxs.append(idx)
        tweets.append(catid2cattxt_map[idx]["parsed_tweet"])
        labels.append(catid2cattxt_map[idx]["classes"])

    df = pd.DataFrame.from_dict({"idx"    :idxs,
                                    "tweets"   :tweets,
                                    "labels":labels})
    df = df[~df['tweets'].isna()]
    df.to_csv(path_or_buf=join(dataset_dir,filename + "_df.csv"))
    print("Data shape = {} ".format(df.shape))
    return df

df = json2df()
df.head()

Data shape = (566, 3) 


Unnamed: 0,idx,tweets,labels
0,768744566203768832,earthquake italy 267 dead hundreds injured,[2]
1,768822900812046340,dlusvideonews montepulciano damages earthquake...,[2]
2,768848577225428992,italy quake young girl found alive rubble leas...,"[0, 3]"
3,769138472636469248,new post national disaster italy quake death t...,[2]
4,769113679073800193,number dead italy quake climbs first funerals ...,[2]


In [None]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
spacy_en = spacy.load("en_core_web_sm")
# Isnt required here as already cleaned dataset

In [None]:
## Function to tokenize input text for cleaning
def tokenizer_spacy(input_text: str,remove_stopwords=False):
    input_text = spacy_en(input_text)
    tokens = []
    for token in input_text:
        if remove_stopwords and token.text in STOP_WORDS:
            continue
        tokens.append(token.text)
    return tokens

In [None]:
## Cleaning function
import re
# all_stops = set(STOP_WORDS) | set(string.punctuation)
def clean_txt(txt):
    txt = re.sub('[^A-Za-z0-9 ]+', '', txt)
    txt = [token for token in tokenizer_spacy(txt) if token.lower() not in STOP_WORDS]
    txt = " ".join(txt)
    return txt

In [None]:
## Apply cleaning function to input data
df["tweets"] = df["tweets"].apply(clean_txt)
logger.debug(type(df["tweets"].iloc[0]))
logger.debug(df["tweets"].iloc[0])
logger.debug(df.head())
df["tweets"].str.replace('\s{2,}', ' ')  ## Replaces multiple spaces to single
df.to_csv(datafile,index=None,header=True)
df.head()

In [6]:
## Divide data to train, val and test
train_size, val_size = 0.7, 0.1
train_df = df[0:int(df.shape[0] * train_size)]
print(train_df.shape)
val_df = df[int(df.shape[0] * train_size):int(df.shape[0] * (train_size + val_size))]
print(val_df.shape)
test_df = df[int(df.shape[0] * (train_size + val_size)):int(df.shape[0])]
print(test_df.shape)

train_df.to_csv('data/Tweet_GCN_BERT_train.csv', sep=',', index=False, header=True)
val_df.to_csv('data/Tweet_GCN_BERT_val.csv', sep=',', index=False, header=True)
test_df.to_csv('data/Tweet_GCN_BERT_test.csv', sep=',', index=False, header=True)

(396, 3)
(56, 3)
(114, 3)


In [7]:
def add_spl_tokens(tweets):
    return "[CLS] " + tweets + " [SEP]"

In [8]:
## Prepare input text for BERT and truncate long text larger than 512 tokens
max_seq_length = 32
txts_tokens = []
txts_segment_ids = []
indexed_tokens = []
i=0
max_len = 0
max_tokens = []
for txt in train_df["tweets"]:
    txt_cleaned = add_spl_tokens(txt)
    txt_tokens = BERT_tokenizer.tokenize(add_spl_tokens(txt))
    if len(txt_tokens) > max_seq_length:
        i+=1
        if len(txt_tokens) > max_len: 
            max_len = len(txt_tokens)
            max_tokens = txt_tokens
        txt_tokens = txt_tokens[:max_seq_length]
    txts_tokens.append(txt_tokens)
    txt_segment_ids = [1] * len(txt_tokens)
    txts_segment_ids.append(txt_segment_ids)
    txt_tokens_ids = BERT_tokenizer.convert_tokens_to_ids(txt_tokens)
    indexed_tokens.append(txt_tokens_ids)
    
    padding = [0] * (max_seq_length - len(txt_tokens_ids))
    txt_tokens_ids += padding
#     input_mask += padding
    txt_segment_ids += padding

    assert len(txt_tokens_ids) == max_seq_length
#     assert len(input_mask) == max_seq_length
    assert len(txt_segment_ids) == max_seq_length

In [9]:
print("Documnets with more than {} tokens:[{}]".format(max_seq_length,i))
print("max_len: [{}]".format(max_len))
print("max_tokens: [{}]".format(max_tokens))

Documnets with more than 32 tokens:[2]
max_len: [35]
max_tokens: [['[CLS]', 'earth', '##sc', '##ip', '##ly', '##mun', '##i', '23', '20', 'local', 'time', 'rain', '##ew', '##s', 'reports', '112', 'dead', 'ama', '##tric', '##e', 'alone', '37', 'dead', 'pe', '##sca', '##ra', 'del', 'tr', '##ont', '##o', 'm', '##6', 'italy', 'quake', '[SEP]']]


In [None]:
for tup in zip(txts_tokens[0], indexed_tokens[0]):
    print(tup)

In [None]:
tokens_tensor = torch.tensor([indexed_tokens[0]])
segments_tensors = torch.tensor([txts_segment_ids[0]])
print(tokens_tensor)
print(segments_tensors)

In [10]:
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()

11/07/2019 12:12:31 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /home/sam/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba
11/07/2019 12:12:31 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /home/sam/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /tmp/tmpvcq4nnh4
11/07/2019 12:12:35 - INFO - pytorch_pretrained_bert.modeling -   Model config {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "type_vocab_size": 2,
  "vo

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): BertLayerNorm()
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): BertLayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
        (intermediate): BertIntermediate(
          (dense): Lin

In [11]:
all_encoded_layers = []
for txt_tokens_ids,txt_segment_ids in zip(indexed_tokens,txts_segment_ids):
    tokens_tensor = torch.tensor([txt_tokens_ids])
    segments_tensors = torch.tensor([txt_segment_ids])
    assert tokens_tensor.shape == segments_tensors.shape, "shape does not match"
    with torch.no_grad():
        encoded_layers, _ = model(tokens_tensor, segments_tensors)
        all_encoded_layers.append(encoded_layers)
print("Number of documents: [{}]".format(len(all_encoded_layers)))
layer_i = 0

print("Number of layers: [{}]".format(len(all_encoded_layers[layer_i])))
batch_i = 0

print("Number of tokens: [{}]".format(len(all_encoded_layers[layer_i][batch_i])))
token_i = 0

print("Number of tokens: [{}]".format(len(all_encoded_layers[layer_i][batch_i][token_i]),"\n"))

Number of documents: [396]
Number of layers: [12]
Number of tokens: [1]
Number of tokens: [32]


In [None]:
list(tokenizer.vocab.keys())[5000:5020]

In [None]:
token_i = 6
batch_i = 0
layer_i = 6

print(encoded_layers[layer_i][0].shape)

vec = encoded_layers[layer_i][batch_i][token_i]
print(vec)

plt.figure(figsize=(10,10))
plt.hist(vec, bins=200)
plt.show()

In [None]:
# print(len(txts_tokens[1]))
# print(txts_tokens[1])
# print(len(txts_tokens))
# all_embeddings = []
# for batch_i in range(0, len(txts_tokens)):
#     for txt_tokens in txts_tokens:
#         token_embeddings = []

#     #     print(len(txt_tokens))
#         for token_i in range(len(txt_tokens)):
#             hidden_layers = [] 
#             for layer_i in range(len(encoded_layers)):
#     #             print(layer_i,batch_i,token_i)
#     #             print(encoded_layers[layer_i][batch_i].shape)
#                 vec = encoded_layers[layer_i][batch_i][token_i]    
#                 hidden_layers.append(vec)
#             token_embeddings.append(hidden_layers)
#         all_embeddings.append(token_embeddings)

# # Sanity check the dimensions:
# print ("Number of tokens in sequence:", len(token_embeddings))
# print ("Number of layers per token:", len(token_embeddings[0]))
# print ("Number of total embeddings:", len(all_embeddings))

In [None]:
# token_vecs_cat = []
# token_vecs_sum = []

# for token in token_embeddings:
#     cat_vec = torch.cat((token[-1], token[-2], token[-3], token[-4]), 0)
#     token_vecs_cat.append(cat_vec)
#     sum_vec = torch.sum(torch.stack(token)[-4:], 0)
#     token_vecs_sum.append(sum_vec)

# print('Concatenated vector shape is: %d x %d' % (len(token_vecs_cat), len(token_vecs_cat[0])))
# print('Sum vector shape is: %d x %d' % (len(token_vecs_sum), len(token_vecs_sum[0])))

In [12]:
## Generate sentence lavel vector by averaging the second to last hidden layer of each token producing a single 768 length vector.
all_sentences_embedding = []

for batch_i in range(len(txts_tokens)):
    all_sentences_embedding.append(torch.mean(all_encoded_layers[batch_i][11], 1))

print(len(all_sentences_embedding))

print ("Count of sentence embeddings found:",len(all_sentences_embedding))
print ("Single sentence embedding vector of shape:",all_sentences_embedding[0].shape)

396
Count of sentence embeddings found: 396
Single sentence embedding vector of shape: torch.Size([1, 768])


In [None]:
# def knn_lbl_majority(test_features,train_features,train_labels,k=5):

#     from sklearn.neighbors import KNeighborsClassifier

#     NN = KNeighborsClassifier(n_neighbors=k)
#     NN.fit(train_features,train_labels)
#     test_labels = NN.predict(test_features)
#     ## To get probabilities: test_labels_probas = NN.predict_proba(test_features)
#     return test_labels

In [None]:
def neighrest_neighbors(test_features,train_features,n,k=5):
    from sklearn.neighbors import NearestNeighbors

    NN = NearestNeighbors(n_neighbors=k)
    NN.fit(train_features)
    closest_neighbors_from_train = NN.kneighbors(test_features,
    return_distance=False)
    return closest_neighbors_from_train[0:n]

In [None]:
def knn_majority_label(test_features,train_features,train_labels,k=5):
    import numpy as np
    n=2
    n_idxs = neighrest_neighbors(test_features, train_features, n)
    count = np.zeros(4)
    #n is for precision at k
    print(test_features)
    print(train_features)
    print(train_labels)
    for ii in n_idxs:
        count = np.zeros(4)
        for l in range(0,n):
        for i in n_idxs[l]:
            for j in train_labels[i]:
                count[j] = count[j] + 1

    
    print(np.argsort(count)[4-n:])
        
    


# print(test_labels[0:2])

In [None]:
knn_majority_label()