In [None]:
#!pip install gensim


In [18]:

import os
from tqdm.autonotebook import tqdm

from transformers import BertTokenizer, BertModel, GPT2Tokenizer, GPT2Model
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import KeyedVectors
import numpy as np
import pandas as pd

import config

In [19]:
def get_data_cfg(targeted_data='google'):
    data_path, label_map = None, None
    if targeted_data == 'google':
        data_path = config.googel_news
        label_map = config.goole_label_map
    elif targeted_data == 'tweets':
        data_path = config.tweets
        label_map = config.tweet_label_map
    elif targeted_data == 'stackOverflow':
        data_path = config.stack_overflow
        label_map = config.stackOverflow_label_map
    else:
        raise ValueError(
            'targeted_data must be google ,stackOverflow or tweets')
    return data_path, label_map

In [20]:
# BERT
class BERT(object):
    def __init__(self, model_name="bert-base-uncased",
                 cache_dir="./assets/bert"):
        print(f"init BERT: cache_dir={cache_dir}, model_name={model_name}")
        self.tokenizer = BertTokenizer.from_pretrained(
            model_name, cache_dir=cache_dir)
        self.model = BertModel.from_pretrained(
            model_name, cache_dir=cache_dir)

    def embed(self, text):
        inputs = self.tokenizer(text, return_tensors='pt',
                                truncation=True, padding=True)
        outputs = self.model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()


bert = BERT(cache_dir=os.path.join(config.ROOT_DATA, "assets", "bert"))

init BERT: cache_dir=/Users/zhouyf/Documents/data/majid/drive/MyDrive/project2/data/assets/bert, model_name=bert-base-uncased


In [21]:
# GPT
class GPT(object):
    def __init__(self, model_name="gpt2",
                 cache_dir="./assets/gpt2",
                 special_tokens={'pad_token': '[PAD]'}):
        print(f"init GPT: cache_dir={cache_dir}, model_name={model_name}")
        self.tokenizer = GPT2Tokenizer.from_pretrained(
            model_name, cache_dir=cache_dir)
        self.tokenizer.add_special_tokens(special_tokens)
        self.model = GPT2Model.from_pretrained(
            model_name, cache_dir=cache_dir)

    def embed(self, text):
        inputs = self.tokenizer(text, return_tensors='pt',
                                truncation=True, padding=True)
        outputs = self.model(**inputs)
        return outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()


gpt = GPT(cache_dir=os.path.join(config.ROOT_DATA, "assets", "gpt2"))

init GPT: cache_dir=/Users/zhouyf/Documents/data/majid/drive/MyDrive/project2/data/assets/gpt2, model_name=gpt2


In [22]:
# FastText
class FastText(object):
    def __init__(self, model_name="crawl-300d-2M",  cache_dir="./assets/fast_text"):
        print(f"init FastText: cache_dir={cache_dir}, model_name={model_name}")
        self.model = KeyedVectors.load_word2vec_format(
            os.path.join(cache_dir, f"{model_name}.vec"), binary=False)

    def embed(self, text):
        vectors = [self.model[word]
                   for word in text.split() if word in self.model]
        if vectors:
            return np.mean(vectors, axis=0)
        else:
            return np.zeros(self.model.vector_size)


fast_text = FastText(cache_dir=os.path.join(
    config.ROOT_DATA, "assets", "fast_text"))

init FastText: cache_dir=/Users/zhouyf/Documents/data/majid/drive/MyDrive/project2/data/assets/fast_text, model_name=crawl-300d-2M


In [24]:
# GloVe
class GloVe(object):
    def __init__(self, model_name="glove.42B.300d",  cache_dir="./assets/glove"):
        print(f"init GloVe: cache_dir={cache_dir}, model_name={model_name}")
        self.model = KeyedVectors.load_word2vec_format(
            os.path.join(cache_dir, f"{model_name}.txt"), binary=False, no_header=True)

    def embed(self, text):
        vectors = [self.model[word]
                   for word in text.split() if word in self.model]
        if vectors:
            return np.mean(vectors, axis=0)
        else:
            return np.zeros(self.model.vector_size)


glove = GloVe(cache_dir=os.path.join(config.ROOT_DATA, "assets", "glove"))

init GloVe: cache_dir=/Users/zhouyf/Documents/data/majid/drive/MyDrive/project2/data/assets/glove, model_name=glove.42B.300d


In [25]:
# TF-IDF
class TFIDF(object):
    def __init__(self) -> None:
        print("init TFIDF")
        self.vectorizer = TfidfVectorizer()

    def fit(self, corpus: list):
        self.corpus = corpus
        self.tfidf_vectors = self.vectorizer.fit_transform(
            self.corpus).toarray()
        print(f"********TFIDF:\n\tcorpus length: {self.corpus.__len__()}"
              + f"\n\tvectors shape: {self.tfidf_vectors.shape}")

    def embed(self, text):
        return self.tfidf_vectors[self.corpus.index(text)]


tfidf = TFIDF()

init TFIDF


In [30]:
def concatenate_embeddings(text):  
    # return np.concatenate((
    #     bert.embed(text), gpt.embed(text), fast_text.embed(
    #         text), glove.embed(text), tfidf.embed(text)
    # ))
    bert_embed_data = bert.embed(text)
    gpt_embed_data = gpt.embed(text)
    fast_text_embed_data = fast_text.embed(text)
    glove_embed_data = glove.embed(text)
    tfidf_embed_data = tfidf.embed(text)
    print(f"\t\tbert embed data sample shape: {bert_embed_data.shape}")
    print(f"\t\tgpt embed data sample shape: {gpt_embed_data.shape}")
    print(f"\t\tfast_text embed data sample shape: {fast_text_embed_data.shape}")
    print(f"\t\tglove embed data sample shape: {glove_embed_data.shape}")
    print(f"\t\ttfidf embed data sample shape: {tfidf_embed_data.shape}")
    return np.concatenate((
        bert_embed_data, gpt_embed_data, fast_text_embed_data, glove_embed_data, tfidf_embed_data
    ))

In [31]:
tqdm.pandas()
embed_path = config.embed_path
if not os.path.exists(embed_path):
    os.makedirs(embed_path)
    
# for target in tqdm(["google", "stackOverflow", "tweets"]):
for target in tqdm(["tweets"]):
    print("-"*30 + target + "-"*80)
    df = pd.read_json(get_data_cfg(target)[0])[:-1]
    print(f"{df[:3]}\n\n{df[-3:]}")
    corpus = df["clean_text"].to_list()
    print(f"corpus({corpus.__len__()}):{corpus[:5]}")
    tfidf.fit(corpus)
    # Apply concatenated embeddings to each row in the DataFrame
    print("================================")
    df["concatenated_vector"] = df["clean_text"].progress_apply(lambda x: concatenate_embeddings(x))
    print(f"concatenated_vector sample shape: {df['concatenated_vector'][0].shape}")
    print("================================")
    
    out_file = os.path.join(embed_path, f"{target}_data_embedded.json")
    df.to_json(out_file, indent=2)

  0%|          | 0/1 [00:00<?, ?it/s]

------------------------------tweets--------------------------------------------------------------------------------
                                                text label  \
0           brain fluid buildup delay giffords rehab    37   
1  trailer talk week movie rite mechanic week opp...    14   
2  rnc appoints chairman tampa convention effort ...   100   

                                          clean_text  
0           brain fluid buildup delay giffords rehab  
1  trailer talk week movie rite mechanic week opp...  
2  rnc appoints chairman tampa convention effort ...  

                                                   text label  \
2469  yemeni protester urged president ali abdullah ...    79   
2470  indian navy coast guard rescue thai vessel pir...   107   
2471                  christie prof adept social medium    45   

                                             clean_text  
2469  yemeni protester urged president ali abdullah ...  
2470  indian navy coast guard rescue

  0%|          | 0/2472 [00:00<?, ?it/s]

		bert embed data sample shape: (768,)
		gpt embed data sample shape: (768,)
		fast_text embed data sample shape: (300,)
		glove embed data sample shape: (300,)
		tfidf embed data sample shape: (5058,)
		bert embed data sample shape: (768,)
		gpt embed data sample shape: (768,)
		fast_text embed data sample shape: (300,)
		glove embed data sample shape: (300,)
		tfidf embed data sample shape: (5058,)
		bert embed data sample shape: (768,)
		gpt embed data sample shape: (768,)
		fast_text embed data sample shape: (300,)
		glove embed data sample shape: (300,)
		tfidf embed data sample shape: (5058,)
		bert embed data sample shape: (768,)
		gpt embed data sample shape: (768,)
		fast_text embed data sample shape: (300,)
		glove embed data sample shape: (300,)
		tfidf embed data sample shape: (5058,)
		bert embed data sample shape: (768,)
		gpt embed data sample shape: (768,)
		fast_text embed data sample shape: (300,)
		glove embed data sample shape: (300,)
		tfidf embed data sample shape