# Import packages and initialize the DeBERTa, RoBERTa, and BERT models (GloVe and Word2Vec vectors are loaded later, inside word_embedding)

In [None]:
from BertEmbeddings import BertEmbeddings
import pandas as pd
import numpy as np
from gensim.models import KeyedVectors
from gensim.test.utils import datapath
from transformers import RobertaTokenizer, TFRobertaModel
from transformers import DebertaV2Tokenizer, TFDebertaV2Model
import tensorflow as tf
import nltk

In [None]:
# Pre-trained checkpoints backing the contextual encoders that the
# single_post_*_embedding functions read as module-level globals.
BERT_CHECKPOINT = 'bert-large-uncased-whole-word-masking'
ROBERTA_CHECKPOINT = 'roberta-large'
DEBERTA_CHECKPOINT = 'kamalkraj/deberta-v2-xlarge'

# BERT goes through the BertEmbeddings wrapper, which is called directly on a
# list of posts.
bert_embeddings = BertEmbeddings(model_name=BERT_CHECKPOINT)

# RoBERTa and DeBERTa each use a tokenizer/model pair from transformers.
roberta_tokenizer = RobertaTokenizer.from_pretrained(ROBERTA_CHECKPOINT)
roberta_model = TFRobertaModel.from_pretrained(ROBERTA_CHECKPOINT)

deberta_tokenizer = DebertaV2Tokenizer.from_pretrained(DEBERTA_CHECKPOINT)
deberta_model = TFDebertaV2Model.from_pretrained(DEBERTA_CHECKPOINT)

In [None]:
def single_post_Bert_embedding(post):
    """Embed a single post with the module-level ``bert_embeddings`` wrapper.

    Args:
        post: Text of one post.

    Returns:
        A ``(word, embedding, summed_embedding)`` tuple:

        * ``word`` -- the ``'tokens'`` entry of the wrapper's first result.
        * ``embedding`` -- one ``np.ndarray`` per entry of that result's
          ``'embeddings_map'``.
        * ``summed_embedding`` -- the element-wise sum of ``embedding`` as a
          list.

        If any step fails, three empty lists are returned.  The triple is
        always either complete or completely empty, so a caller that checks
        for "all three empty" reliably detects unprocessable posts.
    """
    try:
        result = bert_embeddings([post])[0]
        word = result['tokens']
        embeddings_map = result['embeddings_map']
        embedding = [np.array(embeddings_map[key]) for key in embeddings_map.keys()]
        # With no embeddings np.sum returns a scalar and list() raises, which
        # routes the post to the empty-triple path below.
        summed_embedding = list(np.sum(embedding, axis=0))
    except Exception:
        # Deliberate best effort: a post that cannot be embedded is reported
        # as an empty triple instead of aborting the whole run.  Catching
        # Exception (not a bare except) keeps Ctrl-C / SystemExit working.
        return [], [], []
    return word, embedding, summed_embedding
    
def single_post_DeBerta_embedding(post):
    """Embed a single post with the module-level DeBERTa tokenizer and model.

    Args:
        post: Text of one post.

    Returns:
        A ``(word, embedding, summed_embedding)`` tuple:

        * ``word`` -- ``nltk.word_tokenize(post)``.  NOTE(review): these are
          NLTK tokens, not the tokenizer's own pieces, so they are not
          guaranteed to line up one-to-one with ``embedding``.
        * ``embedding`` -- one ``np.ndarray`` per position of the last hidden
          state, excluding the first and last positions.
        * ``summed_embedding`` -- the element-wise sum of those positions as
          a list.

        If any step fails, three empty lists are returned.  The triple is
        always either complete or completely empty, so a caller that checks
        for "all three empty" reliably detects unprocessable posts.
    """
    try:
        inputs = deberta_tokenizer(post, return_tensors="tf")
        outputs = deberta_model(inputs)
        token_states = outputs.last_hidden_state[0]
        word = nltk.word_tokenize(post)
        result = np.zeros(token_states.shape[1])
        embedding = []
        # [1:-1] skips the first and last positions -- presumably the special
        # start/end tokens added by the tokenizer (TODO confirm).
        for token_state in token_states[1:-1]:
            embedding.append(np.array(token_state))
            result = result + token_state
        summed_embedding = list(np.array(result))
    except Exception:
        # Deliberate best effort: a post that cannot be embedded is reported
        # as an empty triple instead of aborting the whole run.  Catching
        # Exception (not a bare except) keeps Ctrl-C / SystemExit working.
        return [], [], []
    return word, embedding, summed_embedding
    
def single_post_GloVe_embedding(post, embeddings_index, embedding_dim=300):
    """Embed a single post by looking each word up in a GloVe dictionary.

    Args:
        post: Text of one post; it is split on single spaces.
        embeddings_index: Mapping from word to its vector (anything with a
            dict-style ``get``).
        embedding_dim: Length of the zero vector substituted for words that
            are missing from ``embeddings_index``.  Must match the length of
            the stored vectors.

    Returns:
        A ``(word, embedding, summed_embedding)`` tuple: the split words, one
        vector per word (zeros for unknown words), and the element-wise sum of
        those vectors as a list.

        If any step fails (for example ``post`` is not a string, or the
        vectors do not share one length), three empty lists are returned.
        The triple is always either complete or completely empty, so a caller
        that checks for "all three empty" reliably detects unprocessable
        posts.
    """
    try:
        word = post.split(" ")
        embedding = []
        for token in word:
            embedding_vector = embeddings_index.get(token)
            if embedding_vector is None:
                # Out-of-vocabulary words contribute nothing to the sum.
                embedding_vector = np.zeros(embedding_dim)
            embedding.append(embedding_vector)
        summed_embedding = list(np.sum(embedding, axis=0))
    except Exception:
        # Deliberate best effort: a post that cannot be embedded is reported
        # as an empty triple instead of aborting the whole run.  Catching
        # Exception (not a bare except) keeps Ctrl-C / SystemExit working.
        return [], [], []
    return word, embedding, summed_embedding
    
def single_post_RoBerta_embedding(post):
    """Embed a single post with the module-level RoBERTa tokenizer and model.

    Args:
        post: Text of one post.

    Returns:
        A ``(word, embedding, summed_embedding)`` tuple:

        * ``word`` -- ``nltk.word_tokenize(post)``.  NOTE(review): these are
          NLTK tokens, not the tokenizer's own pieces, so they are not
          guaranteed to line up one-to-one with ``embedding``.
        * ``embedding`` -- one ``np.ndarray`` per position of the last hidden
          state, excluding the first and last positions.
        * ``summed_embedding`` -- the element-wise sum of those positions as
          a list.

        If any step fails, three empty lists are returned.  The triple is
        always either complete or completely empty, so a caller that checks
        for "all three empty" reliably detects unprocessable posts.
    """
    try:
        inputs = roberta_tokenizer(post, return_tensors="tf")
        outputs = roberta_model(inputs)
        token_states = outputs.last_hidden_state[0]
        word = nltk.word_tokenize(post)
        result = np.zeros(token_states.shape[1])
        embedding = []
        # [1:-1] skips the first and last positions -- presumably the special
        # start/end tokens added by the tokenizer (TODO confirm).
        for token_state in token_states[1:-1]:
            embedding.append(np.array(token_state))
            result = result + token_state
        summed_embedding = list(np.array(result))
    except Exception:
        # Deliberate best effort: a post that cannot be embedded is reported
        # as an empty triple instead of aborting the whole run.  Catching
        # Exception (not a bare except) keeps Ctrl-C / SystemExit working.
        return [], [], []
    return word, embedding, summed_embedding
    
def single_post_Word2Vec_embedding(post, embeddings_index, embedding_dim=300):
    """Embed a single post by looking each word up in Word2Vec vectors.

    Args:
        post: Text of one post; it is split on single spaces.
        embeddings_index: Object exposing ``get_vector(word)`` that raises
            ``KeyError`` for unknown words (e.g. gensim ``KeyedVectors``).
        embedding_dim: Length of the zero vector substituted for unknown
            words.  Must match the length of the stored vectors.

    Returns:
        A ``(word, embedding, summed_embedding)`` tuple: the split words, one
        vector per word (zeros for unknown words), and the element-wise sum of
        those vectors as a list.

        If ``post`` is not a string (pandas yields a float NaN for an empty
        CSV cell), three empty lists are returned, matching the "problem
        post" convention of the other ``single_post_*_embedding`` functions
        instead of raising ``AttributeError``.
    """
    try:
        word = post.split(" ")
    except AttributeError:
        return [], [], []

    embedding = []
    for token in word:
        try:
            embedding.append(embeddings_index.get_vector(token))
        except KeyError:
            # Out-of-vocabulary words contribute nothing to the sum.
            embedding.append(np.zeros(embedding_dim))
    summed_embedding = list(np.sum(embedding, axis=0))
    return word, embedding, summed_embedding

In [None]:
# Default locations of the pre-trained static embedding files.
_GLOVE_FILE = '/Users/gubow/COMP 691H/Find Duplicates Project/glove.840B.300d.txt'
_WORD2VEC_FILE = '/Users/gubow/COMP 691H/Find Duplicates Project/GoogleNews-vectors-negative300.bin'

# Number of coefficients at the end of every line of the GloVe text file.
_GLOVE_DIM = 300

# (output column, source column) pairs copied in front of the embedding
# columns, listed in their final left-to-right order.
_METADATA_COLUMNS = (
    ("id", "Unnamed: 0"),
    ("type", "type"),
    ("title", "title"),
    ("cleanedContent", "cleanedContent"),
    ("tags", "tags"),
    ("timeCreated", "timeCreated"),
    ("totalFollowUpPosts", "totalFollowUpPosts"),
    ("category", "category"),
    ("New Category", "New category (Active, Constructive, Logistical, Content-Clarification, Note, Poll)"),
)


def _load_glove_index(glove_file, embedding_dim=_GLOVE_DIM):
    """Parse a GloVe text file into a ``{word: float32 vector}`` dictionary."""
    embeddings_index = {}
    # The context manager closes the file even if a line fails to parse.
    with open(glove_file, encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if len(values) <= embedding_dim:
                # Blank or truncated line: no word plus full vector to store.
                continue
            # Everything before the trailing coefficients is the token; a
            # token that itself contained whitespace is re-joined without it.
            word = ''.join(values[:-embedding_dim])
            embeddings_index[word] = np.asarray(values[-embedding_dim:], dtype='float32')
    return embeddings_index


def _embed_csv_file(input_cvs_file_name, data_clean_type, embedding_method, embed_post):
    """Embed every post of one cleaned CSV and write the result next to it."""
    base_name = input_cvs_file_name.split(".")[0] + data_clean_type
    print(embedding_method + " embedding " + base_name + ".csv" + "...")
    data = pd.read_csv(base_name + ".csv")

    # Only the summed vector of each post is kept; holding every per-token
    # vector of every post as well would cost memory and is never read.
    summed_embeddings = []
    problem_index = []
    for i, post in enumerate(data['cleanedContent'].tolist()):
        if i % 10 == 0:
            print(str(i) + " posts' processing has finished")
        word, embedding, summed_embedding = embed_post(post)
        if len(word) == 0 and len(embedding) == 0 and len(summed_embedding) == 0:
            # An all-empty result marks a post that could not be embedded.
            problem_index.append(i)
        summed_embeddings.append(summed_embedding)

    embedded_data = pd.DataFrame(summed_embeddings)
    for position, (output_column, source_column) in enumerate(_METADATA_COLUMNS):
        embedded_data.insert(position, output_column, data[source_column].tolist())
    # Rows are dropped only after the metadata columns are attached, so the
    # positional indices in problem_index still address the right rows.
    embedded_data = embedded_data.drop(embedded_data.index[problem_index])

    output_cvs_file_name = base_name + "(" + embedding_method + " Embedded)"
    embedded_data.to_csv(output_cvs_file_name + '.csv', index=False)


def word_embedding(input_cvs_file_name_array, data_clean_type, embedding_method,
                   glove_file=_GLOVE_FILE, word2vec_file=_WORD2VEC_FILE):
    """Embed the posts of several cleaned CSV files and save one CSV per input.

    For every name in ``input_cvs_file_name_array`` the file
    ``<text before the first '.'><data_clean_type>.csv`` is read, each value
    of its ``cleanedContent`` column is embedded, and the summed embedding of
    each post is written together with the post's metadata columns to
    ``<text before the first '.'><data_clean_type>(<embedding_method> Embedded).csv``.
    Posts whose embedding comes back completely empty are left out of the
    output.

    Args:
        input_cvs_file_name_array: Iterable of original CSV file names.
        data_clean_type: Suffix inserted between the file stem and ``.csv``.
        embedding_method: One of ``"Bert"``, ``"DeBerta"``, ``"GloVe"``,
            ``"RoBerta"`` or ``"Word2Vec"``.  Any other value prints
            "Undefined embedding method" and returns without doing any work.
        glove_file: Path of the GloVe text file (read only for ``"GloVe"``).
        word2vec_file: Path of the binary Word2Vec file (read only for
            ``"Word2Vec"``).

    Returns:
        None.  The results are written to disk.
    """
    if embedding_method == "Bert":
        embed_post = single_post_Bert_embedding
    elif embedding_method == "DeBerta":
        embed_post = single_post_DeBerta_embedding
    elif embedding_method == "RoBerta":
        embed_post = single_post_RoBerta_embedding
    elif embedding_method == "GloVe":
        # Create word embedding dictionary from the GloVe text file.
        print('Creating GloVe word embedding dictionary...')
        glove_index = _load_glove_index(glove_file)
        print('Found {} word vectors of glove.'.format(len(glove_index)))

        def embed_post(post):
            return single_post_GloVe_embedding(post, glove_index)
    elif embedding_method == "Word2Vec":
        print('Creating Word2Vec word embedding dictionary...')
        # The path is passed directly: gensim's datapath() helper resolves
        # names against gensim's bundled test data and only returned this
        # path unchanged because it is absolute.
        word2vec = KeyedVectors.load_word2vec_format(word2vec_file, binary=True)
        print('Found %s word vectors of word2vec' % len(word2vec))

        def embed_post(post):
            return single_post_Word2Vec_embedding(post, word2vec)
    else:
        print("Undefined embedding method")
        return

    for input_cvs_file_name in input_cvs_file_name_array:
        _embed_csv_file(input_cvs_file_name, data_clean_type, embedding_method, embed_post)

In [None]:
# Run configuration for this notebook.
#
# data_clean_type is the suffix that word_embedding appends to each input
# file's stem to find the cleaned CSV.  Known suffixes:
#   1. (cleaned)
#   2. (hyper_cleaned)
#   3. (punctuation_removed)
#   4. (stopwords_removed)
#   5. (alternative_stopwords_used)
#   6. (words_shortened)
#
# embedding_method selects the encoder:
#   1. Bert     (will use Bert large)
#   2. DeBerta  (will use DeBerta-v2-xlarge)
#   3. GloVe
#   4. RoBerta  (will use RoBerta-large)
#   5. Word2Vec

input_cvs_file_name_array = [
    'comp401-Fall-2012_labeled (new).csv',
    'comp401-Fall-2013_labeled (new).csv',
    'comp401-Fall-2015_labeled (new).csv',
    'comp401-Fall-2016_labeled (new).csv',
    'comp401-Fall-2017_labeled (new).csv',
    'comp401-Fall-2018_labeled (new).csv',
    'comp411-Fall-2019_labeled (new).csv',
    'comp411-Fall-2020_labeled (new).csv',
    'comp411-Spring-2019_labeled (new).csv',
    'comp411-Spring-2020A_labeled (new).csv',
    'comp411-Spring-2020B_labeled (new).csv',
    'comp426-Fall-2019_labeled (new).csv',
    'comp426-Fall-2020_labeled (new).csv',
]
data_clean_type = "(cleaned)(punctuation_removed)(stopwords_removed)(alternative_stopwords_used)"
embedding_method = "Word2Vec"

word_embedding(
    input_cvs_file_name_array=input_cvs_file_name_array,
    data_clean_type=data_clean_type,
    embedding_method=embedding_method,
)