In [1]:
import os
import re
from multiprocessing import Pool

import numpy as np
from gensim.models import Word2Vec
from nltk.corpus import gutenberg
from scipy import spatial

In [2]:
# Read the corpus into a list with one entry per line of the file
# (readlines() keeps each line's trailing newline).
# The with-block closes the handle as soon as the lines are read; the
# original left it open for the lifetime of the kernel. `file1` stays bound
# (to the now-closed file object) in case a later cell refers to it.
with open('data/arxiv_3.txt', 'r') as file1:
    dataset = file1.readlines()

In [3]:
# Sanity check on the loaded corpus: its container type and how many lines were read.
print('Type of corpus: ', type(dataset))
print('Length of corpus: ', len(dataset))

Type of corpus:  <class 'list'>
Length of corpus:  216


In [5]:
def text_preprocessing(
    text: str,
    punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_“~+''',
    stop_words=['and', 'a', 'is', 'the', 'in', 'be', 'will']
    )->list:
    """
    Turn one raw line of text into a list of cleaned, lowercase tokens.

    Steps, in order:
      1. delete every character listed in ``punctuations`` (nothing is
         inserted in its place, so "pre-training" becomes "pretraining");
      2. delete every token that contains a digit;
      3. lowercase the text and split it on whitespace;
      4. drop every token that appears in ``stop_words``.

    Parameters
    ----------
    text : str
        The raw text to clean.
    punctuations : str
        The characters to strip from ``text``.
    stop_words : list
        Lowercase words to drop from the result. The list is only read,
        never mutated, so sharing the default between calls is safe.

    Returns
    -------
    list
        The surviving tokens in their original order; an empty list when
        nothing survives (e.g. empty or whitespace-only input).
    """
    # Strip all punctuation in a single pass rather than running one
    # full-string replace() per character of the input.
    text = text.translate(str.maketrans('', '', punctuations))

    # Removing words that have numbers in them. Bare numbers match this
    # pattern too, so no separate digit-removal pass is needed afterwards.
    text = re.sub(r'\w*\d\w*', '', text)

    # Lowercase, then split on any run of whitespace; split() with no
    # argument never yields empty strings, so no extra filtering is needed.
    tokens = text.lower().split()

    # Dropping stop words
    return [token for token in tokens if token not in stop_words]

In [8]:
def preprocess(dataset: list)->list:
    """
    Run ``text_preprocessing`` over every entry of ``dataset``.

    Returns a new list holding one result per input entry, in the same
    order as ``dataset``.
    """
    return [text_preprocessing(raw_line) for raw_line in dataset]

In [9]:
# Clean every corpus line; preprocess() returns one entry per line of `dataset`.
processed_dataset = preprocess(dataset)

In [10]:
# Spot-check a few processed lines by eye.
print(processed_dataset[0])    # first corpus line (original note: title, author, and year — TODO confirm against the data file)
print(processed_dataset[1])
print(processed_dataset[10])

['xlnet', 'generalized', 'autoregressive', 'pretraining', 'language', 'understanding']
['modeling', 'bidirectional', 'contexts', 'denoising', 'autoencoding', 'pretraining', 'bert', 'autoregressive', 'language', 'modeling']
['natural', 'language', 'hierarchically', 'structured', 'long', 'short', 'term', 'memory', 'lstm', 'recurrent', 'architecture', 'rnn', 'lstm', 'onlstm', 'language', 'modeling', 'unsupervised', 'parsing', 'targeted', 'syntactic', 'evaluation', 'logical', 'inference']


In [11]:
# Train Word2Vec on the tokenised corpus (sg = 1 selects skip-gram).
# The worker count comes straight from os.cpu_count(). The original created a
# multiprocessing.Pool only to read its private `_processes` attribute and
# never closed it, leaving idle worker processes alive in the kernel.
# `or 1` covers platforms where the CPU count cannot be determined.
model = Word2Vec(
    sentences = processed_dataset,
    size = 100,
    sg = 1,
    window = 3,
    min_count = 1,
    iter = 10,
    workers = os.cpu_count() or 1,
)

In [12]:
# Precompute normalised vectors for similarity queries; replace = True asks
# gensim to keep only the normalised copies.
# NOTE(review): this is the gensim 3.x API — confirm against the installed version.
model.init_sims(replace = True)

In [13]:
# Nearest neighbours of 'bert' in the embedding space. The query goes through
# the model's word vectors (`model.wv`) because calling most_similar on the
# model itself is deprecated and emits a warning.
model.wv.most_similar('bert')

  """Entry point for launching an IPython kernel.


[('transformers', 0.7637602686882019),
 ('detection', 0.7115647196769714),
 ('word', 0.7109436988830566),
 ('training', 0.7088503837585449),
 ('language', 0.7061129212379456),
 ('attention', 0.6916610598564148),
 ('segmentation', 0.6888375282287598),
 ('translation', 0.6876645088195801),
 ('semantic', 0.6861536502838135),
 ('loss', 0.6857756972312927)]

In [18]:
# Look up the embeddings through `model.wv`; indexing the model object
# directly is deprecated and emits a warning.
v1 = model.wv['bert']
v2 = model.wv['roberta']

  """Entry point for launching an IPython kernel.
  


In [19]:
# cosine similarity between two vectors (e.g. the embeddings of two words)
def cosine_similarity(v1, v2):
    """
    Return the cosine similarity of the vectors ``v1`` and ``v2``.

    The value is one minus the cosine distance reported by
    ``scipy.spatial.distance.cosine``.
    """
    cosine_distance = spatial.distance.cosine(v1, v2)
    return 1 - cosine_distance

In [20]:
# Cosine similarity between the 'bert' and 'roberta' embeddings held in v1 and v2.
cosine_similarity(v1, v2)

0.4118836522102356