In [34]:
import matplotlib.pyplot as plt
from tqdm import tqdm
from keras.models import Input, Model
from keras.layers import Dense
from scipy import sparse
import numpy as np 
from nltk.corpus import stopwords
import nltk
import pandas as pd
import re
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))



/kaggle/input/quantum-physics-articles-on-arxiv-1994-to-2009/ArXiv_old.csv
/kaggle/input/sample-word-embedding-data/sample.csv
/kaggle/input/titanates/samplefile.csv


In [35]:
from keras.callbacks import EarlyStopping
earlystop = EarlyStopping(monitor = 'val_loss',
                          min_delta = 0,
                          patience = 5,
                          verbose = 1,
                          restore_best_weights = True)

In [36]:
#creating a method that creates a dictionary where the keys are unique words and key values are indices
def create_unique_word_dict(text:list) -> dict:
    #obtaining the unique words from the text and sorting them alphabetically
    words = list(set(text))
    words.sort()
    #creating the dictionary for the unique words
    unique_word_dict = {}
    for i, word in enumerate(words):
        unique_word_dict.update({
            word:i
        })
    return unique_word_dict
def text_preprocessing(
text:list, 
punctuations = r'''!()-[]{};:'"\,<>./?@#$%^&*_"~''',
stop_words = nltk.corpus.stopwords.words('english') ) -> list:
    for x in text.lower():
        if x in punctuations:
            text = text.replace(x, "")
    #removing numbers
    text = re.sub(r'\w*\d\w*', '', text)
    
    #removing digits
    text = re.sub(r'[0-9]+', '', text)
    #removing whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    #lowercase
    text = text.lower()
    #converting the text to a list
    text = text.split(' ')
    #dropping empty strings
    text = [x for x in text if x!='']
    #dropping stopwords
    text = [x for x in text if x not in stop_words]
    
    return text
#function that finds most similar word based on Eucledian distance or cosine distance
def euclidean(vec1:np.array, vec2:np.array) -> float:
    #calculating the euclidean distance between two vectors
    return np.sqrt(np.sum((vec1-vec2)**2))
def find_similar(word:str, embedding_dict:dict, top_n=10) -> list:
    #creating a method that finds the most similar words based on learned embeddings
    dist_dict = {}
    word_vector = embedding_dict.get(word, [])
    if len(word_vector) > 0:
        for key, value in embedding_dict.items():
            if key!=word:
                dist = euclidean(word_vector, value)
                dist_dict.update({
                    key: dist
                })
        return sorted(dist_dict.items(), key=lambda x: x[1])[0:top_n]
    
    
    
    
    

In [37]:
data = pd.read_csv('/kaggle/input/quantum-physics-articles-on-arxiv-1994-to-2009/ArXiv_old.csv', error_bad_lines=False)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27738 entries, 0 to 27737
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   title       27738 non-null  object
 1   abstract    27738 non-null  object
 2   categories  27738 non-null  object
 3   created     27738 non-null  object
 4   id          27738 non-null  object
 5   doi         17184 non-null  object
dtypes: object(6)
memory usage: 1.3+ MB


In [38]:
data = data.loc[:100]
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101 entries, 0 to 100
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   title       101 non-null    object
 1   abstract    101 non-null    object
 2   categories  101 non-null    object
 3   created     101 non-null    object
 4   id          101 non-null    object
 5   doi         66 non-null     object
dtypes: object(6)
memory usage: 4.9+ KB


In [39]:
data.head()

Unnamed: 0,title,abstract,categories,created,id,doi
0,A limit relation for entropy and channel capac...,"In a quantum mechanical model, Diosi, Feldmann...","['quant-ph', 'cs.IT', 'math.IT']",2007-04-01,704.0046,10.1063/1.2779138
1,Visualizing Teleportation,A novel way of picturing the processing of qua...,"['physics.ed-ph', 'quant-ph']",2007-04-02,704.0051,
2,Lower ground state due to counter-rotating wav...,We consider a single ion confined in a trap un...,['quant-ph'],2007-04-01,704.0117,10.1088/0953-4075/40/11/002
3,A Single Trapped Ion as a Time-Dependent Harmo...,We show how a single trapped ion may be used t...,['quant-ph'],2007-04-02,704.0135,10.1103/PhysRevA.76.052105
4,"Topological defects, geometric phases, and the...",Recent reports on the intriguing features of v...,['quant-ph'],2007-04-02,704.0137,


In [40]:
#texts = data.abstract

texts = [x for x in data['abstract']]
#defining the window size of the context
window = 5
#creating a placeholder for the scanning of the wordlist
word_lists = []
all_text = []

for text in texts:
    text = text_preprocessing(text)
    all_text += text
    #creating a context dictionary
    for i, word in enumerate(text):
        for w in range(window):
            #obtaining context before the window words
            if i + 1+ w <len(text):
                word_lists.append([word] + [text[(i + 1 + w)]])
            #obtaining context behind the window word
            if i - w - 1 >= 0:
                word_lists.append([word] + [text[(i-w-1)]])
unique_word_dict = create_unique_word_dict(all_text)
#defining the number of features(our unique words)
n_words = len(unique_word_dict)
#obtaining all the uniue words
words = list(unique_word_dict.keys())
#creation of X and Y matrices using OHE
X = []
Y = []
for i, word_list in tqdm(enumerate(word_lists)):
    #obtaining indices
    main_word_index = unique_word_dict.get(word_list[0])
    context_word_index = unique_word_dict.get(word_list[1])
    #creating placeholders
    X_row = np.zeros(n_words)
    Y_row = np.zeros(n_words)
    
    #OHEing the main word
    X_row[main_word_index] = 1
    #OHEing the y matrix words
    Y_row[context_word_index] = 1
    
    #appending the main matrices
    X.append(X_row)
    Y.append(Y_row)
    


59750it [00:03, 18172.51it/s]


In [41]:
#converting the matrices to sparse format to adress the majority of the zeros in the data
X = sparse.csr_matrix(X)
Y = sparse.csr_matrix(Y)

In [42]:
#defining the embedding size
emb_size = 100
#creating the neural network
inp = Input(shape = (X.shape[1], ))
x = Dense(units = emb_size, activation = 'linear')(inp)
x = Dense(units = Y.shape[1], activation='softmax')(x)
model = Model(inputs=inp, output=x)
model.compile(loss='categorical_crossentropy', optimizer='adam')
#optimization of the network weights
model.fit(x=X, y=Y, batch_size=256, epochs=150)


  import sys


TypeError: unsupported operand type(s) for +: 'EarlyStopping' and 'list'

In [None]:
#obtaining the weights from the neural network
#the weights are also known as the word embeddings
weights = model.get_weights()[0]
#creating a dictionary to store the computed weights
#key is the unique word
#value is the associated vector
embedding_dict = {}
for word in words:
    embedding_dict.update({
        word: weights[unique_word_dict.get(word)]
        })
#Vizualizing the embeddings
import matplotlib.pyplot as plt
plt.figure(figsize=(10, 10))
for word in list(unique_word_dict.keys()):
  coord = embedding_dict.get(word)
  plt.scatter(coord[0], coord[1])
  plt.annotate(word, (coord[0], coord[1]))