In [None]:
#imports
import re
import os
import csv
import string
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.tokenize import sent_tokenize 
import spacy
import pandas as pd
import numpy as np
import time
from tqdm import tqdm, trange
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import gensim
from gensim.utils import simple_preprocess
import gensim.corpora as corpora
from pprint import pprint
from numpy.random import seed
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
import keras
from keras import backend as K
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from itertools import chain
from tensorflow.keras import Model,Input
from tensorflow.keras.layers import LSTM,Embedding,Dense
from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D,Bidirectional
from tensorflow.keras.callbacks import TensorBoard


# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8*" and
# 3.8 removed the old "seaborn" alias, so prefer the new name when it is
# available and fall back to the legacy name on older installations.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
# Default figure size (inches) for every plot in this notebook.
plt.rcParams["figure.figsize"] = (16,10)
# spaCy pipeline; get_wv_sim uses it to compute document similarity.
nlp = spacy.load("en_core_web_lg")


In [2]:
## This cell is intentionally left blank; use it for comments or notes.

In [4]:
# System training with a deep-learning LSTM model; to use it, make sure all the code in this cell is uncommented.
# Precision, recall and F1 metrics were removed from Keras in version 2.0, so they need to be calculated manually.

def recall_m(y_true, y_pred):
    """Recall metric built from Keras backend ops.

    The element-wise product ``y_true * y_pred`` and ``y_true`` itself are
    each clipped to [0, 1], rounded and summed; the first sum is divided by
    the second, with ``K.epsilon()`` added to the denominator so an all-zero
    ``y_true`` does not divide by zero.
    """
    hit_count = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    actual_count = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hit_count / (actual_count + K.epsilon())

def precision_m(y_true, y_pred):
    """Precision metric built from Keras backend ops.

    The element-wise product ``y_true * y_pred`` and ``y_pred`` itself are
    each clipped to [0, 1], rounded and summed; the first sum is divided by
    the second, with ``K.epsilon()`` added to the denominator so an all-zero
    prediction does not divide by zero.
    """
    hit_count = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_count = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hit_count / (predicted_count + K.epsilon())

def f1_m(y_true, y_pred):
    """F1 metric: twice the product of precision and recall over their sum.

    Precision and recall come from ``precision_m`` and ``recall_m``;
    ``K.epsilon()`` is added to the denominator to avoid division by zero.
    """
    p_val = precision_m(y_true, y_pred)
    r_val = recall_m(y_true, y_pred)
    ratio = (p_val * r_val) / (p_val + r_val + K.epsilon())
    return 2 * ratio

# ---- Load the annotated NER dataset -------------------------------------
file = "EDLC_ner_dataset_V3.csv"
data = pd.read_csv(file, encoding = "utf-8")

# Drop the docId column, it is not needed.
# The column is named by keyword because pandas 2.0 no longer accepts the
# axis as a positional argument (`data.drop('docId', 1)` raises TypeError).
data = data.drop(columns = 'docId')


# print(data.head())

# Vocabulary in order of first appearance. `Series.unique()` is used instead
# of `set()` because the iteration order of a set of strings changes from
# one interpreter start to the next, which would silently change every word
# index and make a previously saved model unusable.
words = data["Word"].unique().tolist()
words.append("ENDPAD")  # extra token reserved for sequence padding
num_words = len(words)

# print(f"Total number of unique words in dataset: {num_words}")

# Tag inventory, likewise in order of first appearance (deterministic).
tags = data["Tag"].unique().tolist()
num_tags = len(tags)
# print("List of tags: " + ', '.join([tag for tag in tags]))
# print(f"Total Number of tags {num_tags}")

class Get_sentence(object):
    """Group a token-level frame into sentences of (word, tag) pairs.

    The frame must provide "SentenceID", "Word" and "Tag" columns.

    Attributes:
        n_sent: counter initialised to 1 (not used inside this class).
        data: the frame passed to the constructor.
        grouped: result of grouping by "SentenceID"; one list of
            (word, tag) tuples per sentence id.
        sentences: the values of ``grouped`` as a plain list.
    """
    def __init__(self,data):
        def pair_words_with_tags(group):
            # One (word, tag) tuple per row of the sentence group.
            return list(zip(group["Word"].values.tolist(),
                            group["Tag"].values.tolist()))

        self.n_sent = 1
        self.data = data
        self.grouped = self.data.groupby("SentenceID").apply(pair_words_with_tags)
        self.sentences = list(self.grouped)
        
        
# ---- Encode sentences, build and train the BiLSTM tagger -----------------
getter = Get_sentence(data)
sentence = getter.sentences


# Word ids start at 1, so id 0 is never produced. "ENDPAD" is the last entry
# of `words` and therefore receives id == num_words.
word_idx = {w : i + 1 for i ,w in enumerate(words)}
tag_idx =  {t : i for i ,t in enumerate(tags)}

# Print only the size: dumping the whole dictionary floods the cell output.
print("Vocabulary size (including ENDPAD): ", len(word_idx))
# print(tag_idx)

max_len = 90
# Pad with the id of the dedicated "ENDPAD" token. The previous value,
# num_words - 1, is the id of the last *real* word, so every padded position
# was indistinguishable from that word.
pad_word_id = word_idx["ENDPAD"]
X = [[word_idx[w[0]] for w in s] for s in sentence]
X = pad_sequences(maxlen = max_len, sequences = X, padding = 'post', value = pad_word_id)

# Tag sequences are padded with the id of the 'O' tag, then one-hot encoded.
y = [[tag_idx[w[1]] for w in s] for s in sentence]
y = pad_sequences(maxlen = max_len, sequences = y, padding = 'post', value = tag_idx['O'])
y = [to_categorical(i, num_classes = num_tags) for i in  y]

x_train,x_test,y_train,y_test = train_test_split(X, y,test_size = 0.1, random_state = 1)

input_word = Input(shape = (max_len,))
# input_dim must be larger than the largest id fed to the layer; ids run up
# to num_words (the ENDPAD id), hence num_words + 1.
# NOTE(review): output_dim reuses max_len as the embedding width; that looks
# coincidental rather than required - confirm.
model = Embedding(input_dim = num_words + 1, output_dim = max_len, input_length = max_len)(input_word)
model = SpatialDropout1D(0.01)(model)
model = Bidirectional(LSTM(units = 200,return_sequences = True, recurrent_dropout = 0.01))(model)
out = TimeDistributed(Dense(num_tags,activation = 'softmax'))(model)
model = Model(input_word,out)
model.compile(optimizer = 'adam',loss = 'categorical_crossentropy',metrics = ['accuracy',precision_m, recall_m, f1_m])

tensorboard_cbk = TensorBoard(log_dir="logs/")
history = model.fit(x_train, np.array(y_train), batch_size = 16, verbose = 1, epochs = 1, validation_split = 0.2,callbacks=[tensorboard_cbk])


# ---- Evaluate on the held-out split and show one random example ----------
print("Model Evaluation \n===========================\n")
model.evaluate(x_test, np.array(y_test))

# Pick a random held-out sentence and take the arg-max tag id per position.
rand_sent = np.random.randint(0, x_test.shape[0])
p = np.argmax(model.predict(np.array([x_test[rand_sent]])), axis = -1)

# Gold tag ids of the same sentence, recovered from the one-hot encoding.
y_true = np.argmax(np.array(y_test), axis = -1)[rand_sent]

header_line = "{:20}{:20}\t{}\n".format("Word", "True", "Pred")
row_template = "{:20}{:20}\t{}"
print(header_line)
print("-" * 55)

# Word ids are 1-based, hence the `w - 1` when looking up `words`.
for w, t, pred in zip(x_test[rand_sent], y_true, p[0]):
    print(row_template.format(words[w - 1], tags[t], tags[pred]))
    



## Plotting all the metrics in a single figure
pd.DataFrame(history.history).plot(figsize=(16,10))
plt.show()

# One figure per metric, training curve against validation curve.
# The second curve comes from the `validation_split` of model.fit, not from
# x_test, so it is labelled 'validation' rather than 'test'.
for metric in ('accuracy', 'loss'):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title('model ' + metric)
    plt.ylabel(metric)
    plt.xlabel('epoch')
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()



###########################################################################################################
#                                        Saving and loading model                                         #
###########################################################################################################

### A model compiled with the custom metrics cannot be restored without extra handling, so the model is
### re-compiled with only accuracy and loss, then fitted again and saved.

# model.compile(optimizer = 'adam',loss = 'categorical_crossentropy',metrics = ['accuracy'])

# model.fit(x_train, np.array(y_train), batch_size = 16, verbose = 1, epochs = 3, validation_split = 0.2)

# model.evaluate(x_test, np.array(y_test))


# ## Save model
# model.save('matrec.h5')


###########################################################################################################
#                                        Restoring a Saved model                                          #
###########################################################################################################

## Restoring a saved model and checking it.
## Model restoration is currently switched off. To load a model, uncomment the following lines and provide the saved model's file name.


# new_model = keras.models.load_model('matrec.h5')
# print("Loaded Model Evaluation \n===========================\n")
# new_model.evaluate(x_test, np.array(y_test))


############## Model saving and restoration ends! ################################





In [None]:
def clean_text(text):
    """Apply a fixed sequence of regex substitutions to ``text`` and return it.

    The rules run strictly in the order listed below; each one operates on
    the output of the previous one.
    """
    substitution_rules = (
        # Bracketed numeric markers such as "[1]", "[12]" or "[12,34]".
        (r"(\[[0-9].?\])|(\[[0-9].?.\])|(\[[0-9][0-9],[0-9][0-9]\])", ""),
        # Collapse runs of two or more whitespace characters.
        (r"\s{2,}", " "),
        # NOTE(review): removes "<capital><whitespace><capital>" entirely,
        # e.g. "M L" inside "1 M LiPF6" - confirm this is intended.
        (r"[A-Z]\s[A-Z]", ""),
        # Drop a single whitespace between digits and capitals / digits.
        (r"(?<=[A-Z].)\s(?=[0-9])|(?<=[0-9])\s(?=[A-Z])|(?<=[A-Z])\s(?=[0-9])|(?<=\d)\s(?=\d)", ""),
        # Strip brackets and assorted punctuation, one character per rule.
        (r"\(", ""),
        (r"\)", ""),
        (r"\[", ""),
        (r"\]", ""),
        (r",", ""),
        (r";", ""),
        (r"~", ""),
        (r"'", ""),
        # Character replacements; presumably repairs for mis-encoded symbols
        # in the extracted text - TODO confirm.
        (r"\sÀ", "-"),
        (r"À", "-"),
        (r"Á", "ση"),
        (r"•", "°"),
        # Typographic quotes and the ellipsis character.
        (r'[‘’“”…]', ''),
        # The word "figure" in its three common casings.
        (r"Figure", ''),
        (r"figure", ''),
        (r"FIGURE", ''),
    )
    for pattern, replacement in substitution_rules:
        text = re.sub(pattern, replacement, text)
    return text

In [5]:
# prediction method for any text input, will be used for each sentence after the cleaning method is applied
# This is for deep learning model

def create_test_input_from_text(text):
    """Tag the in-vocabulary tokens of ``text`` with the trained model.

    The text is cleaned with ``clean_text``, stripped of "." and "/",
    tokenised with ``word_tokenize`` and every token found in ``word_idx``
    is fed to ``model.predict`` as one sequence. Tokens missing from the
    vocabulary are skipped and counted in a single summary line.

    Args:
        text: raw input text.

    Returns:
        A list of ``(word, predicted_tag)`` tuples, one per in-vocabulary
        token, in input order. Empty when no token is in the vocabulary.

    NOTE(review): the sequence is passed to the model at its natural length
    (no padding or truncation to ``max_len``), exactly as before - confirm
    the model accepts lengths other than the one it was built with.
    """
    text = clean_text(text)
    text = re.sub(r"\.","",text)
    text = re.sub(r"\/","",text)
    word_list = word_tokenize(text)
#     word_list = text.split(" ")
#     word_list = pad_sequences(maxlen = max_len, sequences = word_list, padding = 'post', value = num_words - 1)

    # Keep the surviving words next to their ids. Indexing `word_list` by
    # position paired predictions with the wrong words as soon as one
    # out-of-vocabulary token had been skipped.
    known_words = []
    x_new = []
    skipped_count = 0
    for word in word_list:
        word_id = word_idx.get(word)
        if word_id is None:
            skipped_count += 1
        else:
            known_words.append(word)
            x_new.append(word_id)

    if skipped_count:
        print("Skipped {} out-of-vocabulary token(s)".format(skipped_count))

    # An empty sequence cannot be fed to the model.
    if not x_new:
        print("No in-vocabulary tokens found; nothing to predict.")
        return []

    p = model.predict(np.array([x_new]))
    p = np.argmax(p, axis = -1)
    print("{:20}\t{}\n".format("Word", "Prediction"))
    print("-" * 35)

    results = [(word, tags[pred]) for word, pred in zip(known_words, p[0])]
    for word, tag in results:
        print("{:20}\t{}".format(word, tag))

    print(word_list,'\n=====\n',x_new)
    return results

# test_inputs = "Lithium bistrifluoromethane sulfonyl imide LiTFSI salt are potentially a good alternative to LiPF6 since it could both improve the chemical and thermal stability as salt for electrolyte"
# create_test_input_from_text(test_inputs)

# test_inputs = "Li metal foil as the counter electrode and 1 M LiPF6 solution with ethylene carbonate-dimethyl carbonate as the electrolyte"
# create_test_input_from_text(test_inputs)

In [8]:

def get_wv_sim(doc1,doc2):
    """Return the spaCy similarity between two texts.

    Both strings are run through the module-level ``nlp`` pipeline and the
    first document's ``similarity`` with the second is returned.
    """
    parsed_first = nlp(doc1)
    parsed_second = nlp(doc2)
    return parsed_first.similarity(parsed_second)


def get_primary_kw_sim(text, kw_path = "primary-kw.txt"):
    """Compare ``text`` with the primary-keyword file using three measures.

    Args:
        text: the document to score; used as-is (it is not cleaned here).
        kw_path: path of the UTF-8 keyword file; its content is passed
            through ``clean_text`` before comparison. Defaults to
            "primary-kw.txt" in the working directory.

    Returns:
        A tuple ``(jaccard_sim, cosine_sim, wv_sim)``.

    NOTE(review): ``get_jaccard_sim`` and ``get_cosine_sim`` are not defined
    in the visible part of this notebook - confirm they exist before this
    function is called.
    """
    # The context manager closes the file even if a similarity call raises.
    with open(kw_path, "r", encoding = "utf-8") as doc2_f:
        doc2 = clean_text(doc2_f.read())
    doc1 = text
#     doc1 = clean_text(doc1)
    jaccard_sim = get_jaccard_sim(doc1,doc2)
    cosine_sim = get_cosine_sim(doc1,doc2)
    wv_sim = get_wv_sim(doc1,doc2)
    return jaccard_sim, cosine_sim, wv_sim



In [9]:
## The path variable defines the folder from which the text files are loaded.
## get_filenames takes only the file extension (for example 'txt');
## build the full file name afterwards with
## filename = path + "/" + file
file_path = "pred-test"
def get_filenames(file_extension, root = None):
    """List the files below a folder whose names end with an extension.

    Args:
        file_extension: extension to look for, with or without the leading
            dot (``'txt'`` and ``'.txt'`` are equivalent).
        root: folder to search recursively; defaults to the module-level
            ``file_path``.

    Returns:
        A sorted list of paths relative to the searched folder. A file that
        sits directly in the folder is returned as its bare name; a file in
        a sub-folder keeps the sub-folder prefix, so joining the result
        onto the folder always yields an existing path.
    """
    base = file_path if root is None else root
    suffix = "." + file_extension.lstrip(".")
    files = []
    for dirpath, _dirnames, filenames in os.walk(base):
        for name in filenames:
            # endswith, not `in`: "notes.txt.bak" is not a .txt file.
            if name.endswith(suffix):
                files.append(os.path.relpath(os.path.join(dirpath, name), base))
    # os.walk order is platform dependent; sort for reproducible runs.
    return sorted(files)

In [None]:
# Collect the .txt inputs found under file_path and list them, one per line.
files_list = get_filenames('txt')

for files in files_list:
    print(files)


In [None]:
# Run the tagger over every collected text file.
for file in files_list:
    # The context manager closes each handle once it has been read; the
    # previous version left one open file per iteration.
    with open(os.path.join(file_path, file), "r", encoding="utf-8") as file_handler:
        text_content = file_handler.read()
    create_test_input_from_text(text_content)
print("Done!")