In [1]:
import numpy as np
import pandas as pd
import pickle
import torch
from tqdm.auto import tqdm, trange

from build_embeddings_pdf import embedding
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings
from flair.data import Sentence
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the pre-built corpora from disk: one list of abstracts and one list of
# titles. NOTE: pickle.load can execute arbitrary code embedded in the file,
# so these .pkl files must come from a trusted source.
with open('abstract_list.pkl', mode='rb') as abstracts_fh:
    abstract_list = pickle.load(abstracts_fh)

with open('title_list.pkl', mode='rb') as titles_fh:
    title_list = pickle.load(titles_fh)

In [2]:
# Learn the tf-idf vocabulary and idf weights from the whole abstract corpus.
# max_df=0.9 ignores terms that appear in more than 90% of the documents,
# min_df=2 ignores terms seen in fewer than two documents, and English stop
# words are removed. fit() returns the estimator itself, so construction and
# fitting are chained.
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.9,
).fit(abstract_list)

def find_top_n(text, n):
    """Return the ``n`` vocabulary terms with the highest tf-idf weight in ``text``.

    The text is scored with the module-level, already fitted ``vectorizer``.

    Parameters
    ----------
    text : str
        Document to score.
    n : int
        Number of terms to return.

    Returns
    -------
    numpy.ndarray
        Up to ``n`` terms, ordered from highest to lowest tf-idf weight.
        The ranking runs over the entire vocabulary, so when ``text`` holds
        fewer than ``n`` known terms the remaining slots are filled with
        zero-weight vocabulary terms.
    """
    response = vectorizer.transform([text])

    # scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
    # get_feature_names(); prefer the new accessor and fall back to the old
    # one so the notebook runs on either side of that change.
    names_getter = getattr(vectorizer, 'get_feature_names_out', None)
    if names_getter is None:
        names_getter = vectorizer.get_feature_names
    feature_array = np.asarray(names_getter())

    # argsort is ascending; reverse it so the heaviest terms come first.
    tfidf_sorting = np.argsort(response.toarray()).flatten()[::-1]

    return feature_array[tfidf_sorting][:n]

In [3]:
# One short pseudo-sentence per abstract: its three top-ranked tf-idf terms
# joined by single spaces.
words_tfidf = [" ".join(find_top_n(abstract, 3)) for abstract in abstract_list]

In [4]:
# Cosine similarity reduced along dimension 0, so two 1-D vectors compare to a
# single scalar tensor; eps guards against division by a zero norm.
cos = torch.nn.CosineSimilarity(eps=1e-6, dim=0)

In [5]:
# Build the embedding model used by get_embedding(). `embedding` is the
# project-local factory imported from build_embeddings_pdf; presumably it
# returns a flair document embedding — confirm in that module.
e4 = embedding()

In [6]:
def get_embedding(text):
    """Embed ``text`` with the module-level ``e4`` model and return the result.

    The text is wrapped in a flair ``Sentence``; ``e4.embed`` is expected to
    attach the embedding to that sentence, and the sentence's ``embedding``
    attribute is returned (presumably a torch tensor, since the result is
    later passed to torch's cosine similarity).
    """
    wrapped = Sentence(text)
    e4.embed(wrapped)
    return wrapped.embedding

In [7]:
# Reference embeddings for the candidate labels.
# Manufacturing processes: label_1/label_2 are powder-bed laser phrases,
# label_3/label_4/label_5 are extrusion phrases.
label_1, label_2, label_3, label_4, label_5 = (
    get_embedding(phrase)
    for phrase in (
        'selective laser melting',
        'direct metal laser sintering',
        'fused deposition modeling',
        'fused filament fabrication',
        'extrusion based additive manufacturing',
    )
)

# Material classes.
metal, ceramic, polymer = (
    get_embedding(material) for material in ('Metal', 'Ceramic', 'Polymer')
)



In [8]:
# Embed every tf-idf pseudo-sentence, in corpus order; tqdm reports progress.
tfidf_sentences = list(map(get_embedding, tqdm(words_tfidf)))

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




In [9]:
# Label every document by the cosine similarity between its embedding and a
# set of reference label embeddings: once for the manufacturing process
# (label_list_1) and once for the material class (label_list_2).

# Output strings for the two process families.
_PROCESS_SLM_DMLS = 'Selective laser melting (SLM) or direct metal laser sintering (DMLS)'
_PROCESS_FDM_FFF = 'Fused deposition modeling (FDM) or fused filament fabrication (FFF) or extrusion based additive manufacturing'


def _closest_label(doc_embedding, candidates):
    """Return the label of the candidate most cosine-similar to ``doc_embedding``.

    Parameters
    ----------
    doc_embedding
        Embedding of one document, as accepted by the module-level ``cos``.
    candidates : sequence of (reference_embedding, label) pairs
        Reference embeddings and the label each one stands for.

    Returns
    -------
    The label paired with the highest-scoring reference. On ties the earliest
    candidate wins.
    """
    # float() needs each similarity to be a single-element tensor — assumes
    # the embeddings are 1-D vectors (cos reduces along dim 0).
    scores = [float(cos(doc_embedding, reference)) for reference, _ in candidates]
    # max() keeps the first maximal index, so ties resolve to the earliest entry.
    best = max(range(len(scores)), key=scores.__getitem__)
    return candidates[best][1]


# Several reference phrases may share one output label; the winning phrase
# decides which family the document is assigned to.
_process_candidates = [
    (label_1, _PROCESS_SLM_DMLS),
    (label_2, _PROCESS_SLM_DMLS),
    (label_3, _PROCESS_FDM_FFF),
    (label_4, _PROCESS_FDM_FFF),
    (label_5, _PROCESS_FDM_FFF),
]
_material_candidates = [
    (metal, 'Metal'),
    (ceramic, 'Ceramic'),
    (polymer, 'Polymer'),
]

label_list_1 = [_closest_label(vec, _process_candidates) for vec in tfidf_sentences]
label_list_2 = [_closest_label(vec, _material_candidates) for vec in tfidf_sentences]

In [11]:
# Assemble the results: one row per abstract with its process label and its
# material label.
d = {
    'Documents': abstract_list,
    'Labels 1': label_list_1,
    'Labels 2': label_list_2,
}
df = pd.DataFrame(data=d)

In [12]:
# Preview the first rows of the labelled frame (head() shows 5 rows by default).
df.head()

Unnamed: 0,Documents,Labels 1,Labels 2
0,we study the electronic states of giant single...,Selective laser melting (SLM) or direct metal ...,Polymer
1,the recursion and pathintegral methods are app...,Fused deposition modeling (FDM) or fused filam...,Polymer
2,we analytically study phonon transmission and ...,Fused deposition modeling (FDM) or fused filam...,Polymer
3,we study both analytically and numerically pho...,Selective laser melting (SLM) or direct metal ...,Metal
4,we present a model for thin film growth by par...,Fused deposition modeling (FDM) or fused filam...,Polymer


In [14]:
# Number of documents assigned to each manufacturing-process label.
df["Labels 1"].value_counts()

Selective laser melting (SLM) or direct metal laser sintering (DMLS)                                             6147
Fused deposition modeling (FDM) or fused filament fabrication (FFF) or extrusion based additive manufacturing    3853
Name: Labels 1, dtype: int64

In [13]:
# Number of documents assigned to each material label.
df["Labels 2"].value_counts()

Polymer    5568
Metal      2587
Ceramic    1845
Name: Labels 2, dtype: int64