In [1]:
import numpy as np
import pandas as pd
import pickle
import torch
from tqdm.auto import tqdm, trange

from build_embeddings_pdf import embedding
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings
from flair.data import Sentence
from sklearn.feature_extraction.text import TfidfVectorizer

with open('abstract_list.pkl', 'rb') as f:
    abstract_list = pickle.load(f)
with open('title_list.pkl', 'rb') as f:
    title_list = pickle.load(f)

In [2]:
vectorizer = TfidfVectorizer(max_df=0.9,min_df=2,stop_words='english')
vectorizer.fit(abstract_list)

# returns top n elements with most tf-idf values
def find_top_n(text,n):

    response = vectorizer.transform([text])

    feature_array = np.array(vectorizer.get_feature_names())
    tfidf_sorting = np.argsort(response.toarray()).flatten()[::-1]

    return feature_array[tfidf_sorting][:n]

In [3]:
words_tfidf = []

# Concatenating tf-idf words into sentences
for text in abstract_list:
    words_tfidf.append(" ".join(find_top_n(text,3)))

In [4]:
cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6)

In [5]:
# Creating flair embedding
e4 = embedding()

In [6]:
# returns flair embedding of a text in torch tensor format
def get_embedding(text):
    sentence = Sentence(text)
    e4.embed(sentence)
    return sentence.embedding

In [7]:
# labels to find for each document
label_1 = get_embedding('selective laser melting')
label_2 = get_embedding('direct metal laser sintering')

label_3 = get_embedding('fused deposition modeling')
label_4 = get_embedding('fused filament fabrication')
label_5 = get_embedding('extrusion based additive manufacturing')

metal = get_embedding('metal')
ceramic = get_embedding('ceramic')
polymer = get_embedding('polymer')



In [8]:
# embeddings for concatenated tf-idf sentences
tfidf_sentences = [get_embedding(text) for text in tqdm(words_tfidf)]

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




Finding the cosine similarities between documents and labels

In [9]:
# Labels : How materials are produced
# Material : The material type after producuction

# SLM : Selective laser melting
# DMLS : Direct metal laser sintering
# FDM : Fused deposition modeling 
# FFF : Fused filament fabrication
# EAM: Extrusion-based additive manufacturing
production = []
production_cos_score = []

# Metal
# Polymer 
# Ceramic  
material = []  
material_cos_score = []

In [10]:
# Labeling for production type
for text in tfidf_sentences:

    lbl= []
    
    lbl.append(cos(text,label_1))
    lbl.append(cos(text,label_2))
    lbl.append(cos(text,label_3))
    lbl.append(cos(text,label_4))
    lbl.append(cos(text,label_5))

    if lbl.index(max(lbl))==0 or lbl.index(max(lbl))==1:
        production.append('SLM or DMLS')
        production_cos_score.append(max(lbl))
    else:
        production.append('FDM or FFF or EAM')
        production_cos_score.append(max(lbl))

In [11]:
# Finding cosine scores
production_cos_score = list(map(float,production_cos_score))
production_cos_score = list(map(lambda x: round(x,2), production_cos_score)) 

In [12]:
# Labeling for material type
for text in tfidf_sentences:

    lbl= []
    
    lbl.append(cos(text,metal))
    lbl.append(cos(text,ceramic))
    lbl.append(cos(text,polymer))

    if lbl.index(max(lbl))==0:
        material.append('Metal')
        material_cos_score.append(max(lbl))
    elif lbl.index(max(lbl))==1:
        material.append('Ceramic')
        material_cos_score.append(max(lbl))
    else:
        material.append('Polymer')
        material_cos_score.append(max(lbl))

In [13]:
# Finding cosine scores
material_cos_score = list(map(float,material_cos_score))
material_cos_score = list(map(lambda x: round(x,2), material_cos_score)) 

In [14]:
d = {'Documents/Abstracts':abstract_list,'Production':production,'P_score':production_cos_score,'Material':material,'M_score':material_cos_score}
df = pd.DataFrame(d)

In [15]:
df["Production/Material"] = df["Production"] + " / " + df["Material"]

In [16]:
df.head(20)

Unnamed: 0,Documents/Abstracts,Production,P_score,Material,M_score,Production/Material
0,we study the electronic states of giant single...,SLM or DMLS,0.51,Polymer,0.4,SLM or DMLS / Polymer
1,the recursion and pathintegral methods are app...,FDM or FFF or EAM,0.55,Polymer,0.38,FDM or FFF or EAM / Polymer
2,we analytically study phonon transmission and ...,FDM or FFF or EAM,0.58,Polymer,0.33,FDM or FFF or EAM / Polymer
3,we study both analytically and numerically pho...,SLM or DMLS,0.59,Metal,0.43,SLM or DMLS / Metal
4,we present a model for thin film growth by par...,FDM or FFF or EAM,0.6,Polymer,0.38,FDM or FFF or EAM / Polymer
5,we study a class of models for brittle fractur...,SLM or DMLS,0.5,Metal,0.37,SLM or DMLS / Metal
6,the structure of co adsorbates on the surface ...,FDM or FFF or EAM,0.44,Ceramic,0.27,FDM or FFF or EAM / Ceramic
7,a firstprinciples atomic orbitalbased electron...,SLM or DMLS,0.46,Metal,0.39,SLM or DMLS / Metal
8,the traditional magnetic storage mechanisms bo...,SLM or DMLS,0.51,Ceramic,0.37,SLM or DMLS / Ceramic
9,zn in gan forms an efficient radiative center ...,FDM or FFF or EAM,0.41,Polymer,0.32,FDM or FFF or EAM / Polymer


In [17]:
df["Production"].value_counts()

SLM or DMLS          6147
FDM or FFF or EAM    3853
Name: Production, dtype: int64

In [18]:
df["Material"].value_counts()

Polymer    5866
Metal      2450
Ceramic    1684
Name: Material, dtype: int64

In [19]:
df[df['M_score']== max(df['M_score'])]

Unnamed: 0,Documents/Abstracts,Production,P_score,Material,M_score,Production/Material
9955,pfo powders in hexagonal structure have been s...,FDM or FFF or EAM,0.59,Ceramic,0.85,FDM or FFF or EAM / Ceramic


In [20]:
max_material = df.iloc[9955]['Documents/Abstracts']
df.iloc[9955]['Documents/Abstracts']

'pfo powders in hexagonal structure have been synthesized by solgel process using lead acetate glycerin and ferric acetylacetonate as the precursor ceramics were obtained by sintering the powders at c for hour distorted flaky hexahedron grains are frequently observed in the sem images of sintered ceramics large spontaneous polarization was observed in ceramic at room temperature exhibiting a clear ferroelectric hysteresis loop the remnant polarization of ceramic is estimated to be the distortion of hexahedron grains as well as the fe oxygen octahedron in its perovskitelike hexagonal unit cell is proposed to be the origin of polarization in ceramics meanwhile ceramics demonstrate strong ferromagnetism at room temperature simultaneous occurrence of large ferroelectricity and strong ferromagnetism in ceramics holds promise for its application in new generation of electronic devices as a practical multiferroic candidate in single phase'

In [21]:
import re

len(re.findall('ceramic',max_material)) # 7 ceramic in abstract with most score

7

In [22]:
df[df['P_score']== max(df['P_score'])]

Unnamed: 0,Documents/Abstracts,Production,P_score,Material,M_score,Production/Material
2686,pulsed arf laser annealing in air and in hydro...,SLM or DMLS,0.75,Ceramic,0.5,SLM or DMLS / Ceramic


In [23]:
max_production = df.iloc[2686]['Documents/Abstracts']
df.iloc[2686]['Documents/Abstracts']

'pulsed arf laser annealing in air and in hydrogen atmosphere improves the optical properties of zno nanostructured films independently on the ambient atmosphere laser annealing produces two major effects on the photoluminescence pl spectra first the efficiency of the exciton pl increases due to decrease of the number of nonradiative recombination centers second the intensity of the defectrelated orange band decreases because of the removing of excessive oxygen trapped into the films during deposition however annealing in the ambient air also increases the intensity of the green band related to oxygen vacancies we show that the combination of laser annealing and passivation of oxygen vacancies by hydrogen results in films free of defectrelated emission and keeps intact their nanostructural character'