In [1]:
import numpy as np
import pandas as pd
import pickle
import torch
from tqdm.auto import tqdm, trange

from utils import embedding, find_top_n , tfidf_sentences, production_labeling, material_labeling, feature_labeling, get_embedding 

from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings
from flair.data import Sentence
from sklearn.feature_extraction.text import TfidfVectorizer

# Read the pickled abstract and title lists from the working directory.
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open('abstract_list.pkl', 'rb') as abstracts_file:
    abstract_list = pickle.load(abstracts_file)
with open('title_list.pkl', 'rb') as titles_file:
    title_list = pickle.load(titles_file)

In [2]:
# Fit the TF-IDF vocabulary on the abstracts. English stop words are dropped,
# as are terms appearing in more than 90% of documents or in fewer than 2 documents.
vectorizer = TfidfVectorizer(stop_words='english', min_df=2, max_df=0.9)
vectorizer.fit(abstract_list)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.9, max_features=None,
                min_df=2, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [3]:
# Cosine similarity module that reduces along dimension 0 of its two inputs;
# eps guards against division by zero for (near-)zero vectors.
cos = torch.nn.CosineSimilarity(eps=1e-6, dim=0)

In [4]:
# Creating flair embedding
#e4 = embedding()

In [5]:
# returns flair embedding of a text in torch tensor format
#def get_embedding(text):
#    sentence = Sentence(text)
#    e4.embed(sentence)
#    return sentence.embedding

In [6]:
#tfidf = []

# Concatenating tf-idf words into sentences
#for text in abstract_list:
#    tfidf.append(" ".join(find_top_n(text,vectorizer,3)))

In [7]:
# embeddings for concatenated tf-idf sentences
#tfidf_sentences = [get_embedding(text) for text in tqdm(words_tfidf)]

### Finding the cosine similarities between documents and labels

* **Labels** : How materials are produced
* **Material** : The material type after production


* **SLM** : Selective laser melting
* **DMLS** : Direct metal laser sintering
* **FDM** : Fused deposition modeling 
* **FFF** : Fused filament fabrication
* **EAM** : Extrusion-based additive manufacturing


* Metal
* Polymer 
* Ceramic  

In [8]:
# Embed every candidate label phrase once; the resulting vectors are passed to
# the *_labeling helpers in later cells.

# Production-process labels
production_phrases = (
    'selective laser melting',
    'direct metal laser sintering',
    'fused deposition modeling',
    'fused filament fabrication',
    'extrusion based additive manufacturing',
)
label_1, label_2, label_3, label_4, label_5 = map(get_embedding, production_phrases)

# Material-type labels
metal, ceramic, polymer = map(get_embedding, ('metal', 'ceramic', 'polymer'))

# Mechanical-property (feature) labels
feature_phrases = (
    'fracture toughness',
    'tensile strength',
    'yield strength',
    'elastic modulus',
    'strain fracture break',
    'weibull modulus',
)
feature_1, feature_2, feature_3, feature_4, feature_5, feature_6 = map(get_embedding, feature_phrases)



In [9]:
# Build the per-abstract representation that the labeling helpers compare against the labels.
# NOTE(review): the third argument is presumably the number of top TF-IDF words
# kept per abstract (cf. find_top_n(text, vectorizer, 3) in the commented-out cell) — confirm in utils.
TOP_N_TFIDF_WORDS = 3
sentences = tfidf_sentences(abstract_list, vectorizer, TOP_N_TFIDF_WORDS)

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




In [10]:
# Assign a production-process label (and its cosine score) to every document.
production_labels = (label_1, label_2, label_3, label_4, label_5)
production, production_cos_score = production_labeling(sentences, cos, *production_labels)

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




In [11]:
# Assign a material label (and its cosine score) to every document.
material_labels = (metal, ceramic, polymer)
material, material_cos_score = material_labeling(sentences, cos, *material_labels)

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




In [12]:
# Collect the abstracts with their production/material labels and scores in one frame.
d = dict(zip(
    ['Documents/Abstracts', 'Production', 'P_score', 'Material', 'M_score'],
    [abstract_list, production, production_cos_score, material, material_cos_score],
))
df = pd.DataFrame(d)

In [13]:
# Combined two-level label, e.g. "SLM or DMLS / Metal".
separator = " / "
df["Production/Material"] = df["Production"] + separator + df["Material"]

In [14]:
# Preview the first 20 labelled abstracts.
labelled_preview = df.head(n=20)
labelled_preview

Unnamed: 0,Documents/Abstracts,Production,P_score,Material,M_score,Production/Material
0,we study the electronic states of giant single...,SLM or DMLS,0.51,Polymer,0.4,SLM or DMLS / Polymer
1,the recursion and pathintegral methods are app...,FDM or FFF or EAM,0.55,Polymer,0.38,FDM or FFF or EAM / Polymer
2,we analytically study phonon transmission and ...,FDM or FFF or EAM,0.58,Polymer,0.33,FDM or FFF or EAM / Polymer
3,we study both analytically and numerically pho...,SLM or DMLS,0.59,Metal,0.43,SLM or DMLS / Metal
4,we present a model for thin film growth by par...,FDM or FFF or EAM,0.6,Polymer,0.38,FDM or FFF or EAM / Polymer
5,we study a class of models for brittle fractur...,SLM or DMLS,0.5,Metal,0.37,SLM or DMLS / Metal
6,the structure of co adsorbates on the surface ...,FDM or FFF or EAM,0.44,Ceramic,0.27,FDM or FFF or EAM / Ceramic
7,a firstprinciples atomic orbitalbased electron...,SLM or DMLS,0.46,Metal,0.39,SLM or DMLS / Metal
8,the traditional magnetic storage mechanisms bo...,SLM or DMLS,0.51,Ceramic,0.37,SLM or DMLS / Ceramic
9,zn in gan forms an efficient radiative center ...,FDM or FFF or EAM,0.41,Polymer,0.32,FDM or FFF or EAM / Polymer


In [15]:
# Number of documents assigned to each production label.
production_counts = df["Production"].value_counts()
production_counts

SLM or DMLS          6147
FDM or FFF or EAM    3853
Name: Production, dtype: int64

In [16]:
# Number of documents assigned to each material label.
material_counts = df["Material"].value_counts()
material_counts

Polymer    5866
Metal      2450
Ceramic    1684
Name: Material, dtype: int64

In [17]:
# Row(s) with the highest material cosine score.
# Series.max() is vectorised and skips NaN, unlike the builtin max(), which
# iterates element by element and is order-dependent when NaN is present.
df[df['M_score'] == df['M_score'].max()]

Unnamed: 0,Documents/Abstracts,Production,P_score,Material,M_score,Production/Material
9955,pfo powders in hexagonal structure have been s...,FDM or FFF or EAM,0.59,Ceramic,0.85,FDM or FFF or EAM / Ceramic


In [18]:
# Abstract of the document with the highest material score.
# The row is looked up from the scores (first row on ties) rather than by a
# hard-coded row number, so it stays correct if the data or scores change.
max_material = df.loc[df['M_score'].idxmax(), 'Documents/Abstracts']
max_material

'pfo powders in hexagonal structure have been synthesized by solgel process using lead acetate glycerin and ferric acetylacetonate as the precursor ceramics were obtained by sintering the powders at c for hour distorted flaky hexahedron grains are frequently observed in the sem images of sintered ceramics large spontaneous polarization was observed in ceramic at room temperature exhibiting a clear ferroelectric hysteresis loop the remnant polarization of ceramic is estimated to be the distortion of hexahedron grains as well as the fe oxygen octahedron in its perovskitelike hexagonal unit cell is proposed to be the origin of polarization in ceramics meanwhile ceramics demonstrate strong ferromagnetism at room temperature simultaneous occurrence of large ferroelectricity and strong ferromagnetism in ceramics holds promise for its application in new generation of electronic devices as a practical multiferroic candidate in single phase'

In [19]:
import re

# How many times "ceramic" occurs (also inside "ceramics") in the top-scoring abstract.
ceramic_pattern = re.compile('ceramic')
len(ceramic_pattern.findall(max_material))

7

In [20]:
# Row(s) with the highest production cosine score.
# Series.max() is vectorised and skips NaN, unlike the builtin max(), which
# iterates element by element and is order-dependent when NaN is present.
df[df['P_score'] == df['P_score'].max()]

Unnamed: 0,Documents/Abstracts,Production,P_score,Material,M_score,Production/Material
2686,pulsed arf laser annealing in air and in hydro...,SLM or DMLS,0.75,Ceramic,0.5,SLM or DMLS / Ceramic


In [21]:
# Abstract of the document with the highest production score.
# The row is looked up from the scores (first row on ties) rather than by a
# hard-coded row number, so it stays correct if the data or scores change.
max_production = df.loc[df['P_score'].idxmax(), 'Documents/Abstracts']
max_production

'pulsed arf laser annealing in air and in hydrogen atmosphere improves the optical properties of zno nanostructured films independently on the ambient atmosphere laser annealing produces two major effects on the photoluminescence pl spectra first the efficiency of the exciton pl increases due to decrease of the number of nonradiative recombination centers second the intensity of the defectrelated orange band decreases because of the removing of excessive oxygen trapped into the films during deposition however annealing in the ambient air also increases the intensity of the green band related to oxygen vacancies we show that the combination of laser annealing and passivation of oxygen vacancies by hydrogen results in films free of defectrelated emission and keeps intact their nanostructural character'

### 3rd level of classification

* **Features** : Features of materials


* Fracture toughness or Work of fracture
* Tensile strength or ultimate tensile strength
* Yield strength
* Elastic modulus or Young’s modulus
* Strain at break or strain at fracture or fracture strain
* Weibull modulus

In [23]:
# Assign a mechanical-property (feature) label and its cosine score to every document.
feature_labels = (feature_1, feature_2, feature_3, feature_4, feature_5, feature_6)
feature, feature_cos_score = feature_labeling(sentences, cos, *feature_labels)

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




In [24]:
# Attach the third-level (feature) label and its score to the frame.
for column_name, column_values in (('Features', feature), ('F_score', feature_cos_score)):
    df[column_name] = column_values

In [25]:
# Number of documents assigned to each feature label.
feature_counts = df["Features"].value_counts()
feature_counts

Elastic modulus or Young’s modulus                          4052
Strain at break or strain at fracture or fracture strain    2333
Yield strength                                              1584
Weibull modulus                                             1315
Fracture toughness or Work of fracture                       502
Tensile strength or ultimate tensile strength                214
Name: Features, dtype: int64

In [26]:
# Combined three-level label, e.g. "SLM or DMLS / Metal / Yield strength".
separator = " / "
production_and_material = df["Production"] + separator + df["Material"]
df["Production/Material/Feature"] = production_and_material + separator + df["Features"]

In [27]:
# Preview the first rows now that the feature columns are present.
feature_preview = df.head()
feature_preview

Unnamed: 0,Documents/Abstracts,Production,P_score,Material,M_score,Production/Material,Features,F_score,Production/Material/Feature
0,we study the electronic states of giant single...,SLM or DMLS,0.51,Polymer,0.4,SLM or DMLS / Polymer,Elastic modulus or Young’s modulus,0.42,SLM or DMLS / Polymer / Elastic modulus or You...
1,the recursion and pathintegral methods are app...,FDM or FFF or EAM,0.55,Polymer,0.38,FDM or FFF or EAM / Polymer,Elastic modulus or Young’s modulus,0.49,FDM or FFF or EAM / Polymer / Elastic modulus ...
2,we analytically study phonon transmission and ...,FDM or FFF or EAM,0.58,Polymer,0.33,FDM or FFF or EAM / Polymer,Yield strength,0.43,FDM or FFF or EAM / Polymer / Yield strength
3,we study both analytically and numerically pho...,SLM or DMLS,0.59,Metal,0.43,SLM or DMLS / Metal,Strain at break or strain at fracture or fract...,0.46,SLM or DMLS / Metal / Strain at break or strai...
4,we present a model for thin film growth by par...,FDM or FFF or EAM,0.6,Polymer,0.38,FDM or FFF or EAM / Polymer,Yield strength,0.58,FDM or FFF or EAM / Polymer / Yield strength


In [28]:
# Number of documents per combined production / material / feature label.
hierarchy_counts = df["Production/Material/Feature"].value_counts()
hierarchy_counts

SLM or DMLS / Polymer / Elastic modulus or Young’s modulus                                1723
FDM or FFF or EAM / Polymer / Elastic modulus or Young’s modulus                          1439
SLM or DMLS / Metal / Strain at break or strain at fracture or fracture strain            1101
FDM or FFF or EAM / Polymer / Weibull modulus                                              626
SLM or DMLS / Metal / Yield strength                                                       595
SLM or DMLS / Polymer / Strain at break or strain at fracture or fracture strain           491
SLM or DMLS / Ceramic / Elastic modulus or Young’s modulus                                 420
SLM or DMLS / Polymer / Yield strength                                                     358
SLM or DMLS / Polymer / Weibull modulus                                                    286
SLM or DMLS / Ceramic / Strain at break or strain at fracture or fracture strain           282
FDM or FFF or EAM / Polymer / Yield strength      

In [29]:
# How often each distinct material score value occurs.
m_score_counts = df["M_score"].value_counts()
m_score_counts

0.36    500
0.39    473
0.35    462
0.38    440
0.41    418
       ... 
0.76      1
0.10      1
0.08      1
0.78      1
0.70      1
Name: M_score, Length: 70, dtype: int64

In [30]:
# Summary statistics of the material cosine scores.
m_score_summary = df['M_score'].describe()
m_score_summary

count    10000.000000
mean         0.384617
std          0.087080
min          0.080000
25%          0.330000
50%          0.380000
75%          0.440000
max          0.850000
Name: M_score, dtype: float64

In [31]:
# Number of distinct material score values (one bar each in the count plot).
m_score_distinct = df["M_score"].nunique()
m_score_distinct

70

In [32]:
# One single-character label per tick: "0", "0.425" and "0.85" spelled out
# character by character, separated by runs of 30 blanks (70 entries in total).
blank_run = [" "] * 30
X_axis = ["0", *blank_run, *"0.425", *blank_run, *"0.85"]

In [33]:
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker

# Distribution of the material cosine scores: one bar per distinct score value.
sns.set(style="darkgrid")
fig, ax = plt.subplots(figsize=(15, 10))
sns.countplot(x="M_score", data=df, ax=ax)

# Keep seaborn's own tick labels (the actual score values) and show only every
# LABEL_EVERY-th one so they stay legible. Labelling the ticks with
# FixedFormatter(X_axis) put one character on each tick, which only lined up when
# there were exactly 70 distinct scores and did not reflect the real bar values.
LABEL_EVERY = 5
for position, tick_label in enumerate(ax.get_xticklabels()):
    tick_label.set_visible(position % LABEL_EVERY == 0)

ax.set(
    title="Distribution of material cosine scores",
    xlabel="M_score (cosine similarity)",
    ylabel="Number of abstracts",
);

In [34]:
# Row(s) with the lowest material cosine score.
# Series.min() is vectorised and skips NaN, unlike the builtin min(), which
# iterates element by element and is order-dependent when NaN is present.
df[df['M_score'] == df['M_score'].min()]

Unnamed: 0,Documents/Abstracts,Production,P_score,Material,M_score,Production/Material,Features,F_score,Production/Material/Feature
7878,binding of atoms in a metallic chromium was in...,SLM or DMLS,0.11,Ceramic,0.08,SLM or DMLS / Ceramic,Weibull modulus,0.18,SLM or DMLS / Ceramic / Weibull modulus


In [35]:
# Abstract with the lowest material cosine score.
lowest_row = df['M_score'].idxmin()
df.loc[lowest_row, "Documents/Abstracts"]

'binding of atoms in a metallic chromium was investigated in a crfe alloy containing less than at fe enriched to in isotope using mossbauer spectroscopy the binding force was derived from the debye temperature td that in turn was calculated from the temperature dependence of the central shift of the mossbauer spectra recorded in the range of to k following a temperature dependence of the line width that shows a minimum at k two temperature intervals were considered a low temperature one lt ranging from to k and the td value of k or k and a high temperature one ht ranging from to k with the td value of or k depending on the fitting procedure the corresponding values of the harmonic force spring constant are nm and nm or nm and nm for the lt and ht respectively this means that in the ht range the binding force of atoms by the cr matrix is by a factor of stronger than that in the lt range this anomaly is possibly related with a different polarization of the spindensity waves in the lt and

In [36]:
# Abstract with the highest material cosine score.
highest_row = df['M_score'].idxmax()
df.loc[highest_row, "Documents/Abstracts"]

'pfo powders in hexagonal structure have been synthesized by solgel process using lead acetate glycerin and ferric acetylacetonate as the precursor ceramics were obtained by sintering the powders at c for hour distorted flaky hexahedron grains are frequently observed in the sem images of sintered ceramics large spontaneous polarization was observed in ceramic at room temperature exhibiting a clear ferroelectric hysteresis loop the remnant polarization of ceramic is estimated to be the distortion of hexahedron grains as well as the fe oxygen octahedron in its perovskitelike hexagonal unit cell is proposed to be the origin of polarization in ceramics meanwhile ceramics demonstrate strong ferromagnetism at room temperature simultaneous occurrence of large ferroelectricity and strong ferromagnetism in ceramics holds promise for its application in new generation of electronic devices as a practical multiferroic candidate in single phase'

In [37]:
# Summary statistics restricted to documents whose material score exceeds the threshold.
M_SCORE_THRESHOLD = 0.35
scores_above_threshold = df.loc[df['M_score'] > M_SCORE_THRESHOLD, 'M_score']
scores_above_threshold.describe()

count    6244.000000
mean        0.436957
std         0.060171
min         0.360000
25%         0.390000
50%         0.430000
75%         0.470000
max         0.850000
Name: M_score, dtype: float64

In [38]:
#df.to_csv("data labeling.csv")