In [1]:
import numpy as np
import pandas as pd
import pickle
import torch
import re
import spacy
import os

from tqdm.auto import tqdm, trange

from utils import embedding, get_embedding, text_cleaning, find_top_n , tfidf_sentences
from utils import production_labeling, material_labeling, feature_labeling 

from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings
from flair.data import Sentence
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the spaCy "en_core_web_sm" English pipeline once; the resulting `nlp`
# object is passed to `text_cleaning` in the cleaning cells below.
nlp = spacy.load("en_core_web_sm")

In [3]:
# One title per ".txt" file: the file name with only its extension removed.
# os.path.splitext strips just the trailing ".txt", so a title that itself
# contains a period (e.g. "... 0.5 mm ...") is kept whole instead of being cut
# at the first ".".
# NOTE: the next cell lists this directory a second time to read the texts;
# titles and texts line up only while both listings return the same order.
title_list = [
    os.path.splitext(file)[0]
    for file in os.listdir("Local pdf text files")
    if file.endswith(".txt")
]

In [4]:
# Read the full contents of every ".txt" file in the folder, in listing order.
pdf_list = []

for file in os.listdir("Local pdf text files"):
    if not file.endswith(".txt"):
        continue
    # Decode explicitly as UTF-8 rather than relying on the platform default.
    with open("Local pdf text files/"+file, mode="r", encoding="utf8") as f:
        pdf_list.append(f.read())

In [5]:
# Clean every raw text with the project's `text_cleaning` helper and the shared
# spaCy pipeline; the results are iterated token by token in the next cell.
pdfs = [text_cleaning(raw_text, nlp) for raw_text in pdf_list]

In [6]:
# Rebuild each cleaned document as one space-separated string of token lemmas;
# these strings are what the TF-IDF vectorizer is fitted on.
pdf_list_lemma = [
    ' '.join(token.lemma_ for token in text)
    for text in pdfs
]

In [7]:
# TF-IDF vocabulary for the document texts: English stop words removed, terms
# must occur in at least 2 documents (min_df) and in at most 90% of them (max_df).
# NOTE: `create_df_abstract` reads this global, and the name is re-bound to a
# title-fitted vectorizer further down, so the cells must be run in order.
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.9,
).fit(pdf_list_lemma)
vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.9, max_features=None,
                min_df=2, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [8]:
# Cosine-similarity module handed to the labeling helpers. dim=0 compares two
# tensors along their first dimension; eps guards against division by zero.
cos = torch.nn.CosineSimilarity(
    eps=1e-6,
    dim=0,
)

In [9]:
# Reference embeddings for the production-process phrases; passed to
# `production_labeling` as label_1..label_5.
label_1, label_2 = map(get_embedding, (
    'selective laser melting',
    'direct metal laser sintering',
))

label_3, label_4, label_5 = map(get_embedding, (
    'fused deposition modeling',
    'fused filament fabrication',
    'extrusion based additive manufacturing',
))

# Reference embeddings for the material classes; passed to `material_labeling`.
metal, ceramic, polymer = map(get_embedding, ('metal', 'ceramic', 'polymer'))

# Reference embeddings for mechanical-property phrases.
feature_1, feature_2, feature_3, feature_4, feature_5, feature_6 = map(get_embedding, (
    'fracture toughness',
    'tensile strength',
    'yield strength',
    'elastic modulus',
    'strain fracture break',
    'weibull modulus',
))



In [10]:
def create_df_abstract(documents, tfidf_n, tfidf_vectorizer=None):
    """Build the table of production/material labels derived from the document texts.

    Parameters
    ----------
    documents
        Texts forwarded to ``tfidf_sentences`` (the notebook passes the
        lemmatised strings in ``pdf_list_lemma``).
    tfidf_n
        Forwarded unchanged as the third argument of ``tfidf_sentences``.
    tfidf_vectorizer : optional
        Vectorizer handed to ``tfidf_sentences``. When omitted, the
        notebook-level ``vectorizer`` is used, as before. Pass it explicitly
        when re-running this function after ``vectorizer`` has been re-bound
        to the title-fitted instance, otherwise the wrong vocabulary is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``Titles``, ``Abstracts``, ``Production_abstract``,
        ``Production_abstract_score``, ``Material_abstract`` and
        ``Material_abstract_score``. ``Titles`` and ``Abstracts`` come from the
        notebook-level ``title_list`` and ``pdf_list``.
    """
    # Fall back to the notebook global only when no vectorizer was supplied.
    if tfidf_vectorizer is None:
        tfidf_vectorizer = vectorizer

    sentences = tfidf_sentences(documents, tfidf_vectorizer, tfidf_n)

    # Each labeling helper returns a (labels, scores) pair.
    abstract_production, abstract_production_cos_score = production_labeling(
        sentences, cos, label_1, label_2, label_3, label_4, label_5
    )
    abstract_material, abstract_material_cos_score = material_labeling(
        sentences, cos, metal, ceramic, polymer
    )

    d = {'Titles': title_list,
         'Abstracts': pdf_list,
         'Production_abstract': abstract_production,
         'Production_abstract_score': abstract_production_cos_score,
         'Material_abstract': abstract_material,
         'Material_abstract_score': abstract_material_cos_score}

    return pd.DataFrame(d)

In [11]:
# Label every document from its lemmatised text, with tfidf_n set to 3.
abstract_df = create_df_abstract(pdf_list_lemma, tfidf_n=3)

HBox(children=(IntProgress(value=0, max=99), HTML(value='')))




HBox(children=(IntProgress(value=0, max=99), HTML(value='')))




HBox(children=(IntProgress(value=0, max=99), HTML(value='')))




HBox(children=(IntProgress(value=0, max=99), HTML(value='')))




In [12]:
# Preview the first five rows of the text-based labels and their scores.
abstract_df.head(5)

Unnamed: 0,Titles,Abstracts,Production_abstract,Production_abstract_score,Material_abstract,Material_abstract_score
0,3D gel-printing of zirconia ceramic parts,3D gel-printing (3DGP) is a new printing metho...,SLM or DMLS,0.48,Ceramic,0.4
1,3D Printed Glass Surface Finish and Bulk Prope...,It is impossible to print glass directly from ...,SLM or DMLS,0.73,Metal,0.63
2,3D printing of ceramics A review,Along with extensive research on the three-dim...,SLM or DMLS,0.75,Ceramic,0.74
3,3D Printing of Continuous-Fiber Composites by ...,We have developed a method for the three-dimen...,FDM or FFF or EAM,0.7,Polymer,0.68
4,3D Printing of Transparent Glass,Traditional assembly line manufacturing is spe...,SLM or DMLS,0.72,Ceramic,0.61


In [13]:
# How many documents received each production label (text-based).
abstract_df["Production_abstract"].value_counts()

SLM or DMLS          51
FDM or FFF or EAM    48
Name: Production_abstract, dtype: int64

In [14]:
# How many documents received each material label (text-based).
abstract_df["Material_abstract"].value_counts()

Polymer    40
Metal      38
Ceramic    21
Name: Material_abstract, dtype: int64

TITLES — the same cleaning and labeling steps, applied to the paper titles instead of the PDF text

In [15]:
# Clean every title with the same `text_cleaning` helper and spaCy pipeline
# used for the document texts.
titles = [text_cleaning(raw_title, nlp) for raw_title in title_list]

In [16]:
# Spot-check: the raw title at index 80, before cleaning.
title_list[80]

'Selective laser melting of stainless steel and alumina composite Experimental and simulation studies on processing parameters, microstructure and mechanical properties'

In [17]:
# Rebuild each cleaned title as one space-separated string of token lemmas.
title_list_lemma = [
    ' '.join(token.lemma_ for token in text)
    for text in titles
]

In [18]:
# Spot-check: the same title (index 80) after `text_cleaning`.
titles[80]

selective laser melting of stainless steel and alumina composite experimental and simulation studies on processing parameters microstructure and mechanical properties

In [19]:
# Spot-check: the same title (index 80) after lemmatisation.
title_list_lemma[80]

'selective laser melting of stainless steel and alumina composite experimental and simulation study on process parameter microstructure and mechanical property'

In [20]:
# TF-IDF vocabulary for the titles, with the same settings as the text-based one.
# NOTE: this re-binds the global `vectorizer` that `create_df_abstract` also
# reads; re-running the abstract labeling after this cell would use the
# title-fitted vocabulary.
vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.9,
).fit(title_list_lemma)
vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.9, max_features=None,
                min_df=2, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [21]:
def create_df_title(documents, tfidf_n, tfidf_vectorizer=None):
    """Build the table of production/material labels derived from the paper titles.

    Parameters
    ----------
    documents
        Texts forwarded to ``tfidf_sentences`` (the notebook passes the
        lemmatised strings in ``title_list_lemma``).
    tfidf_n
        Forwarded unchanged as the third argument of ``tfidf_sentences``.
    tfidf_vectorizer : optional
        Vectorizer handed to ``tfidf_sentences``. When omitted, the
        notebook-level ``vectorizer`` is used, as before. Passing it explicitly
        removes the dependency on which fit last re-bound that global name.

    Returns
    -------
    pandas.DataFrame
        Columns ``Titles``, ``Abstracts``, ``Production_title``,
        ``Production_title_score``, ``Material_title`` and
        ``Material_title_score``. ``Titles`` and ``Abstracts`` come from the
        notebook-level ``title_list`` and ``pdf_list``.
    """
    # Fall back to the notebook global only when no vectorizer was supplied.
    if tfidf_vectorizer is None:
        tfidf_vectorizer = vectorizer

    sentences = tfidf_sentences(documents, tfidf_vectorizer, tfidf_n)

    # Each labeling helper returns a (labels, scores) pair.
    title_production, title_production_cos_score = production_labeling(
        sentences, cos, label_1, label_2, label_3, label_4, label_5
    )
    title_material, title_material_cos_score = material_labeling(
        sentences, cos, metal, ceramic, polymer
    )

    d = {'Titles': title_list,
         'Abstracts': pdf_list,
         'Production_title': title_production,
         'Production_title_score': title_production_cos_score,
         'Material_title': title_material,
         'Material_title_score': title_material_cos_score}

    return pd.DataFrame(d)

In [22]:
# Label every document from its lemmatised title, with tfidf_n set to 3.
title_df = create_df_title(title_list_lemma, tfidf_n=3)

HBox(children=(IntProgress(value=0, max=99), HTML(value='')))




HBox(children=(IntProgress(value=0, max=99), HTML(value='')))




HBox(children=(IntProgress(value=0, max=99), HTML(value='')))




HBox(children=(IntProgress(value=0, max=99), HTML(value='')))




In [23]:
# How many documents received each production label (title-based).
title_df["Production_title"].value_counts()

FDM or FFF or EAM    60
SLM or DMLS          39
Name: Production_title, dtype: int64

In [24]:
# How many documents received each material label (title-based).
title_df["Material_title"].value_counts()

Polymer    42
Metal      30
Ceramic    27
Name: Material_title, dtype: int64

In [25]:
# Preview the first rows of the title-based labels and their scores.
title_df.head()

Unnamed: 0,Titles,Abstracts,Production_title,Production_title_score,Material_title,Material_title_score
0,3D gel-printing of zirconia ceramic parts,3D gel-printing (3DGP) is a new printing metho...,FDM or FFF or EAM,0.74,Ceramic,0.74
1,3D Printed Glass Surface Finish and Bulk Prope...,It is impossible to print glass directly from ...,SLM or DMLS,0.7,Metal,0.57
2,3D printing of ceramics A review,Along with extensive research on the three-dim...,SLM or DMLS,0.68,Ceramic,0.71
3,3D Printing of Continuous-Fiber Composites by ...,We have developed a method for the three-dimen...,SLM or DMLS,0.68,Ceramic,0.63
4,3D Printing of Transparent Glass,Traditional assembly line manufacturing is spe...,SLM or DMLS,0.69,Ceramic,0.63


In [26]:
# Put the title-based label/score columns next to the text-based table.
# 'Titles' and 'Abstracts' already exist in abstract_df, so only the four
# title-specific columns are taken from title_df; rows are aligned on the index.
result = pd.concat(
    [
        abstract_df,
        title_df[[
            "Production_title",
            "Production_title_score",
            "Material_title",
            "Material_title_score",
        ]],
    ],
    sort=False,
    axis=1,
)

In [27]:
# Preview the combined table (text-based and title-based columns side by side).
result.head()

Unnamed: 0,Titles,Abstracts,Production_abstract,Production_abstract_score,Material_abstract,Material_abstract_score,Production_title,Production_title_score,Material_title,Material_title_score
0,3D gel-printing of zirconia ceramic parts,3D gel-printing (3DGP) is a new printing metho...,SLM or DMLS,0.48,Ceramic,0.4,FDM or FFF or EAM,0.74,Ceramic,0.74
1,3D Printed Glass Surface Finish and Bulk Prope...,It is impossible to print glass directly from ...,SLM or DMLS,0.73,Metal,0.63,SLM or DMLS,0.7,Metal,0.57
2,3D printing of ceramics A review,Along with extensive research on the three-dim...,SLM or DMLS,0.75,Ceramic,0.74,SLM or DMLS,0.68,Ceramic,0.71
3,3D Printing of Continuous-Fiber Composites by ...,We have developed a method for the three-dimen...,FDM or FFF or EAM,0.7,Polymer,0.68,SLM or DMLS,0.68,Ceramic,0.63
4,3D Printing of Transparent Glass,Traditional assembly line manufacturing is spe...,SLM or DMLS,0.72,Ceramic,0.61,SLM or DMLS,0.69,Ceramic,0.63


In [28]:
# Per-row average of the text-based and title-based similarity scores.
# NOTE(review): the scores are averaged even when the two labels disagree
# (see row 0 of the previews), so the mean is not tied to a single label.
result["Mean_product"] = (
    result["Production_abstract_score"]
    .add(result["Production_title_score"])
    .div(2)
)
result["Mean_material"] = (
    result["Material_abstract_score"]
    .add(result["Material_title_score"])
    .div(2)
)

In [29]:
# List the current column names before reordering them.
result.columns.tolist()

['Titles',
 'Abstracts',
 'Production_abstract',
 'Production_abstract_score',
 'Material_abstract',
 'Material_abstract_score',
 'Production_title',
 'Production_title_score',
 'Material_title',
 'Material_title_score',
 'Mean_product',
 'Mean_material']

In [30]:
# Reorder the columns so that each group (production, then material) reads
# label/score from text, label/score from title, then the mean.
column_order = [
    'Titles',
    'Abstracts',
    # production
    'Production_abstract',
    'Production_abstract_score',
    'Production_title',
    'Production_title_score',
    'Mean_product',
    # material
    'Material_abstract',
    'Material_abstract_score',
    'Material_title',
    'Material_title_score',
    'Mean_material',
]
result = result[column_order]

In [31]:
# Preview the first five rows of the final, reordered table.
result.head(5)

Unnamed: 0,Titles,Abstracts,Production_abstract,Production_abstract_score,Production_title,Production_title_score,Mean_product,Material_abstract,Material_abstract_score,Material_title,Material_title_score,Mean_material
0,3D gel-printing of zirconia ceramic parts,3D gel-printing (3DGP) is a new printing metho...,SLM or DMLS,0.48,FDM or FFF or EAM,0.74,0.61,Ceramic,0.4,Ceramic,0.74,0.57
1,3D Printed Glass Surface Finish and Bulk Prope...,It is impossible to print glass directly from ...,SLM or DMLS,0.73,SLM or DMLS,0.7,0.715,Metal,0.63,Metal,0.57,0.6
2,3D printing of ceramics A review,Along with extensive research on the three-dim...,SLM or DMLS,0.75,SLM or DMLS,0.68,0.715,Ceramic,0.74,Ceramic,0.71,0.725
3,3D Printing of Continuous-Fiber Composites by ...,We have developed a method for the three-dimen...,FDM or FFF or EAM,0.7,SLM or DMLS,0.68,0.69,Polymer,0.68,Ceramic,0.63,0.655
4,3D Printing of Transparent Glass,Traditional assembly line manufacturing is spe...,SLM or DMLS,0.72,SLM or DMLS,0.69,0.705,Ceramic,0.61,Ceramic,0.63,0.62
