In [27]:
import numpy as np
import pandas as pd
import pickle, os
import spacy
import fasttext

from tqdm.auto import tqdm, trange
from scipy.spatial.distance import cosine

from utils import embedding, get_embedding, text_cleaning, find_top_n , tfidf_sentences

from sklearn.feature_extraction.text import TfidfVectorizer

### Preprocessing

In [3]:
# Loading spacy nlp model
nlp = spacy.load("en_core_web_sm")

In [8]:
# Getting the titles from file names
title_list = []

for file in os.listdir("Local pdf text files"):
  if file.endswith(".txt"):
    title_list.append(file.split(".")[0])

In [9]:
# Getting the text from local text files
pdf_list = []
    
for file in os.listdir("Local pdf text files"):
  if file.endswith(".txt"):
    with open("Local pdf text files/"+file,"r",encoding="utf8") as f:
        pdf_list.append(f.read())

In [20]:
# cleaning text and titles
pdfs = list(map(lambda x: text_cleaning(x,nlp), pdf_list))
titles = list(map(lambda x: text_cleaning(x,nlp), title_list))

# lemmatization
pdfs = list(map(lambda x: ' '.join([token.lemma_ for token in x]), pdfs))
titles = list(map(lambda x: ' '.join([token.lemma_ for token in x]), titles))

# removing PRON after lemmatization
pdfs = list(map(lambda x: x.replace("-PRON-",""), pdfs))
titles = list(map(lambda x: x.replace("-PRON-",""), titles))

In [23]:
# Tfidf for abstracts
vec_abs = TfidfVectorizer(max_df=0.9,min_df=2,stop_words='english')
vec_abs.fit(pdfs)

# Tfidf for titles
vec_title = TfidfVectorizer(max_df=0.9,min_df=2,stop_words='english')
vec_title.fit(titles)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.9, max_features=None,
                min_df=2, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

### Model Installation

In [25]:
path = "crawl-300d-2M-subword.vec"

with open('training_data.txt','w',encoding="utf-8") as f:
    f.write('\n'.join(pdfs))

train_data = 'training_data.txt'

ft_model = fasttext.train_unsupervised(input=train_data, pretrainedVectors=path, dim=300)

In [26]:
# Saving the model
# ft_model.save_model("ft_model.bin")

# Loading the model
# ft_model = fasttext.load_model("ft_model.bin")

### Labels

In [29]:
# Creating sentences for each document with the strongest tfidf words

tfidf_words_abstract = []

for text in tqdm(pdfs):
    tfidf_words_abstract.append(" ".join(find_top_n(text,vec_abs,10)))
    
tfidf_words_title = []

for text in tqdm(titles):
    tfidf_words_title.append(" ".join(find_top_n(text,vec_title,5)))

HBox(children=(IntProgress(value=0, max=99), HTML(value='')))




HBox(children=(IntProgress(value=0, max=99), HTML(value='')))




In [30]:
production_1 = ft_model.get_sentence_vector('selective laser melting')
production_2 = ft_model.get_sentence_vector('direct metal laser sintering')

production_3 = ft_model.get_sentence_vector('fused deposition modeling')
production_4 = ft_model.get_sentence_vector('fused filament fabrication')
production_5 = ft_model.get_sentence_vector('extrusion based additive manufacturing')

metal = ft_model.get_sentence_vector('metal')
ceramic = ft_model.get_sentence_vector('ceramic')
polymer = ft_model.get_sentence_vector('polymer')

# feature_1 = get_embedding('fracture toughness')
# feature_2 = get_embedding('tensile strength')
# feature_3 = get_embedding('yield strength')
# feature_4 = get_embedding('elastic modulus')
# feature_5 = get_embedding('strain fracture break')
# feature_6 = get_embedding('weibull modulus')

In [42]:
def pro_labeling(doc):
    
    production = []
    production_cos_score = []
    
    for text in tqdm(doc):
        
        text = ft_model.get_sentence_vector(text)
        
        lbl= []

        lbl.append(1-cosine(text,production_1))
        lbl.append(1-cosine(text,production_2))
        lbl.append(1-cosine(text,production_3))
        lbl.append(1-cosine(text,production_4))
        lbl.append(1-cosine(text,production_5))

        if lbl.index(max(lbl))==0 or lbl.index(max(lbl))==1:
            production.append('SLM or DMLS')
            production_cos_score.append(max(lbl))
        else:
            production.append('FDM or FFF or EAM')
            production_cos_score.append(max(lbl))
    
    return production , production_cos_score

In [43]:
def mat_labeling(doc):

    material = []  
    material_cos_score = []

    for text in tqdm(doc):
        
        text = ft_model.get_sentence_vector(text)
        
        lbl= []

        lbl.append(1-cosine(text,metal))
        lbl.append(1-cosine(text,ceramic))
        lbl.append(1-cosine(text,polymer))

        if lbl.index(max(lbl))==0:
            material.append('Metal')
            material_cos_score.append(max(lbl))
        elif lbl.index(max(lbl))==1:
            material.append('Ceramic')
            material_cos_score.append(max(lbl))
        else:
            material.append('Polymer')
            material_cos_score.append(max(lbl))
    
    return material , material_cos_score

In [37]:
def create_df_abs():
    
    d = {'Titles':title_list,
         'Abstracts':pdf_list,
         'Abs_Production':abs_production,
         'Abs_Production_score':abs_production_cos_score,
         'Abs_Material':abs_material,
         'Abs_Material_score':abs_material_cos_score
        }
    
    df = pd.DataFrame(d)
    #df["Production/Material"] = df["Abs_Production"] + " / " + df["Abs_Material"]
    
    return df

In [45]:
def create_df_title():
    
    d = {'Titles':title_list,
         'Abstracts':pdf_list,
         'Title_Production':title_production,
         'Title_Production_score':title_production_cos_score,
         'Title_Material':title_material,
         'Title_Material_score':title_material_cos_score
        }
    
    df = pd.DataFrame(d)
    #df["Production/Material"] = df["Title_Production"] + " / " + df["Title_Material"]
    
    return df

### Labeling with abstracts and Titles

In [46]:
abs_production , abs_production_cos_score = pro_labeling(tfidf_words_abstract)
abs_material , abs_material_cos_score = mat_labeling(tfidf_words_abstract)
df_abs = create_df_abs()

title_production , title_production_cos_score = pro_labeling(tfidf_words_abstract)
title_material , title_material_cos_score = mat_labeling(tfidf_words_abstract)
df_title = create_df_title()

HBox(children=(IntProgress(value=0, max=99), HTML(value='')))




HBox(children=(IntProgress(value=0, max=99), HTML(value='')))




HBox(children=(IntProgress(value=0, max=99), HTML(value='')))




HBox(children=(IntProgress(value=0, max=99), HTML(value='')))




In [51]:
df_abs["Abs_Material"].value_counts()

KeyError: 'Abs_Material'

In [48]:
df_title["Title_Material"].value_counts()

Polymer    42
Ceramic    40
Metal      17
Name: Title_Material, dtype: int64

In [39]:
df_1 = create_df_abs()

NameError: name 'create_df_title' is not defined

In [50]:
df_title.head(20)

Unnamed: 0,Titles,Abstracts,Title_Production,Title_Production_score,Title_Material,Title_Material_score
0,3D gel-printing of zirconia ceramic parts,3D gel-printing (3DGP) is a new printing metho...,FDM or FFF or EAM,0.86123,Ceramic,0.74795
1,3D Printed Glass Surface Finish and Bulk Prope...,It is impossible to print glass directly from ...,FDM or FFF or EAM,0.835402,Polymer,0.6921
2,3D printing of ceramics A review,Along with extensive research on the three-dim...,FDM or FFF or EAM,0.876732,Ceramic,0.778091
3,3D Printing of Continuous-Fiber Composites by ...,We have developed a method for the three-dimen...,FDM or FFF or EAM,0.849378,Polymer,0.750116
4,3D Printing of Transparent Glass,Traditional assembly line manufacturing is spe...,FDM or FFF or EAM,0.843707,Polymer,0.687378
5,A Review of Additive Manufacturing,Additive manufacturing processes take the info...,FDM or FFF or EAM,0.921565,Ceramic,0.689773
6,Additive manufacturing and its societal impact...,"Thirty years into its development, additive ma...",FDM or FFF or EAM,0.895693,Polymer,0.691979
7,Additive manufacturing and mechanical characte...,Mechanical properties of additively manufactur...,FDM or FFF or EAM,0.895395,Ceramic,0.719301
8,Additive manufacturing of carbonfiber-reinforc...,Carbon fiber-reinforced plastic composites hav...,FDM or FFF or EAM,0.925232,Polymer,0.699238
9,Additive Manufacturing of Ceramic Based Materials,This paper offers a review of present achievem...,FDM or FFF or EAM,0.888089,Ceramic,0.754961


In [19]:
df_1.head(20)

Unnamed: 0,Titles,Abstracts,Material_title,Material_title_score
0,3D gel-printing of zirconia ceramic parts,3D gel-printing (3DGP) is a new printing metho...,Ceramic,0.748928
1,3D Printed Glass Surface Finish and Bulk Prope...,It is impossible to print glass directly from ...,Ceramic,0.686253
2,3D printing of ceramics A review,Along with extensive research on the three-dim...,Ceramic,0.778083
3,3D Printing of Continuous-Fiber Composites by ...,We have developed a method for the three-dimen...,Polymer,0.738717
4,3D Printing of Transparent Glass,Traditional assembly line manufacturing is spe...,Polymer,0.698467
5,A Review of Additive Manufacturing,Additive manufacturing processes take the info...,Ceramic,0.692488
6,Additive manufacturing and its societal impact...,"Thirty years into its development, additive ma...",Polymer,0.679706
7,Additive manufacturing and mechanical characte...,Mechanical properties of additively manufactur...,Ceramic,0.730757
8,Additive manufacturing of carbonfiber-reinforc...,Carbon fiber-reinforced plastic composites hav...,Polymer,0.698454
9,Additive Manufacturing of Ceramic Based Materials,This paper offers a review of present achievem...,Ceramic,0.759091


Title

In [20]:
material , material_cos_score = material_labeling(fast_tfidf_2)

HBox(children=(IntProgress(value=0, max=99), HTML(value='')))




In [21]:
df_2 = create_df_title()
df_2["Material_title"].value_counts()

Ceramic    38
Polymer    36
Metal      25
Name: Material_title, dtype: int64

In [22]:
df_2.head(20)

Unnamed: 0,Titles,Abstracts,Material_title,Material_title_score
0,3D gel-printing of zirconia ceramic parts,3D gel-printing (3DGP) is a new printing metho...,Ceramic,0.811637
1,3D Printed Glass Surface Finish and Bulk Prope...,It is impossible to print glass directly from ...,Polymer,0.669838
2,3D printing of ceramics A review,Along with extensive research on the three-dim...,Ceramic,0.774375
3,3D Printing of Continuous-Fiber Composites by ...,We have developed a method for the three-dimen...,Metal,0.668096
4,3D Printing of Transparent Glass,Traditional assembly line manufacturing is spe...,Polymer,0.684957
5,A Review of Additive Manufacturing,Additive manufacturing processes take the info...,Ceramic,0.663333
6,Additive manufacturing and its societal impact...,"Thirty years into its development, additive ma...",Ceramic,0.674304
7,Additive manufacturing and mechanical characte...,Mechanical properties of additively manufactur...,Polymer,0.677085
8,Additive manufacturing of carbonfiber-reinforc...,Carbon fiber-reinforced plastic composites hav...,Polymer,0.687831
9,Additive Manufacturing of Ceramic Based Materials,This paper offers a review of present achievem...,Ceramic,0.788638


In [23]:
df_2['Material_abstract'] = df_1['Material_title']
df_2['Material_abstract_score'] = df_1['Material_title_score']

In [24]:
df_2.head()

Unnamed: 0,Titles,Abstracts,Material_title,Material_title_score,Material_abstract,Material_abstract_score
0,3D gel-printing of zirconia ceramic parts,3D gel-printing (3DGP) is a new printing metho...,Ceramic,0.811637,Ceramic,0.748928
1,3D Printed Glass Surface Finish and Bulk Prope...,It is impossible to print glass directly from ...,Polymer,0.669838,Ceramic,0.686253
2,3D printing of ceramics A review,Along with extensive research on the three-dim...,Ceramic,0.774375,Ceramic,0.778083
3,3D Printing of Continuous-Fiber Composites by ...,We have developed a method for the three-dimen...,Metal,0.668096,Polymer,0.738717
4,3D Printing of Transparent Glass,Traditional assembly line manufacturing is spe...,Polymer,0.684957,Polymer,0.698467


In [28]:
df_2 = df_2.drop('que',axis=1)

In [29]:
def que(x):
    if x['Material_title'] == x['Material_abstract']:
        return True
    else:
        return False
    
df_2['que'] = df_2.apply(que, axis=1)

In [None]:
# df_2['que'].apply(lambda x:"x['que']" ,if x =="Ceramic",axis=1)

In [None]:
# sum(df_2['que'])

In [30]:
df_2.head()

Unnamed: 0,Titles,Abstracts,Material_title,Material_title_score,Material_abstract,Material_abstract_score,que
0,3D gel-printing of zirconia ceramic parts,3D gel-printing (3DGP) is a new printing metho...,Ceramic,0.811637,Ceramic,0.748928,True
1,3D Printed Glass Surface Finish and Bulk Prope...,It is impossible to print glass directly from ...,Polymer,0.669838,Ceramic,0.686253,False
2,3D printing of ceramics A review,Along with extensive research on the three-dim...,Ceramic,0.774375,Ceramic,0.778083,True
3,3D Printing of Continuous-Fiber Composites by ...,We have developed a method for the three-dimen...,Metal,0.668096,Polymer,0.738717,False
4,3D Printing of Transparent Glass,Traditional assembly line manufacturing is spe...,Polymer,0.684957,Polymer,0.698467,True


In [32]:
df_2 = df_2[['Titles','Abstracts','Material_abstract','Material_abstract_score','Material_title','Material_title_score','que']]

In [36]:
df_2.head(35)

Unnamed: 0,Titles,Abstracts,Material_abstract,Material_abstract_score,Material_title,Material_title_score,que
0,3D gel-printing of zirconia ceramic parts,3D gel-printing (3DGP) is a new printing metho...,Ceramic,0.748928,Ceramic,0.811637,True
1,3D Printed Glass Surface Finish and Bulk Prope...,It is impossible to print glass directly from ...,Ceramic,0.686253,Polymer,0.669838,False
2,3D printing of ceramics A review,Along with extensive research on the three-dim...,Ceramic,0.778083,Ceramic,0.774375,True
3,3D Printing of Continuous-Fiber Composites by ...,We have developed a method for the three-dimen...,Polymer,0.738717,Metal,0.668096,False
4,3D Printing of Transparent Glass,Traditional assembly line manufacturing is spe...,Polymer,0.698467,Polymer,0.684957,True
5,A Review of Additive Manufacturing,Additive manufacturing processes take the info...,Ceramic,0.692488,Ceramic,0.663333,True
6,Additive manufacturing and its societal impact...,"Thirty years into its development, additive ma...",Polymer,0.679706,Ceramic,0.674304,False
7,Additive manufacturing and mechanical characte...,Mechanical properties of additively manufactur...,Ceramic,0.730757,Polymer,0.677085,False
8,Additive manufacturing of carbonfiber-reinforc...,Carbon fiber-reinforced plastic composites hav...,Polymer,0.698454,Polymer,0.687831,True
9,Additive Manufacturing of Ceramic Based Materials,This paper offers a review of present achievem...,Ceramic,0.759091,Ceramic,0.788638,True


In [42]:
df_2 = df_2.drop(['que'],axis=1)

In [43]:
df_2

Unnamed: 0,Titles,Abstracts,Material_title,Material_title_score
0,3D gel-printing of zirconia ceramic parts,3D gel-printing (3DGP) is a new printing metho...,Ceramic,0.811637
1,3D Printed Glass Surface Finish and Bulk Prope...,It is impossible to print glass directly from ...,Polymer,0.669838
2,3D printing of ceramics A review,Along with extensive research on the three-dim...,Ceramic,0.774375
3,3D Printing of Continuous-Fiber Composites by ...,We have developed a method for the three-dimen...,Metal,0.668096
4,3D Printing of Transparent Glass,Traditional assembly line manufacturing is spe...,Polymer,0.684957
...,...,...,...,...
94,"The guide to glass 3D printing developments, m...",Purpose – This purpose of this paper is to pro...,Ceramic,0.694928
95,The impact of process parameters on mechanical...,Purpose – This study aims to quantify the ulti...,Polymer,0.700567
96,Thermo-mechanical Characterization of MetalPol...,New metal/polymer composite filaments for fuse...,Ceramic,0.724027
97,Three-dimensional printed strontium-containing...,The development of a new generation of biomate...,Ceramic,0.664514


In [51]:
def production_labeling(doc):
    
    production = []
    production_cos_score = []
    
    for text in tqdm(doc):
        
        text = ft_model.get_sentence_vector(text)
        
        lbl= []

        lbl.append(1-cosine(text,label_1))
        lbl.append(1-cosine(text,label_2))
        lbl.append(1-cosine(text,label_3))
        lbl.append(1-cosine(text,label_4))
        lbl.append(1-cosine(text,label_5))

        if lbl.index(max(lbl))==0 or lbl.index(max(lbl))==1:
            production.append('SLM or DMLS')
            production_cos_score.append(max(lbl))
        else:
            production.append('FDM or FFF or EAM')
            production_cos_score.append(max(lbl))
    
    # Finding cosine scores
    #production_cos_score = list(map(float,production_cos_score))
    #production_cos_score = list(map(lambda x: round(x,2), production_cos_score)) 
    
    return production , production_cos_score

In [52]:
production , production_cos_score = production_labeling(fast_tfidf_2)

HBox(children=(IntProgress(value=0, max=99), HTML(value='')))




In [53]:
df_2['Production_title'] = production

In [54]:
df_2['Production_title_score'] = production_cos_score

In [57]:
df_2.head(35)

Unnamed: 0,Titles,Abstracts,Material_title,Material_title_score,Production_title,Production_title_score
0,3D gel-printing of zirconia ceramic parts,3D gel-printing (3DGP) is a new printing metho...,Ceramic,0.811637,FDM or FFF or EAM,0.851035
1,3D Printed Glass Surface Finish and Bulk Prope...,It is impossible to print glass directly from ...,Polymer,0.669838,FDM or FFF or EAM,0.828517
2,3D printing of ceramics A review,Along with extensive research on the three-dim...,Ceramic,0.774375,FDM or FFF or EAM,0.824353
3,3D Printing of Continuous-Fiber Composites by ...,We have developed a method for the three-dimen...,Metal,0.668096,FDM or FFF or EAM,0.808019
4,3D Printing of Transparent Glass,Traditional assembly line manufacturing is spe...,Polymer,0.684957,FDM or FFF or EAM,0.828058
5,A Review of Additive Manufacturing,Additive manufacturing processes take the info...,Ceramic,0.663333,FDM or FFF or EAM,0.925872
6,Additive manufacturing and its societal impact...,"Thirty years into its development, additive ma...",Ceramic,0.674304,FDM or FFF or EAM,0.910155
7,Additive manufacturing and mechanical characte...,Mechanical properties of additively manufactur...,Polymer,0.677085,FDM or FFF or EAM,0.79803
8,Additive manufacturing of carbonfiber-reinforc...,Carbon fiber-reinforced plastic composites hav...,Polymer,0.687831,FDM or FFF or EAM,0.922813
9,Additive Manufacturing of Ceramic Based Materials,This paper offers a review of present achievem...,Ceramic,0.788638,FDM or FFF or EAM,0.954266


In [56]:
df_2['Production_title'].value_counts()

FDM or FFF or EAM    69
SLM or DMLS          30
Name: Production_title, dtype: int64

In [58]:
df_2.to_excel("recent_results.xlsx")

In [70]:
def r(x):
    return round(x,2)

df_2["Material_title_score"] = df_2["Material_title_score"].apply(lambda x:round(x,2))
df_2["Production_title_score"] = df_2["Production_title_score"].apply(lambda x:round(x,2))

In [71]:
df_2.head()

Unnamed: 0,Titles,Abstracts,Material_title,Material_title_score,Production_title,Production_title_score
0,3D gel-printing of zirconia ceramic parts,3D gel-printing (3DGP) is a new printing metho...,Ceramic,0.81,FDM or FFF or EAM,0.85
1,3D Printed Glass Surface Finish and Bulk Prope...,It is impossible to print glass directly from ...,Polymer,0.67,FDM or FFF or EAM,0.83
2,3D printing of ceramics A review,Along with extensive research on the three-dim...,Ceramic,0.77,FDM or FFF or EAM,0.82
3,3D Printing of Continuous-Fiber Composites by ...,We have developed a method for the three-dimen...,Metal,0.67,FDM or FFF or EAM,0.81
4,3D Printing of Transparent Glass,Traditional assembly line manufacturing is spe...,Polymer,0.68,FDM or FFF or EAM,0.83


In [72]:
df_2.to_excel("recent_results.xlsx")