In [1]:
import numpy as np
import pandas as pd
import pickle, os
import spacy
import fasttext

from tqdm.auto import tqdm, trange
from scipy.spatial.distance import cosine

from utils import embedding, get_embedding, text_cleaning, find_top_n , tfidf_sentences

from sklearn.feature_extraction.text import TfidfVectorizer

### Preprocessing

In [2]:
# Load spaCy's small English pipeline ("en_core_web_sm").
# The resulting `nlp` object is handed to text_cleaning() in the cleaning cell.
nlp = spacy.load("en_core_web_sm")

In [3]:
# Derive each document title from its .txt file name.
# os.path.splitext removes only the trailing extension; the previous
# split(".")[0] truncated any title at its first period.
# The directory is walked in os.listdir order, the same order the text-loading
# cell uses, so title_list[i] and pdf_list[i] refer to the same file.
title_list = [
    os.path.splitext(file_name)[0]
    for file_name in os.listdir("Local pdf text files")
    if file_name.endswith(".txt")
]

In [4]:
# Read the full contents of every .txt file in the corpus folder (UTF-8),
# one string per document, in os.listdir order.
pdf_list = []

for file_name in os.listdir("Local pdf text files"):
  if not file_name.endswith(".txt"):
    continue  # ignore anything that is not a text export
  with open("Local pdf text files/"+file_name,"r",encoding="utf8") as handle:
    pdf_list.append(handle.read())

In [5]:
# Step 1 - clean the raw texts and titles.
# text_cleaning() comes from utils; its result is iterated token by token below.
pdfs = [text_cleaning(raw_text, nlp) for raw_text in pdf_list]
titles = [text_cleaning(raw_title, nlp) for raw_title in title_list]

# Step 2 - lemmatization: rebuild each document as its space-joined lemmas.
pdfs = [' '.join(tok.lemma_ for tok in cleaned) for cleaned in pdfs]
titles = [' '.join(tok.lemma_ for tok in cleaned) for cleaned in titles]

# Step 3 - strip the "-PRON-" placeholder left in the lemma strings.
pdfs = [lemmas.replace("-PRON-","") for lemmas in pdfs]
titles = [lemmas.replace("-PRON-","") for lemmas in titles]

In [6]:
# Both vectorizers share one configuration: drop terms appearing in more than
# 90% of documents or in fewer than 2, and remove English stop words.
tfidf_params = dict(max_df=0.9, min_df=2, stop_words='english')

# TF-IDF vocabulary/weights fitted on the abstracts (fit returns the vectorizer)
vec_abs = TfidfVectorizer(**tfidf_params).fit(pdfs)

# TF-IDF vocabulary/weights fitted on the titles
vec_title = TfidfVectorizer(**tfidf_params)
vec_title.fit(titles)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.9, max_features=None,
                min_df=2, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

### Model Training

In [7]:
# File locations: pretrained fastText vectors and the corpus file fastText trains on
path = "crawl-300d-2M.vec"
train_data = 'training_data.txt'

# Write the cleaned documents to disk, joined with newlines
corpus_text = '\n'.join(pdfs)
with open(train_data,'w',encoding="utf-8") as corpus_file:
    corpus_file.write(corpus_text)

# Train an unsupervised fastText model on the corpus, starting from the
# pretrained vectors (dim=300 presumably matches the 300-d vectors named in `path`)
ft_model = fasttext.train_unsupervised(input=train_data, pretrainedVectors=path, dim=300)

In [8]:
# Saving the model
#ft_model.save_model("ft_model.bin")

# Loading the model
#ft_model = fasttext.load_model("ft_model.bin")

### Labels

In [9]:
# Build one short "sentence" per document from its strongest TF-IDF words,
# as returned by find_top_n (called with 10 for abstracts and 5 for titles).

tfidf_words_abstract = [
    " ".join(find_top_n(document, vec_abs, 10)) for document in tqdm(pdfs)
]

tfidf_words_title = [
    " ".join(find_top_n(document, vec_title, 5)) for document in tqdm(titles)
]

HBox(children=(IntProgress(value=0, max=99), HTML(value='')))




HBox(children=(IntProgress(value=0, max=99), HTML(value='')))




In [10]:
# Preview the keyword strings built for the first ten abstracts
tfidf_words_abstract[:10]

['ra µm sample prepare ceramic base rheological transverse zirconia vicker',
 'glass cool firing salt size layer object lead different binder',
 'printing ceramic late technical component present stock come highperformance advancement',
 'fiber thermoplastic composite continuous unidirectional reinforcement carbon printer filament reinforce',
 'resource glass costly major need material print handling recycle stock',
 'manufacturing additive information technology industry make signiﬁcant cad process work',
 'manufacturing additive customize area review impact time supply chain healthcare',
 'code zirconia extrusion employ fabrication microstructural layer property stabilize pressure',
 'fiberreinforce plastic carbon deposition composite fused model tensile process modeling',
 'material progress field technology design formation new development ceramic base']

In [11]:
# Preview the keyword strings built for the first ten titles
tfidf_words_title[:10]

['zirconia ceramic distribution electron evaluation',
 'finish surface printing glass print',
 'review print ceramic zirconia filament',
 'composite print zirconia distribution electron',
 'transparent glass print zirconia finish',
 'review manufacturing additive finish electron',
 'impact review manufacturing additive finish',
 'density zirconia characterization manufacturing additive',
 'tensile model effect fused deposition',
 'base material ceramic manufacturing additive']

In [12]:
# Reference sentence vectors for the production-process labels.
# pro_labeling() groups production_1/2 as 'SLM or DMLS' and
# production_3/4/5 as 'FDM or FFF or EAM'.
(production_1,
 production_2,
 production_3,
 production_4,
 production_5) = map(ft_model.get_sentence_vector, (
    'selective laser melting',
    'direct metal laser sintering',
    'fused deposition modeling',
    'fused filament fabrication',
    'extrusion based additive manufacturing',
))

# Reference sentence vectors for the material labels used by mat_labeling()
metal, ceramic, polymer = map(
    ft_model.get_sentence_vector, ('metal', 'ceramic', 'polymer')
)

# feature_1 = get_embedding('fracture toughness')
# feature_2 = get_embedding('tensile strength')
# feature_3 = get_embedding('yield strength')
# feature_4 = get_embedding('elastic modulus')
# feature_5 = get_embedding('strain fracture break')
# feature_6 = get_embedding('weibull modulus')

In [13]:
def pro_labeling(doc):
    """Assign a production-process label to every text in ``doc``.

    Each text is embedded with ``ft_model.get_sentence_vector`` and compared,
    via cosine similarity (1 - scipy cosine distance), against the five
    module-level reference vectors ``production_1`` .. ``production_5``.

    Parameters
    ----------
    doc : iterable of str
        Texts to label (here: the TF-IDF keyword strings).

    Returns
    -------
    (list of str, list of float)
        The label per text -- 'SLM or DMLS' when the best match is
        ``production_1`` or ``production_2``, otherwise 'FDM or FFF or EAM' --
        and the similarity of that best match.
    """
    references = (production_1, production_2, production_3,
                  production_4, production_5)

    production = []
    production_cos_score = []

    for text in tqdm(doc):
        vector = ft_model.get_sentence_vector(text)

        # similarity to each reference, in the same order as `references`
        scores = [1-cosine(vector, reference) for reference in references]
        best = max(scores)

        # positions 0 and 1 are the two laser-based reference phrases
        if scores.index(best) in (0, 1):
            label = 'SLM or DMLS'
        else:
            label = 'FDM or FFF or EAM'

        production.append(label)
        production_cos_score.append(best)

    return production , production_cos_score

In [14]:
def mat_labeling(doc):
    """Assign a material label to every text in ``doc``.

    Each text is embedded with ``ft_model.get_sentence_vector`` and compared,
    via cosine similarity (1 - scipy cosine distance), against the
    module-level reference vectors ``metal``, ``ceramic`` and ``polymer``.

    Parameters
    ----------
    doc : iterable of str
        Texts to label (here: the TF-IDF keyword strings).

    Returns
    -------
    (list of str, list of float)
        The label per text ('Metal', 'Ceramic' or 'Polymer', whichever
        reference is most similar) and the similarity of that best match.
    """
    # label names, positionally aligned with the reference vectors below
    names = ('Metal', 'Ceramic', 'Polymer')
    references = (metal, ceramic, polymer)

    material = []
    material_cos_score = []

    for text in tqdm(doc):
        vector = ft_model.get_sentence_vector(text)

        scores = [1-cosine(vector, reference) for reference in references]
        best = max(scores)

        material.append(names[scores.index(best)])
        material_cos_score.append(best)

    return material , material_cos_score

In [15]:
def create_df_abs():
    """Collect the abstract-based labeling results into a DataFrame.

    Reads the module-level lists ``title_list``, ``pdf_list``,
    ``abs_production``, ``abs_production_cos_score``, ``abs_material`` and
    ``abs_material_cos_score``; they must already exist when this is called.

    Returns
    -------
    pandas.DataFrame
        One row per document with the columns Titles, Abstracts,
        Abs_Production, Abs_Production_score, Abs_Material, Abs_Material_score.
    """
    columns = {}
    columns['Titles'] = title_list
    columns['Abstracts'] = pdf_list
    columns['Abs_Production'] = abs_production
    columns['Abs_Production_score'] = abs_production_cos_score
    columns['Abs_Material'] = abs_material
    columns['Abs_Material_score'] = abs_material_cos_score

    #df["Production/Material"] = df["Abs_Production"] + " / " + df["Abs_Material"]
    return pd.DataFrame(columns)

In [16]:
def create_df_title():
    """Collect the title-based labeling results into a DataFrame.

    Reads the module-level lists ``title_list``, ``pdf_list``,
    ``title_production``, ``title_production_cos_score``, ``title_material``
    and ``title_material_cos_score``; they must already exist when this is
    called.

    Returns
    -------
    pandas.DataFrame
        One row per document with the columns Titles, Abstracts,
        Title_Production, Title_Production_score, Title_Material,
        Title_Material_score.
    """
    columns = {}
    columns['Titles'] = title_list
    columns['Abstracts'] = pdf_list
    columns['Title_Production'] = title_production
    columns['Title_Production_score'] = title_production_cos_score
    columns['Title_Material'] = title_material
    columns['Title_Material_score'] = title_material_cos_score

    #df["Production/Material"] = df["Title_Production"] + " / " + df["Title_Material"]
    return pd.DataFrame(columns)

### Labeling with Abstracts and Titles

In [17]:
# Label every document from its abstract keyword string, then build the frame.
# create_df_abs() takes no arguments and reads the four abs_* lists as globals,
# so it must run after the two labeling calls.
abs_production , abs_production_cos_score = pro_labeling(tfidf_words_abstract)
abs_material , abs_material_cos_score = mat_labeling(tfidf_words_abstract)
df_abs = create_df_abs()

# Same procedure using the title keyword strings; create_df_title() likewise
# reads the four title_* lists as globals.
title_production , title_production_cos_score = pro_labeling(tfidf_words_title)
title_material , title_material_cos_score = mat_labeling(tfidf_words_title)
df_title = create_df_title()

HBox(children=(IntProgress(value=0, max=99), HTML(value='')))




HBox(children=(IntProgress(value=0, max=99), HTML(value='')))




HBox(children=(IntProgress(value=0, max=99), HTML(value='')))




HBox(children=(IntProgress(value=0, max=99), HTML(value='')))




In [18]:
# Distribution of material labels derived from the abstracts
df_abs["Abs_Material"].value_counts()

Ceramic    58
Polymer    25
Metal      16
Name: Abs_Material, dtype: int64

In [19]:
# Distribution of material labels derived from the titles
df_title["Title_Material"].value_counts()

Ceramic    63
Polymer    22
Metal      14
Name: Title_Material, dtype: int64

In [20]:
# First rows of the abstract-based labeling results
df_abs.head()

Unnamed: 0,Titles,Abstracts,Abs_Production,Abs_Production_score,Abs_Material,Abs_Material_score
0,3D gel-printing of zirconia ceramic parts,3D gel-printing (3DGP) is a new printing metho...,FDM or FFF or EAM,0.861564,Ceramic,0.74675
1,3D Printed Glass Surface Finish and Bulk Prope...,It is impossible to print glass directly from ...,FDM or FFF or EAM,0.832146,Ceramic,0.691518
2,3D printing of ceramics A review,Along with extensive research on the three-dim...,FDM or FFF or EAM,0.868318,Ceramic,0.779559
3,3D Printing of Continuous-Fiber Composites by ...,We have developed a method for the three-dimen...,FDM or FFF or EAM,0.86061,Polymer,0.730466
4,3D Printing of Transparent Glass,Traditional assembly line manufacturing is spe...,FDM or FFF or EAM,0.839091,Polymer,0.692131


In [21]:
# First rows of the title-based labeling results
df_title.head()

Unnamed: 0,Titles,Abstracts,Title_Production,Title_Production_score,Title_Material,Title_Material_score
0,3D gel-printing of zirconia ceramic parts,3D gel-printing (3DGP) is a new printing metho...,FDM or FFF or EAM,0.836623,Ceramic,0.791198
1,3D Printed Glass Surface Finish and Bulk Prope...,It is impossible to print glass directly from ...,FDM or FFF or EAM,0.81119,Ceramic,0.681929
2,3D printing of ceramics A review,Along with extensive research on the three-dim...,FDM or FFF or EAM,0.863827,Ceramic,0.838977
3,3D Printing of Continuous-Fiber Composites by ...,We have developed a method for the three-dimen...,FDM or FFF or EAM,0.837679,Ceramic,0.702488
4,3D Printing of Transparent Glass,Traditional assembly line manufacturing is spe...,FDM or FFF or EAM,0.834935,Ceramic,0.699891


In [22]:
# df_2 = df_2.drop('que',axis=1)

In [23]:
# def que(x):
#     if x['Material_title'] == x['Material_abstract']:
#         return True
#     else:
#         return False
    
# df_2['que'] = df_2.apply(que, axis=1)

In [24]:
# df_2['que'].apply(lambda x:"x['que']" ,if x =="Ceramic",axis=1)

In [25]:
# sum(df_2['que'])

In [26]:
# df_2 = df_2[['Titles','Abstracts','Material_abstract','Material_abstract_score','Material_title','Material_title_score','que']]

In [27]:
# df_2 = df_2.drop(['que'],axis=1)

In [28]:
# df_2.to_excel("recent_results.xlsx")

In [29]:
# def r(x):
#     return round(x,2)

# df_2["Material_title_score"] = df_2["Material_title_score"].apply(lambda x:round(x,2))
# df_2["Production_title_score"] = df_2["Production_title_score"].apply(lambda x:round(x,2))