In [1]:
import numpy as np
import pandas as pd
import pickle
import torch
from tqdm.auto import tqdm, trange

from utils import embedding, find_top_n , tfidf_sentences, production_labeling, material_labeling, feature_labeling, get_embedding 

from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentPoolEmbeddings
from flair.data import Sentence
from sklearn.feature_extraction.text import TfidfVectorizer

# Load `abstract_list` and `title_list` from pickle files in the working directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling —
# only run this on files that come from a trusted source.
with open('abstract_list.pkl', 'rb') as f:
    abstract_list = pickle.load(f)
with open('title_list.pkl', 'rb') as f:
    title_list = pickle.load(f)

In [2]:
# Fit a TF-IDF vocabulary on the abstracts. English stop words are removed;
# max_df=0.9 ignores terms found in more than 90% of the documents and
# min_df=2 ignores terms found in fewer than 2 documents.
vectorizer = TfidfVectorizer(max_df=0.9,min_df=2,stop_words='english')
vectorizer.fit(abstract_list)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.9, max_features=None,
                min_df=2, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [3]:
# Cosine-similarity module used to score a document embedding against a label embedding.
# NOTE(review): dim=0 presumably means both inputs are 1-D embedding vectors — confirm.
cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6)

In [4]:
# Create the flair embedding object (`embedding` is imported from utils).
# It is kept in the notebook-level name `e4`, which get_embedding() uses to
# embed flair Sentence objects.
e4 = embedding()

In [None]:
def get_embedding(text):
    """Return the flair embedding of ``text`` as a torch tensor.

    The text is wrapped in a flair ``Sentence``, embedded in place with the
    notebook-level ``e4`` embedder, and the sentence's ``embedding``
    attribute is returned.

    NOTE: this definition shadows the ``get_embedding`` imported from
    ``utils`` in the first cell.
    """
    doc = Sentence(text)
    e4.embed(doc)
    return doc.embedding

In [None]:
#tfidf = []

# Concatenating tf-idf words into sentences
#for text in abstract_list:
#    tfidf.append(" ".join(find_top_n(text,vectorizer,3)))

In [None]:
# embeddings for concatenated tf-idf sentences
#tfidf_sentences = [get_embedding(text) for text in tqdm(words_tfidf)]

### Finding the cosine similarities between documents and labels

* **Labels** : How materials are produced
* **Material** : The material type after production


* **SLM** : Selective laser melting
* **DMLS** : Direct metal laser sintering
* **FDM** : Fused deposition modeling 
* **FFF** : Fused filament fabrication
* **EAM** : Extrusion-based additive manufacturing


* Metal
* Polymer 
* Ceramic  

In [5]:
# Reference embeddings for the labels each document is compared against.
# Production-method labels: SLM and DMLS ...
label_1 = get_embedding('selective laser melting')
label_2 = get_embedding('direct metal laser sintering')

# ... then FDM, FFF and EAM.
label_3 = get_embedding('fused deposition modeling')
label_4 = get_embedding('fused filament fabrication')
label_5 = get_embedding('extrusion based additive manufacturing')

# Material-type labels.
metal = get_embedding('metal')
ceramic = get_embedding('ceramic')
polymer = get_embedding('polymer')

NameError: name 'get_embedding' is not defined

In [None]:
# Legacy in-notebook version of `tfidf_sentences`, kept for reference only.
# It used to be disabled with an opening triple quote that was never closed,
# which made this cell a SyntaxError under "Restart & Run All"; it is now
# commented out line by line instead. The `tfidf_sentences` actually used by
# this notebook is imported from utils and is called with (corpus, vectorizer).
#
# def tfidf_sentences(corpus):
#
#     words_tfidf = []
#
#     # Concatenating tf-idf words into sentences
#     for text in tqdm(corpus):
#         words_tfidf.append(" ".join(find_top_n(text,vectorizer,3)))
#
#     # embeddings for concatenated tf-idf sentences
#     sentences = [get_embedding(text) for text in tqdm(words_tfidf)]
#
#     return sentences

In [None]:
# new
# NOTE(review): the following cell makes the same tfidf_sentences call but
# binds the result to `words_tfidf`, and `sentences` is reassigned again after
# that. Confirm which cell is current and remove the stale one.
sentences = tfidf_sentences(abstract_list,vectorizer)

In [None]:
# tf-idf sentences for each document in the corpus.
# NOTE(review): the result is passed element by element to get_embedding()
# afterwards, so this presumably yields one text string per abstract —
# confirm against utils.tfidf_sentences.
words_tfidf = tfidf_sentences(abstract_list,vectorizer)

In [None]:
# Embed every concatenated tf-idf sentence; `sentences` ends up with one
# embedding per entry of `words_tfidf` (tqdm shows progress).
sentences = [get_embedding(text) for text in tqdm(words_tfidf)]

In [None]:
# Labeling for production type: score each document embedding against the five
# production-method label embeddings using `cos`. The call returns two values,
# unpacked here as the labels and their cosine scores.
production , production_cos_score = production_labeling(sentences,cos,label_1,label_2,label_3,label_4,label_5)

In [None]:
# Labeling for material type: same procedure against the metal / ceramic /
# polymer label embeddings; unpacked as the labels and their cosine scores.
material , material_cos_score = material_labeling(sentences,cos,metal,ceramic,polymer)

In [None]:
# Assemble one row per abstract: the text, its production label and score
# (P_score), and its material label and score (M_score).
d = {'Documents/Abstracts':abstract_list,'Production':production,'P_score':production_cos_score,'Material':material,'M_score':material_cos_score}
df = pd.DataFrame(d)

In [None]:
# Combined two-level label, formatted as "<Production> / <Material>".
df["Production/Material"] = df["Production"] + " / " + df["Material"]

In [None]:
# Preview the first 20 labelled rows.
df.head(20)

In [None]:
# Number of documents assigned to each production label.
df["Production"].value_counts()

In [None]:
# Number of documents assigned to each material label.
df["Material"].value_counts()

In [None]:
# Rows whose material score equals the highest material score. Series.max()
# is a vectorised reduction that skips NaN, unlike the builtin max().
df[df['M_score'] == df['M_score'].max()]

In [None]:
# Abstract with the highest material score. The row is located with idxmax()
# rather than a hard-coded position, so the cell keeps working when the corpus
# or the scores change; if several rows tie, idxmax() picks the first one.
max_material = df.loc[df['M_score'].idxmax(), 'Documents/Abstracts']
max_material

In [None]:
import re

# Count occurrences of the substring 'ceramic' in the top-scoring abstract
# (the author recorded 7 for the abstract inspected at the time).
len(re.findall('ceramic',max_material))

In [None]:
# Rows whose production score equals the highest production score. Series.max()
# is a vectorised reduction that skips NaN, unlike the builtin max().
df[df['P_score'] == df['P_score'].max()]

In [None]:
# Abstract with the highest production score. The row is located with idxmax()
# rather than a hard-coded position, so the cell keeps working when the corpus
# or the scores change; if several rows tie, idxmax() picks the first one.
max_production = df.loc[df['P_score'].idxmax(), 'Documents/Abstracts']
max_production

### 3rd level of classification

* **Features** : Features of materials


* Fracture toughness or Work of fracture
* Tensile strength or ultimate tensile strength
* Yield strength
* Elastic modulus or Young’s modulus
* Strain at break or strain at fracture or fracture strain
* Weibull modulus

In [None]:
# Reference embeddings for the six material-feature labels, in the same order
# as the label names used by feature_labeling().
feature_1 = get_embedding('fracture toughness')
feature_2 = get_embedding('tensile strength')
feature_3 = get_embedding('yield strength')
feature_4 = get_embedding('elastic modulus')
feature_5 = get_embedding('strain fracture break')
feature_6 = get_embedding('weibull modulus')

In [None]:
# Labeling for feature type

def feature_labeling(sentences, similarity=None, *label_embeddings):
    """Assign each document embedding the closest material-feature label.

    NOTE: this definition shadows the ``feature_labeling`` imported from
    ``utils`` in the first cell. It accepts both the original one-argument
    call and the explicit call used later in this notebook,
    ``feature_labeling(sentences, cos, feature_1, ..., feature_6)``.

    Parameters
    ----------
    sentences : iterable
        Document embeddings to label.
    similarity : callable, optional
        ``similarity(a, b)`` returning a score that ``float()`` can convert.
        Defaults to the notebook-level ``cos``.
    *label_embeddings
        Exactly six reference embeddings, ordered like the label names below.
        Defaults to the notebook-level ``feature_1`` ... ``feature_6``.

    Returns
    -------
    tuple of (list of str, list of float)
        The winning label for every document and its similarity score
        rounded to two decimals.

    Raises
    ------
    ValueError
        If label embeddings are supplied but there are not exactly six.
    """
    feature_names = (
        'Fracture toughness or Work of fracture',
        'Tensile strength or ultimate tensile strength',
        'Yield strength',
        'Elastic modulus or Young’s modulus',
        'Strain at break or strain at fracture or fracture strain',
        'Weibull modulus',
    )

    # Fall back to the notebook globals so the one-argument call keeps working.
    if similarity is None:
        similarity = cos
    if not label_embeddings:
        label_embeddings = (feature_1, feature_2, feature_3,
                            feature_4, feature_5, feature_6)
    if len(label_embeddings) != len(feature_names):
        raise ValueError(
            'expected %d label embeddings, got %d'
            % (len(feature_names), len(label_embeddings))
        )

    feature = []
    feature_cos_score = []

    for text in tqdm(sentences):
        scores = [float(similarity(text, emb)) for emb in label_embeddings]
        # Index of the first maximum, i.e. ties go to the earlier label.
        best = max(range(len(scores)), key=scores.__getitem__)
        feature.append(feature_names[best])
        feature_cos_score.append(round(scores[best], 2))

    return feature , feature_cos_score

In [None]:
# Label every document embedding with its closest feature and keep the score.
# NOTE(review): `cos` and the six feature embeddings are passed explicitly, so
# whichever feature_labeling is in scope must accept these eight arguments.
feature , feature_cos_score = feature_labeling(sentences,cos,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6)

In [None]:
# Attach the third-level (feature) label and its score (F_score) to the frame.
df['Features'] = feature
df['F_score'] = feature_cos_score

In [None]:
# Number of documents assigned to each feature label.
df["Features"].value_counts()

In [None]:
# Combined three-level label, formatted as "<Production> / <Material> / <Features>".
df["Production/Material/Feature"] = df["Production"] + " / " + df["Material"] + " / " + df["Features"]

In [None]:
# Preview the first rows with the feature columns added.
df.head()

In [None]:
# Number of documents per combined production / material / feature label.
df["Production/Material/Feature"].value_counts()

In [None]:
# How many documents share each distinct material-score value.
df["M_score"].value_counts()

In [None]:
# Summary statistics of the material scores.
df['M_score'].describe()

In [None]:
# Number of distinct material-score values (one bar each in the count plot below).
df["M_score"].nunique()

In [None]:
# Hand-built tick labels for the count plot: a list of 70 single characters
# spelling "0", "0.425" and "0.85" separated by runs of 30 spaces, so that
# consecutive ticks each display one character of the text.
# NOTE(review): this only lines up if the plot has about 70 ticks — confirm
# against df["M_score"].nunique() whenever the data changes.
X_axis = list("0" + 30 * " " + "0.425" + 30 * " "+ "0.85")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as ticker

# Count plot of the number of documents per distinct M_score value.
plt.figure(figsize=(15,10))
sns.set(style="darkgrid")

ax = sns.countplot(x="M_score", data=df)
# Replace the per-bar tick labels with the characters of X_axis, one per tick.
ax.xaxis.set_major_formatter(ticker.FixedFormatter(X_axis))
#ax.xaxis.set_major_locator(ticker.MultipleLocator(base=10))

In [None]:
# Rows whose material score equals the lowest material score. Series.min()
# is a vectorised reduction that skips NaN, unlike the builtin min().
df[df['M_score'] == df['M_score'].min()]

In [None]:
# Abstract with the lowest material cosine score (first such row on ties).
df["Documents/Abstracts"][df['M_score'].idxmin()]

In [None]:
# Abstract with the highest material cosine score (first such row on ties).
df["Documents/Abstracts"][df['M_score'].idxmax()]

In [None]:
# Summary statistics restricted to documents whose material score exceeds 0.35.
df[df['M_score']>0.35]['M_score'].describe()

In [None]:
# Write the labelled frame to "data labeling.csv" in the working directory
# (to_csv also writes the index column by default).
df.to_csv("data labeling.csv")