In [1]:
import numpy as np
import pandas as pd
import pickle, os
import spacy
import fasttext
import re
import pickle
import string

from tqdm.auto import tqdm, trange
from scipy.spatial.distance import cosine

from utils import text_cleaning, find_top_n, pro_labeling , mat_labeling

from sklearn.feature_extraction.text import TfidfVectorizer

# Load the cleaned abstracts and titles.
# These two pickles are written by the text-cleaning cell near the end of this
# notebook (after the arXiv download), so that cell must have been run at least
# once before this one can succeed.
# NOTE: pickle.load can execute arbitrary code - only load files produced by
# this notebook, never pickles from an untrusted source.
with open('abstract_list.pkl', 'rb') as f:
    abstracts = pickle.load(f)
with open('title_list.pkl', 'rb') as f:
    titles = pickle.load(f)

In [2]:
# Combined corpus for TF-IDF: every abstract first, followed by every title
pdf_title = [*abstracts, *titles]

In [3]:
len(pdf_title)

20000

In [6]:
# TF-IDF vocabulary fitted on the combined abstracts + titles corpus (pdf_title).
# Unigrams and bigrams are kept; English stop words are removed; max_df / min_df
# bound how common / how rare a term may be across documents.
vec = TfidfVectorizer(
    max_df=0.9,
    min_df=10,
    stop_words='english',
    ngram_range=(1, 2),
)
vec.fit(pdf_title)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.9, max_features=None,
                min_df=10, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [7]:
len(vec.vocabulary_)

10908

In [8]:
# Pre-trained word vectors handed to fastText as the starting point
# (presumably the 300-d Common Crawl vectors - confirm the file's origin).
path = "crawl-300d-2M.vec"

# Plain-text training corpus: one document per line
train_data = '1000.txt'

# Only the first 1000 entries of pdf_title are written out; pdf_title starts with
# the abstracts, so no titles take part in this training run.
with open(train_data, 'w', encoding="utf-8") as f:
    f.write('\n'.join(pdf_title[:1000]))

# dim=300 is chosen to agree with the 300-d pre-trained vectors
ft_model = fasttext.train_unsupervised(
    input=train_data,
    pretrainedVectors=path,
    dim=300,
)

In [None]:
# Saving the model
#ft_model.save_model("ft_model_arxiv.bin")

# Loading the model
#ft_model = fasttext.load_model("ft_model_arxiv.bin")

In [9]:
# Build one pseudo-sentence per document from the terms returned by
# find_top_n(document, vec, 10) (presumably its 10 strongest TF-IDF terms -
# see utils), joined with single spaces.
tfidf_words_abstract = [
    " ".join(find_top_n(abstract_text, vec, 10))
    for abstract_text in tqdm(abstracts)
]

tfidf_words_title = [
    " ".join(find_top_n(title_text, vec, 10))
    for title_text in tqdm(titles)
]

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




In [None]:
len(abstracts)

In [None]:
len(titles)

In [None]:
######################

In [None]:
tfidf_words_abstract[:5]

In [None]:
tfidf_words_title[:5]

In [None]:
# List the two-word (bigram) terms of the fitted TF-IDF vocabulary.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer `get_feature_names_out` and only fall back on older installations.
vocab_terms = (
    vec.get_feature_names_out()
    if hasattr(vec, "get_feature_names_out")
    else vec.get_feature_names()
)
# str(...) normalises numpy string scalars to plain Python strings for display
[str(term) for term in vocab_terms if len(term.split()) == 2]

In [10]:
# Sentence embeddings of the production-process label phrases.
# pro_1 / pro_2 are the laser-based processes, pro_3..pro_5 the extrusion-based ones.
pro_1, pro_2, pro_3, pro_4, pro_5 = (
    ft_model.get_sentence_vector(phrase)
    for phrase in (
        'selective laser melting',
        'direct metal laser sintering',
        'fused deposition modeling',
        'fused filament fabrication',
        'extrusion based additive manufacturing',
    )
)

# Sentence embeddings of the material-class label words
metal, ceramic, polymer = (
    ft_model.get_sentence_vector(word)
    for word in ('metal', 'ceramic', 'polymer')
)

# feature_1 = get_embedding('fracture toughness')
# feature_2 = get_embedding('tensile strength')
# feature_3 = get_embedding('yield strength')
# feature_4 = get_embedding('elastic modulus')
# feature_5 = get_embedding('strain fracture break')
# feature_6 = get_embedding('weibull modulus')

In [11]:
def create_df_abs(title_texts=None, abstract_texts=None, production=None,
                  production_score=None, material=None, material_score=None):
    """Build the table of abstract-based production/material labels.

    Every argument is optional. An argument left as ``None`` falls back to the
    corresponding notebook-level variable, so the historical call
    ``create_df_abs()`` behaves exactly as before.

    Parameters
    ----------
    title_texts : sequence, optional
        Document titles (default: global ``titles``).
    abstract_texts : sequence, optional
        Document abstracts (default: global ``abstracts``).
    production : sequence, optional
        Production label per document (default: global ``abs_production``).
    production_score : sequence, optional
        Score attached to each production label
        (default: global ``abs_production_cos_score``).
    material : sequence, optional
        Material label per document (default: global ``abs_material``).
    material_score : sequence, optional
        Score attached to each material label
        (default: global ``abs_material_cos_score``).

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``Titles``, ``Abstracts``, ``Abs_Production``,
        ``Abs_Production_score``, ``Abs_Material``, ``Abs_Material_score``.

    Raises
    ------
    ValueError
        Raised by pandas when the inputs do not all have the same length.
    """
    # Resolve omitted inputs from the notebook namespace (legacy behaviour)
    if title_texts is None:
        title_texts = titles
    if abstract_texts is None:
        abstract_texts = abstracts
    if production is None:
        production = abs_production
    if production_score is None:
        production_score = abs_production_cos_score
    if material is None:
        material = abs_material
    if material_score is None:
        material_score = abs_material_cos_score

    return pd.DataFrame({
        'Titles': title_texts,
        'Abstracts': abstract_texts,
        'Abs_Production': production,
        'Abs_Production_score': production_score,
        'Abs_Material': material,
        'Abs_Material_score': material_score,
    })

In [12]:
def create_df_title(title_texts=None, abstract_texts=None, production=None,
                    production_score=None, material=None, material_score=None):
    """Build the table of title-based production/material labels.

    Every argument is optional. An argument left as ``None`` falls back to the
    corresponding notebook-level variable, so the historical call
    ``create_df_title()`` behaves exactly as before.

    Parameters
    ----------
    title_texts : sequence, optional
        Document titles (default: global ``titles``).
    abstract_texts : sequence, optional
        Document abstracts (default: global ``abstracts``).
    production : sequence, optional
        Production label per document (default: global ``title_production``).
    production_score : sequence, optional
        Score attached to each production label
        (default: global ``title_production_cos_score``).
    material : sequence, optional
        Material label per document (default: global ``title_material``).
    material_score : sequence, optional
        Score attached to each material label
        (default: global ``title_material_cos_score``).

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``Titles``, ``Abstracts``, ``Title_Production``,
        ``Title_Production_score``, ``Title_Material``, ``Title_Material_score``.

    Raises
    ------
    ValueError
        Raised by pandas when the inputs do not all have the same length.
    """
    # Resolve omitted inputs from the notebook namespace (legacy behaviour)
    if title_texts is None:
        title_texts = titles
    if abstract_texts is None:
        abstract_texts = abstracts
    if production is None:
        production = title_production
    if production_score is None:
        production_score = title_production_cos_score
    if material is None:
        material = title_material
    if material_score is None:
        material_score = title_material_cos_score

    return pd.DataFrame({
        'Titles': title_texts,
        'Abstracts': abstract_texts,
        'Title_Production': production,
        'Title_Production_score': production_score,
        'Title_Material': material,
        'Title_Material_score': material_score,
    })

In [13]:
# Label every document twice: once from its abstract keywords, once from its
# title keywords. Statement order matters: create_df_abs() / create_df_title()
# read the globals assigned on the lines directly above them.
#
# NOTE(review): `cosine` is scipy.spatial.distance.cosine, which returns a cosine
# *distance* (1 - similarity), yet the results are stored as "*_cos_score".
# Confirm in utils that pro_labeling / mat_labeling pick the label with the
# SMALLEST value; if they pick the largest, the labels are inverted.

# Abstract-based production and material labels
abs_production , abs_production_cos_score = pro_labeling(tfidf_words_abstract,ft_model,cosine,pro_1,pro_2,pro_3,pro_4,pro_5)
abs_material , abs_material_cos_score = mat_labeling(tfidf_words_abstract,ft_model,cosine,metal,ceramic,polymer)
df_abs = create_df_abs()

# Title-based production and material labels
title_production , title_production_cos_score = pro_labeling(tfidf_words_title,ft_model,cosine,pro_1,pro_2,pro_3,pro_4,pro_5)
title_material , title_material_cos_score = mat_labeling(tfidf_words_title,ft_model,cosine,metal,ceramic,polymer)
df_title = create_df_title()

In [14]:
df_abs["Abs_Production"].value_counts()

FDM or FFF or EAM    9905
SLM or DMLS            95
Name: Abs_Production, dtype: int64

In [15]:
df_title["Title_Production"].value_counts()

FDM or FFF or EAM    9766
SLM or DMLS           234
Name: Title_Production, dtype: int64

In [16]:
df_abs["Abs_Material"].value_counts()

Ceramic    6051
Metal      3252
Polymer     697
Name: Abs_Material, dtype: int64

In [17]:
df_title["Title_Material"].value_counts()

Ceramic    6027
Metal      3289
Polymer     684
Name: Title_Material, dtype: int64

In [None]:
# Unigram-only TF-IDF fitted on the raw abstracts (earlier experiment).
# NOTE(review): `abstract_list` is only assigned by the arXiv download cell further
# down in this notebook, so this cell fails on a fresh top-to-bottom run.
vectorizer = TfidfVectorizer(max_df=0.9,min_df=2,stop_words='english')
vectorizer.fit(abstract_list)

In [None]:
# Cosine similarity module used by the labeling cells below.
# NOTE(review): `torch` is not imported anywhere in the visible notebook - add
# `import torch` to the import cell before running this.
cos = torch.nn.CosineSimilarity(dim=0, eps=1e-6)

### Finding the cosine similarities between documents and labels

* **Labels** : How materials are produced
* **Material** : The material type after production


* **SLM** : Selective laser melting
* **DMLS** : Direct metal laser sintering
* **FDM** : Fused deposition modeling 
* **FFF** : Fused filament fabrication
* **EAM** : Extrusion-based additive manufacturing


* Metal
* Polymer 
* Ceramic  

In [None]:
# Label embeddings to compare every document against.
# NOTE(review): `get_embedding` is not defined or imported anywhere in the visible
# notebook; this cell cannot run until it is provided.
# NOTE(review): this cell reassigns `metal`, `ceramic` and `polymer`, which an
# earlier cell already set from ft_model.get_sentence_vector - running it changes
# what any later re-run of the fastText labeling cell receives.

# Production processes: SLM, DMLS
label_1 = get_embedding('selective laser melting')
label_2 = get_embedding('direct metal laser sintering')

# Production processes: FDM, FFF, EAM
label_3 = get_embedding('fused deposition modeling')
label_4 = get_embedding('fused filament fabrication')
label_5 = get_embedding('extrusion based additive manufacturing')

# Material classes
metal = get_embedding('metal')
ceramic = get_embedding('ceramic')
polymer = get_embedding('polymer')

# Material features (third classification level)
feature_1 = get_embedding('fracture toughness')
feature_2 = get_embedding('tensile strength')
feature_3 = get_embedding('yield strength')
feature_4 = get_embedding('elastic modulus')
feature_5 = get_embedding('strain fracture break')
feature_6 = get_embedding('weibull modulus')

In [None]:
# Keyword "sentences" built from the abstracts with the unigram vectorizer.
# NOTE(review): `tfidf_sentences` is not defined or imported anywhere in the visible
# notebook; the meaning of the trailing 3 is presumably a top-n count - confirm.
sentences = tfidf_sentences(abstract_list,vectorizer,3)

In [None]:
# Labeling for production type
# NOTE(review): `production_labeling` is not defined or imported anywhere in the
# visible notebook (utils only provides `pro_labeling`) - confirm which is intended.
production , production_cos_score = production_labeling(sentences,cos,label_1,label_2,label_3,label_4,label_5)

In [None]:
# Labeling for material type
# NOTE(review): `material_labeling` is not defined or imported anywhere in the
# visible notebook (utils only provides `mat_labeling`) - confirm which is intended.
material , material_cos_score = material_labeling(sentences,cos,metal,ceramic,polymer)

In [None]:
# Gather the raw abstracts with their production / material labels and scores
d = {
    'Documents/Abstracts': abstract_list,
    'Production': production,
    'P_score': production_cos_score,
    'Material': material,
    'M_score': material_cos_score,
}
df = pd.DataFrame(data=d)

In [None]:
df["Production/Material"] = df["Production"] + " / " + df["Material"]

In [None]:
df.head(20)

In [None]:
df["Production"].value_counts()

In [None]:
df["Material"].value_counts()

In [None]:
# Rows holding the highest material score.
# Series.max() skips NaN, whereas the builtin max() over a Series can return NaN
# depending on element order (and iterates in pure Python).
df[df['M_score'] == df['M_score'].max()]

In [None]:
# Abstract of the document with the highest material score.
# The row is looked up with idxmax() instead of a hard-coded position (9955 was
# only valid for one particular run of the labeling); on ties the first row wins.
max_material = df.loc[df['M_score'].idxmax(), 'Documents/Abstracts']
max_material

In [None]:
import re

len(re.findall('ceramic',max_material)) # 7 ceramic in abstract with most score

In [None]:
# Rows holding the highest production score.
# Series.max() skips NaN, whereas the builtin max() over a Series can return NaN
# depending on element order (and iterates in pure Python).
df[df['P_score'] == df['P_score'].max()]

In [None]:
# Abstract of the document with the highest production score.
# The row is looked up with idxmax() instead of a hard-coded position (2686 was
# only valid for one particular run of the labeling); on ties the first row wins.
max_production = df.loc[df['P_score'].idxmax(), 'Documents/Abstracts']
max_production

### 3rd level of classification

* **Features** : Features of materials


* Fracture toughness or Work of fracture
* Tensile strength or ultimate tensile strength
* Yield strength
* Elastic modulus or Young’s modulus
* Strain at break or strain at fracture or fracture strain
* Weibull modulus

In [None]:
# Labeling for feature type
# NOTE(review): `feature_labeling` is not defined or imported anywhere in the
# visible notebook; this cell cannot run until it is provided.
feature , feature_cos_score = feature_labeling(sentences,cos,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6)

In [None]:
df['Features'] = feature
df['F_score'] = feature_cos_score

In [None]:
df["Features"].value_counts()

In [None]:
df["Production/Material/Feature"] = df["Production"] + " / " + df["Material"] + " / " + df["Features"]

In [None]:
df.head()

In [None]:
df["Production/Material/Feature"].value_counts()

In [None]:
df["M_score"].value_counts()

In [None]:
df['M_score'].describe()

In [None]:
df["M_score"].nunique()

In [None]:
# One tick label per character: "0", "0.425" and "0.85" separated by runs of
# 30 blanks, exploded into a list of single characters.
X_axis = list((" " * 30).join(["0", "0.425", "0.85"]))

In [None]:
# Count plot of the material scores (one bar per distinct M_score value).
# NOTE(review): `plt`, `sns` and `ticker` are not imported anywhere in the visible
# notebook - presumably matplotlib.pyplot, seaborn and matplotlib.ticker; confirm
# and add them to the import cell.
plt.figure(figsize=(15,10))
sns.set(style="darkgrid")

ax = sns.countplot(x="M_score", data=df)
# Replace the tick labels with the characters of X_axis, in tick order
ax.xaxis.set_major_formatter(ticker.FixedFormatter(X_axis))
#ax.xaxis.set_major_locator(ticker.MultipleLocator(base=10))

In [None]:
# Rows holding the lowest material score.
# Series.min() skips NaN, whereas the builtin min() over a Series can return NaN
# depending on element order (and iterates in pure Python).
df[df['M_score'] == df['M_score'].min()]

In [None]:
# Abstract with the lowest material score (single label-based lookup)
df.loc[df['M_score'].idxmin(), "Documents/Abstracts"]

In [None]:
# Abstract with the highest material score (single label-based lookup)
df.loc[df['M_score'].idxmax(), "Documents/Abstracts"]

In [None]:
# Summary statistics of the material scores above 0.35
df.loc[df['M_score'] > 0.35, 'M_score'].describe()

In [None]:
#df.to_csv("data labeling.csv")

In [None]:
"""
Queries the arXiv API and parses the returned feed.
Reads up to max_results papers under the Materials Science category (cond-mat.mtrl-sci) and fetches their abstracts and titles.
The cleaned results are dumped to abstract_list.pkl and title_list.pkl by a later cell.
"""
import urllib.request
import feedparser
import pickle
import spacy
import re



# Loading spacy nlp model (used by the text-cleaning cell that follows)
spacy_model = spacy.load("en_core_web_sm")

# Base api query url
base_url = 'http://export.arxiv.org/api/query?'

# Search parameters - limit the number of papers accessed
search_query = 'cat:cond-mat.mtrl-sci'
start = 0
max_results = 10000

query = 'search_query=%s&start=%i&max_results=%i' % (search_query,
                                                     start,
                                                     max_results)

# Map the arXiv Atom namespace onto the 'arxiv' prefix for feedparser.
# NOTE(review): `_FeedParserMixin` is a private feedparser attribute - confirm it
# still exists in the installed feedparser version.
feedparser._FeedParserMixin.namespaces['http://arxiv.org/schemas/atom'] = 'arxiv'

# Perform a GET request using the base_url and query. The context manager closes
# the HTTP connection as soon as the body has been read (it was left open before).
with urllib.request.urlopen(base_url + query) as http_response:
    response = http_response.read()

# parse the response using feedparser
feed = feedparser.parse(response)

# print out feed information
print('Feed title: %s' % feed.feed.title)

# print opensearch metadata
print('totalResults for this query: %s' % feed.feed.opensearch_totalresults)
print('itemsPerPage for this query: %s' % feed.feed.opensearch_itemsperpage)
print('startIndex for this query: %s' % feed.feed.opensearch_startindex)

# Collect the title and the abstract (Atom "summary") of every returned entry.
# Per-entry printing was dropped: with up to 10000 entries it produced tens of
# thousands of output lines without affecting the collected data.
title_list = [entry.title for entry in feed.entries]
abstract_list = [entry.summary for entry in feed.entries]

print('Collected %i titles and %i abstracts' % (len(title_list), len(abstract_list)))

In [None]:
type(abstract_list[0])

In [None]:
# Clean every raw abstract and title with text_cleaning and the spaCy model
abstracts = [text_cleaning(raw_abstract, spacy_model) for raw_abstract in abstract_list]
titles = [text_cleaning(raw_title, spacy_model) for raw_title in title_list]

# Persist the cleaned texts; the first cell of the notebook loads these files
with open('abstract_list.pkl', 'wb') as f:
    pickle.dump(abstracts, f)
with open('title_list.pkl', 'wb') as f:
    pickle.dump(titles, f)

In [None]:
abstract_list[33]

In [None]:
abstracts[33]

In [None]:
# Scratch check: a space-delimited "mm" token is removed, while "mmm" is kept
data = re.compile(r" mm ").sub(" ", " symmetry mm mmm ")
data