In [1]:
import numpy as np
import pandas as pd
import pickle, os

import re
import string

from tqdm.auto import tqdm, trange
from scipy.spatial.distance import cosine

from utils import find_top_n, pro_labeling ,mat_labeling
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
import fasttext
from gensim.models import FastText

import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk import WordPunctTokenizer

stemmer = WordNetLemmatizer()
en_stop = set(nltk.corpus.stopwords.words('english'))

In [3]:
with open('abstract_list.pkl', 'rb') as f:
    abstract_old = pickle.load(f)
with open('title_list.pkl', 'rb') as f:
    title_old = pickle.load(f)

In [4]:
with open('abstract_40k.pkl', 'rb') as f:
    abstracts_40 = pickle.load(f)
with open('title_40k.pkl', 'rb') as f:
    titles_40 = pickle.load(f)

with open('abstract_40ve50.pkl', 'rb') as f:
    abstracts_50 = pickle.load(f)
with open('title_40ve50.pkl', 'rb') as f:
    titles_50 = pickle.load(f)

abstracts_50k = abstracts_40 + abstracts_50
titles_50k = titles_40 + titles_50

len(abstracts_50k)

50000

In [5]:
def text_cleaning(data):

    data = data.lower()
    data = re.sub('\$(.*?)\$',' ',data)
    data = re.sub('\[*?\]', ' ', data)
    data = re.sub(f'[{re.escape(string.punctuation)}]', ' ', data)
    data = re.sub('\w*\d\w*', ' ', data)
    data = data.replace("\n"," ")

    
    data = re.sub('[^a-zA-Z0-9 -]','',data)
    data = re.sub(r"\b[a-zA-Z]\b", "", data)
    data = re.sub(r" mm ", " ", data)

    data = data.split()
    tokens = [stemmer.lemmatize(word) for word in data]
    tokens = [word for word in tokens if word not in en_stop]
    #tokens = [word for word in tokens if len(word) > 3]
    return tokens

In [6]:
abstracts_50k[0]

'We study the electronic states of giant single-shell and the recently\ndiscovered nested multi-shell carbon fullerenes within the tight-binding\napproximation. We use two different approaches, one based on iterations and the\nother on symmetry, to obtain the $\\pi$-state energy spectra of large fullerene\ncages: $C_{240}$, $C_{540}$, $C_{960}$, $C_{1500}$, $C_{2160}$ and $C_{2940}$.\nOur iteration technique reduces the dimensionality of the problem by more than\none order of magnitude (factors of $\\sim 12$ and $20$), while the\nsymmetry-based approach reduces it by a factor of $10$. We also find formulae\nfor the highest occupied and lowest unoccupied molecular orbital (HOMO and\nLUMO) energies of $C_{60{\\cdot}n^{2}}$ fullerenes as a function of $n$,\ndemonstrating a tendency towards metallic regime for increasing $n$. For\nmulti-shell fullerenes, we analytically obtain the eigenvalues of the\nintershell interaction.'

In [7]:
sent_tokenize(abstracts_50k[0])

['We study the electronic states of giant single-shell and the recently\ndiscovered nested multi-shell carbon fullerenes within the tight-binding\napproximation.',
 'We use two different approaches, one based on iterations and the\nother on symmetry, to obtain the $\\pi$-state energy spectra of large fullerene\ncages: $C_{240}$, $C_{540}$, $C_{960}$, $C_{1500}$, $C_{2160}$ and $C_{2940}$.',
 'Our iteration technique reduces the dimensionality of the problem by more than\none order of magnitude (factors of $\\sim 12$ and $20$), while the\nsymmetry-based approach reduces it by a factor of $10$.',
 'We also find formulae\nfor the highest occupied and lowest unoccupied molecular orbital (HOMO and\nLUMO) energies of $C_{60{\\cdot}n^{2}}$ fullerenes as a function of $n$,\ndemonstrating a tendency towards metallic regime for increasing $n$.',
 'For\nmulti-shell fullerenes, we analytically obtain the eigenvalues of the\nintershell interaction.']

In [None]:
# [text_cleaning(sentence) for sentence in sent_tokenize(abstracts_50k[0]) if sentence.strip() !='']

In [8]:
corpus = abstracts_50k + titles_50k

In [9]:
len(corpus)

100000

In [10]:
corpus = [sentences for text in corpus for sentences in sent_tokenize(text)]

In [11]:
len(corpus)

331319

In [83]:
corpus[0]

'We study the electronic states of giant single-shell and the recently\ndiscovered nested multi-shell carbon fullerenes within the tight-binding\napproximation.'

In [13]:
final_corpus = [text_cleaning(sentences) for sentences in corpus if sentences.strip() !='']

In [84]:
final_corpus[0]

['study',
 'electronic',
 'state',
 'giant',
 'single',
 'shell',
 'recently',
 'discovered',
 'nested',
 'multi',
 'shell',
 'carbon',
 'fullerene',
 'within',
 'tight',
 'binding',
 'approximation']

In [None]:
# abstracts = list(map(lambda x: text_cleaning(x), abstracts_50k))
# titles = list(map(lambda x: text_cleaning(x), titles_50k))

In [None]:
# # Getting the titles from file names
# title_list = []

# for file in os.listdir("Local pdf text files"):
#   if file.endswith(".txt"):
#     title_list.append(file.split(".")[0])
    
# # Getting the text from local text files
# pdf_list = []
    
# for file in os.listdir("Local pdf text files"):
#   if file.endswith(".txt"):
#     with open("Local pdf text files/"+file,"r",encoding="utf8") as f:
#         pdf_list.append(f.read())

In [None]:
# pdfs = list(map(lambda x: text_cleaning(x), pdf_list))
# titles = list(map(lambda x: text_cleaning(x), title_list))

In [None]:
# pdfs[1]

In [None]:
# ft_model = FastText(abstracts, size=100, window=5, min_count=5, workers=4,sg=1)

In [None]:
# %%time
# ft_model = FastText(abstracts, size=100, window=40, min_count=5, sample=1e-2, sg=1,iter=100)

In [15]:
%%time
ft_model = FastText(final_corpus, size=100, window=10, min_count=5,workers=4, sg=1)

Wall time: 1min 53s


In [16]:
ft_model.save("gensim_model_new_2.bin")
# ft_model = FastText.load('gensim_model_new.bin')

In [17]:
ft_model.wv.most_similar("metal") 

[('nonmetal', 0.7858569622039795),
 ('metalization', 0.721686065196991),
 ('metallisation', 0.7186127305030823),
 ('metalic', 0.7125375270843506),
 ('metallo', 0.7095639109611511),
 ('atypical', 0.7062879800796509),
 ('nonmetallic', 0.6977999210357666),
 ('petal', 0.6844149827957153),
 ('transtion', 0.6705237627029419),
 ('noble', 0.6630264520645142)]

In [81]:
ft_model.wv.most_similar("polymer")

[('biopolymer', 0.9137256741523743),
 ('polymersomes', 0.8813596963882446),
 ('homopolymer', 0.8753225803375244),
 ('biopolymers', 0.8731505870819092),
 ('polymeric', 0.8603378534317017),
 ('azopolymers', 0.8601329922676086),
 ('tribopolymer', 0.8541461229324341),
 ('copolymer', 0.8442385196685791),
 ('fluoropolymer', 0.8402372598648071),
 ('polymethyl', 0.8362335562705994)]

In [82]:
ft_model.wv.most_similar("ceramic")

[('ceram', 0.8762143850326538),
 ('piezoceramic', 0.8336099982261658),
 ('ccto', 0.7820297479629517),
 ('piezoceramics', 0.7733694911003113),
 ('bczt', 0.7497743368148804),
 ('nzfo', 0.7245482802391052),
 ('pztfw', 0.7197600603103638),
 ('nbt', 0.7049596905708313),
 ('plzt', 0.7035001516342163),
 ('xlax', 0.6985239386558533)]

In [21]:
" ".join(final_corpus[0])

'study electronic state giant single shell recently discovered nested multi shell carbon fullerene within tight binding approximation'

In [22]:
ft_model.wv.similarity(" ".join(final_corpus[0]),"metal")

0.62559295

In [23]:
ft_model.wv.similarity(" ".join(final_corpus[0]),"ceramic")

0.34308237

In [24]:
ft_model.wv.similarity(" ".join(final_corpus[0]),"polymer")

0.46300992

In [25]:
pro_1 = ft_model.wv.get_vector('selective laser melting')
pro_2 = ft_model.wv.get_vector('direct metal laser sintering')

pro_3 = ft_model.wv.get_vector('fused deposition modeling')
pro_4 = ft_model.wv.get_vector('fused filament fabrication')
pro_5 = ft_model.wv.get_vector('extrusion based additive manufacturing')

metal = ft_model.wv.get_vector('metal')
ceramic = ft_model.wv.get_vector('ceramic')
polymer = ft_model.wv.get_vector('polymer')

In [74]:
def create_df_abs():
    
    d_abs = {'Titles':title_old,
         'Abstracts':abstract_old,
         'Abs_Production':abs_production,
         'Abs_Production_score':abs_production_cos_score,
         'Abs_Material':abs_material,
         'Abs_Material_score':abs_material_cos_score
        }
    
    df = pd.DataFrame(d_abs)
    #df["Production/Material"] = df["Abs_Production"] + " / " + df["Abs_Material"]
    
    return df

In [75]:
def create_df_title():
    
    d_title = {'Titles':title_old, #title_old
         'Abstracts':abstract_old, #abstract_old
         'Title_Production':title_production,
         'Title_Production_score':title_production_cos_score,
         'Title_Material':title_material,
         'Title_Material_score':title_material_cos_score
        }
    
    df = pd.DataFrame(d_title)
    #df["Production/Material"] = df["Title_Production"] + " / " + df["Title_Material"]
    
    return df

In [28]:
def pro_labeling(doc,ft_model,cosine,pro_1,pro_2,pro_3,pro_4,pro_5):
    
    production = []
    production_cos_score = []
    
    for text in doc:
        
        text = ft_model.wv.get_vector(text)
        
        lbl= []

        lbl.append(1-cosine(text,pro_1))
        lbl.append(1-cosine(text,pro_2))
        lbl.append(1-cosine(text,pro_3))
        lbl.append(1-cosine(text,pro_4))
        lbl.append(1-cosine(text,pro_5))

        if lbl.index(max(lbl))==0 or lbl.index(max(lbl))==1:
            production.append('SLM or DMLS')
            production_cos_score.append(max(lbl))
        else:
            production.append('FDM or FFF or EAM')
            production_cos_score.append(max(lbl))
    
    return production , production_cos_score

In [29]:
def mat_labeling(doc,ft_model,cosine,metal,ceramic,polymer):

    material = []  
    material_cos_score = []

    for text in doc:
        
        text = ft_model.wv.get_vector(text)
        
        lbl= []

        lbl.append(1-cosine(text,metal))
        lbl.append(1-cosine(text,ceramic))
        lbl.append(1-cosine(text,polymer))

        if lbl.index(max(lbl))==0:
            material.append('Metal')
            material_cos_score.append(max(lbl))
        elif lbl.index(max(lbl))==1:
            material.append('Ceramic')
            material_cos_score.append(max(lbl))
        else:
            material.append('Polymer')
            material_cos_score.append(max(lbl))
    
    return material , material_cos_score

In [44]:
abstracts = list(map(lambda x: text_cleaning(x), abstracts_50k))
titles = list(map(lambda x: text_cleaning(x), titles_50k))

In [45]:
abstracts

[['study',
  'electronic',
  'state',
  'giant',
  'single',
  'shell',
  'recently',
  'discovered',
  'nested',
  'multi',
  'shell',
  'carbon',
  'fullerene',
  'within',
  'tight',
  'binding',
  'approximation',
  'use',
  'two',
  'different',
  'approach',
  'one',
  'based',
  'iteration',
  'symmetry',
  'obtain',
  'state',
  'energy',
  'spectrum',
  'large',
  'fullerene',
  'cage',
  'iteration',
  'technique',
  'reduces',
  'dimensionality',
  'problem',
  'one',
  'order',
  'magnitude',
  'factor',
  'symmetry',
  'based',
  'approach',
  'reduces',
  'factor',
  'also',
  'find',
  'formula',
  'highest',
  'occupied',
  'lowest',
  'unoccupied',
  'molecular',
  'orbital',
  'homo',
  'lumo',
  'energy',
  'fullerene',
  'function',
  'demonstrating',
  'tendency',
  'towards',
  'metallic',
  'regime',
  'increasing',
  'multi',
  'shell',
  'fullerene',
  'analytically',
  'obtain',
  'eigenvalue',
  'intershell',
  'interaction'],
 ['recursion',
  'path',
  'inte

In [46]:
abstracts_sen = list(map(lambda x: " ".join(x), abstracts))
titles_sen = list(map(lambda x: " ".join(x), titles))

In [85]:
abstracts_sen[0]

'study electronic state giant single shell recently discovered nested multi shell carbon fullerene within tight binding approximation use two different approach one based iteration symmetry obtain state energy spectrum large fullerene cage iteration technique reduces dimensionality problem one order magnitude factor symmetry based approach reduces factor also find formula highest occupied lowest unoccupied molecular orbital homo lumo energy fullerene function demonstrating tendency towards metallic regime increasing multi shell fullerene analytically obtain eigenvalue intershell interaction'

In [57]:
abs_production , abs_production_cos_score = pro_labeling(abstracts_sen,ft_model,cosine,pro_1,pro_2,pro_3,pro_4,pro_5)
abs_material , abs_material_cos_score = mat_labeling(abstracts_sen,ft_model,cosine,metal,ceramic,polymer)
df_abs = create_df_abs()

title_production , title_production_cos_score = pro_labeling(titles_sen,ft_model,cosine,pro_1,pro_2,pro_3,pro_4,pro_5)
title_material , title_material_cos_score = mat_labeling(titles_sen,ft_model,cosine,metal,ceramic,polymer)
df_title = create_df_title()

In [58]:
df_abs["Abs_Production"].value_counts()

SLM or DMLS          42772
FDM or FFF or EAM     7228
Name: Abs_Production, dtype: int64

In [59]:
df_title["Title_Production"].value_counts()

SLM or DMLS          38150
FDM or FFF or EAM    11850
Name: Title_Production, dtype: int64

In [60]:
df_abs["Abs_Material"].value_counts()

Metal      47149
Ceramic     1570
Polymer     1281
Name: Abs_Material, dtype: int64

In [61]:
df_title["Title_Material"].value_counts()

Metal      39980
Ceramic     5768
Polymer     4252
Name: Title_Material, dtype: int64

In [62]:
import seaborn as sns
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [None]:
def visu_pro(doc):
    
    X= []

    for text in doc:
        X.append(ft_model.wv.get_vector(text))

    pca = PCA(n_components=2)
    X_reduced = pca.fit_transform(X)

    pca_df = pd.DataFrame(index=title_old[:10000],columns=['First Component','Second Component'],data=X_reduced)
    
    if doc == abstracts_sen[:10000]:
        pca_df["Labels"] = df_abs["Abs_Production"].values
    else :
        pca_df["Labels"] = df_title["Title_Production"].values

    fig, ax = plt.subplots(figsize=(10,10))
    ax = sns.scatterplot(x=pca_df['First Component'],y=pca_df['Second Component'],hue=pca_df['Labels'],palette ='Set1')
    
    return pca_df

In [None]:
a_pro = visu_pro(abstracts_sen[:10000])

In [None]:
t_pro = visu_pro(titles_sen[:10000])

In [None]:
def visu_mat(doc):
    
    X= []

    for text in doc:
        X.append(ft_model.wv.get_vector(text))

    pca = PCA(n_components=2)
    X_reduced = pca.fit_transform(X)

    pca_df = pd.DataFrame(index=title_old[:10000],columns=['First Component','Second Component'],data=X_reduced)
    
    if doc == abstracts_sen[:10000]:
        pca_df["Labels"] = df_abs["Abs_Material"].values
    else :
        pca_df["Labels"] = df_title["Title_Material"].values

    fig, ax = plt.subplots(figsize=(10,10))
    ax = sns.scatterplot(x=pca_df['First Component'],y=pca_df['Second Component'],hue=pca_df['Labels'],palette ='Set1')
    
    return pca_df

In [None]:
a_mat = visu_mat(abstracts_sen[:10000])

In [None]:
t_mat = visu_mat(titles_sen[:10000])

In [None]:
df_abs.Abstracts[17]

In [None]:
df_abs.head(20) #new

In [None]:
df_abs.head()

In [None]:
df_title.head(20) #new

In [None]:
df_title.head()

# tfidf

In [65]:
len(abstracts_sen)

50000

In [70]:
# Tfidf for abstracts
vec = TfidfVectorizer(max_df=0.9,min_df=10,ngram_range=(1, 2),stop_words='english')
vec.fit(abstracts_sen[:10000])

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.9, max_features=None,
                min_df=10, ngram_range=(1, 2), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [71]:
len(vec.vocabulary_)

9692

In [72]:
# Creating sentences for each document with the strongest tfidf words

tfidf_words_abstract = []

for abstract in tqdm(abstracts_sen[:10000]):
    tfidf_words_abstract.append(" ".join(find_top_n(abstract,vec,10)))
    
tfidf_words_title = []

for title in tqdm(titles_sen[:10000]):
    tfidf_words_title.append(" ".join(find_top_n(title,vec,5)))

HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




HBox(children=(IntProgress(value=0, max=10000), HTML(value='')))




In [76]:
abs_production , abs_production_cos_score = pro_labeling(tfidf_words_abstract,ft_model,cosine,pro_1,pro_2,pro_3,pro_4,pro_5)
abs_material , abs_material_cos_score = mat_labeling(tfidf_words_abstract,ft_model,cosine,metal,ceramic,polymer)
df_abs = create_df_abs()

title_production , title_production_cos_score = pro_labeling(tfidf_words_title,ft_model,cosine,pro_1,pro_2,pro_3,pro_4,pro_5)
title_material , title_material_cos_score = mat_labeling(tfidf_words_title,ft_model,cosine,metal,ceramic,polymer)
df_title = create_df_title()

In [77]:
df_abs["Abs_Production"].value_counts()

SLM or DMLS          7535
FDM or FFF or EAM    2465
Name: Abs_Production, dtype: int64

In [78]:
df_title["Title_Production"].value_counts()

SLM or DMLS          7293
FDM or FFF or EAM    2707
Name: Title_Production, dtype: int64

In [79]:
df_abs["Abs_Material"].value_counts()

Metal      8052
Ceramic    1202
Polymer     746
Name: Abs_Material, dtype: int64

In [80]:
df_title["Title_Material"].value_counts()

Metal      7539
Ceramic    1523
Polymer     938
Name: Title_Material, dtype: int64

In [None]:
def visu_mat(doc):
    
    X= []

    for text in doc:
        X.append(ft_model.wv.get_vector(text))

    pca = PCA(n_components=2)
    X_reduced = pca.fit_transform(X)

    pca_df = pd.DataFrame(index=title_old[:10000],columns=['First Component','Second Component'],data=X_reduced)
    
    if doc == tfidf_words_abstract:
        pca_df["Labels"] = df_abs["Abs_Material"].values
    else :
        pca_df["Labels"] = df_title["Title_Material"].values

    fig, ax = plt.subplots(figsize=(10,10))
    ax = sns.scatterplot(x=pca_df['First Component'],y=pca_df['Second Component'],hue=pca_df['Labels'],palette ='Set1')
    
    return pca_df

In [None]:
a_mat = visu_mat(tfidf_words_abstract)

In [None]:
t_mat = visu_mat(tfidf_words_title)

In [None]:
tfidf_words_abstract[0]

In [None]:
ft_model.wv.similarity(tfidf_words_abstract[0],"metal") #0.424

In [None]:
ft_model.wv.similarity(tfidf_words_abstract[0],"ceramic") #0.312

In [None]:
ft_model.wv.similarity(tfidf_words_abstract[0],"polymer")#0.422

In [None]:
ft_model.wv.similarity(tfidf_words_title[0],"metal") 

In [None]:
ft_model.wv.similarity(tfidf_words_title[0],"ceramic") 

In [None]:
ft_model.wv.similarity(tfidf_words_title[0],"polymer") #0.424

In [None]:
tfidf_words_title[0]

In [None]:
a = ft_model.wv.get_vector(tfidf_words_title[0])
b = ft_model.wv.get_vector('metal')
1-cosine(a,b)