In [1]:
import numpy as np
import pandas as pd
import pickle, os
import spacy
import fasttext
import re

from tqdm.auto import tqdm, trange
from scipy.spatial.distance import cosine

from utils import find_top_n, pro_labeling ,mat_labeling, text_cleaning


from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
import seaborn as sns
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

### Preprocessing

In [3]:
# Loading spacy nlp model
spacy_model = spacy.load("en_core_web_sm")

# Getting the titles from file names
title_list = []

for file in os.listdir("Local pdf text files"):
  if file.endswith(".txt"):
    title_list.append(file.split(".")[0])
    
# Getting the text from local text files
pdf_list = []
    
for file in os.listdir("Local pdf text files"):
  if file.endswith(".txt"):
    with open("Local pdf text files/"+file,"r",encoding="utf8") as f:
        pdf_list.append(f.read())

pdfs = list(map(lambda x: text_cleaning(x,spacy_model), pdf_list))
titles = list(map(lambda x: text_cleaning(x,spacy_model), title_list))

In [4]:
# Concatenating pdfs and titles for tfidf
pdf_title = pdfs.copy() 
for text in titles:
    pdf_title.append(text)

In [5]:
len(pdf_title)

198

In [6]:
vec = TfidfVectorizer(max_df=0.9,min_df=2,stop_words='english',ngram_range=(1, 2))
X = vec.fit_transform(pdf_title)

In [23]:
vec.vocabulary_['ceramic']

203

In [7]:
len(vec.vocabulary_)

1764

In [8]:
np.array(vec.get_feature_names())

array(['ability', 'ability fabricate', 'able', ..., 'zirconia',
       'zirconia ceramic', 'zone'], dtype='<U27')

In [9]:
X[0].toarray()

array([[0.        , 0.        , 0.        , ..., 0.11914613, 0.1283676 ,
        0.        ]])

In [10]:
X[0].toarray().shape

(1, 1764)

In [30]:
ceramic = vec.transform(["ceramic"]).toarray()
polymer = vec.transform(["polymer"]).toarray()
metal = vec.transform(["metal"]).toarray()

In [26]:
ceramic[0][203]

1.0

In [49]:
first = vec.transform([pdf_title[2]]).toarray()

In [50]:
1-cosine(first,metal)

0.06758889471103946

In [51]:
1-cosine(first,ceramic)

0.3166400912867242

In [52]:
1-cosine(first,polymer)

0.07952695025708878

In [54]:
pdf_title[1]

'be impossible to print glass directly from melt layer by layer glass be not only very sensitive to temperature gradient between different layer but also to the cool process to achieve glass state the melt have to be cool rapidly to avoid crystallization of the material and then anneal to remove cool induced stress in printing of glass the object be shape at room temperature and then fire the material property of the final object be crucially dependent on the frit size of the glass powder use during shape the chemical formula of the binder and the firing procedure for frit size below seem to find constant volume of pore of less than decrease frit size lead to an increase in the number of pore which then lead to an increase of opacity the two different binder hydroxyethyl cellulose and carboxymethylcellulose sodium salt generate very different porosity the porosity of sample with hydroxyethyl cellulose be similar to frit only sample whereas carboxymethylcellulose sodium salt create glas

In [37]:
def test(text):
    
    tfidf = vec.transform([text]).toarray()
    
    lbl= []

    lbl.append(1-cosine(tfidf,metal))
    lbl.append(1-cosine(tfidf,ceramic))
    lbl.append(1-cosine(tfidf,polymer))
    
    if lbl.index(max(lbl))==0:
        return 'Metal',max(lbl)
    elif lbl.index(max(lbl))==1:
        return 'Ceramic',max(lbl)
    else:
        return 'Polymer',max(lbl)

In [43]:
for text in pdf_title[:20]:
    test(text)

('Ceramic', 0.14853056644414775)

In [44]:
test(pdf_title[20])

('Polymer', 0.06446962811050072)