In [1]:
import pandas as pd
import numpy as numpy
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

# Load the pickled abstract and title lists from the working directory.
# NOTE: pickle.load can execute arbitrary code while deserialising, so these
# files must come from a trusted source.
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


abstract_list = _load_pickle('abstract_list.pkl')
title_list = _load_pickle('title_list.pkl')

In [2]:
# Wrap the abstracts in a single-column frame; each row holds one abstract.
column_names = ['Documents']
df = pd.DataFrame(abstract_list, columns=column_names)
df.head()

Unnamed: 0,Documents
0,we study the electronic states of giant single...
1,the recursion and pathintegral methods are app...
2,we analytically study phonon transmission and ...
3,we study both analytically and numerically pho...
4,we present a model for thin film growth by par...


In [3]:
# Fit a tf-idf vectorizer on the abstract corpus and build the
# document-term matrix used by the topic model below.
vectorizer_options = {
    'max_df': 0.95,           # upper document-frequency cut-off
    'min_df': 2,              # lower document-frequency cut-off
    'stop_words': 'english',  # scikit-learn's built-in English stop list
}
tfidf = TfidfVectorizer(**vectorizer_options)

documents = df['Documents']
dtm = tfidf.fit_transform(documents)

In [4]:
# Factorise the tf-idf matrix with NMF into 5 components (topics).
# random_state is pinned so repeated runs use the same seed.
nmf_options = dict(n_components=5, random_state=100)
nmf_model = NMF(**nmf_options)

# Left as the final expression so the fitted estimator's repr is displayed.
nmf_model.fit(dtm)

NMF(alpha=0.0, beta_loss='frobenius', init=None, l1_ratio=0.0, max_iter=200,
    n_components=5, random_state=100, shuffle=False, solver='cd', tol=0.0001,
    verbose=0)

In [5]:
# Printing top 5 words for each topic
#
# Each row of nmf_model.components_ holds one topic's weight per vocabulary
# term. argsort()[-5:] selects the five largest weights in ascending order,
# so the strongest term of each topic is printed last.

# The vocabulary lookup does not change between iterations, so resolve it
# once here instead of rebuilding the full list for every printed word.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); prefer the new method when it exists and fall
# back to the old one on releases that predate it.
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

for i,topic in enumerate(nmf_model.components_):
    print(f'THE TOP 5 WORDS FOR TOPIC #{i}')
    print([feature_names[index] for index in topic.argsort()[-5:]])
    # Two print('\n') calls emit four blank lines between topics.
    print('\n')
    print('\n')

THE TOP 5 WORDS FOR TOPIC #0
['structure', 'density', 'calculations', 'energy', 'surface']




THE TOP 5 WORDS FOR TOPIC #1
['field', 'magnetization', 'ferromagnetic', 'mn', 'magnetic']




THE TOP 5 WORDS FOR TOPIC #2
['ferroelectric', 'transition', 'films', 'temperature', 'phase']




THE TOP 5 WORDS FOR TOPIC #3
['substrate', 'raman', 'carbon', 'layer', 'graphene']




THE TOP 5 WORDS FOR TOPIC #4
['field', 'relaxation', 'polarization', 'current', 'spin']






In [6]:
# Project every abstract onto the fitted topics, then label each abstract
# with the index of its highest-weighted topic.
topic_results = nmf_model.transform(dtm)
dominant_topic = numpy.argmax(topic_results, axis=1)
df['labels'] = dominant_topic
df.head()

Unnamed: 0,Documents,labels
0,we study the electronic states of giant single...,0
1,the recursion and pathintegral methods are app...,0
2,we analytically study phonon transmission and ...,0
3,we study both analytically and numerically pho...,2
4,we present a model for thin film growth by par...,2


In [7]:
# Count how many abstracts were assigned to each topic label.
topic_labels = df['labels']
topic_labels.value_counts()

0    3754
2    2879
1    1666
4    1108
3     593
Name: labels, dtype: int64