# Topic Modeling using sklearn

In [3]:
# Source: http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from topic_model import get_corpus, vectorize_corpus
from time import time
import pandas as pd

In [2]:
# Load the PGE faculty corpus and vectorize it with the project helpers.
# The options request: no TF-IDF weighting (tf_idf=False), 'lem' preprocessing
# (presumably lemmatization - confirm in topic_model), unigrams only, and
# document-frequency pruning (max_df=0.8, min_df=5) with no cap on vocabulary size.
corpus = get_corpus('../data/pge_database.json')
vectorizer, data_vectorized = vectorize_corpus(
    corpus,
    tf_idf=False,
    stem_lem='lem',
    ngram_range=(1, 1),
    max_df=0.8,
    min_df=5,
    max_features=None,
)

0 faculties have missing papers in ../data/pge_database.json
Running nlp-pipeline on faculties with non-missing papers...


In [3]:
NUM_TOPICS = 12
# All three estimators below are randomized (online LDA updates, NMF's SVD-based
# initialisation, TruncatedSVD's randomized solver). Without a fixed seed the topic
# indices and weights change on every run, so topic numbers quoted further down the
# notebook could not be reproduced.
RANDOM_STATE = 42

# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(
    n_components=NUM_TOPICS,
    max_iter=10,
    learning_method='online',
    random_state=RANDOM_STATE,
)
lda_Z = lda_model.fit_transform(data_vectorized)

# Build a Non-Negative Matrix Factorization Model
nmf_model = NMF(n_components=NUM_TOPICS, random_state=RANDOM_STATE)
nmf_Z = nmf_model.fit_transform(data_vectorized)

# Build a Latent Semantic Indexing Model
lsi_model = TruncatedSVD(n_components=NUM_TOPICS, random_state=RANDOM_STATE)
lsi_Z = lsi_model.fit_transform(data_vectorized)

In [4]:
def print_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic in ``model``.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Iterated row by row; each row must support ``argsort()`` and integer
        indexing (e.g. a 2-D numpy array of topic-term weights).
    vectorizer : object exposing ``get_feature_names_out()`` or ``get_feature_names()``
        Supplies the term for each column index of ``model.components_``.
    top_n : int, default 10
        Number of terms printed per topic.

    Returns
    -------
    None. For each topic, prints a ``Topic <idx>:`` header followed by a list
    of ``(term, weight)`` tuples ordered from largest to smallest weight.
    """
    # Fetch the vocabulary once: the previous version rebuilt the full feature
    # list for every printed term. Prefer get_feature_names_out, because
    # get_feature_names was removed from scikit-learn vectorizers in 1.2.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending; the reversed slice walks the last top_n
        # positions backwards, i.e. the largest weights first.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        print([(feature_names[i], topic[i]) for i in top_indices])
 
# Show the top terms of each fitted model in turn, with a rule after each one.
for heading, fitted_model in (
    ("LDA Model:", lda_model),
    ("NMF Model:", nmf_model),
    ("LSI Model:", lsi_model),
):
    print(heading)
    print_topics(fitted_model, vectorizer)
    print("=" * 20)

LDA Model:
Topic 0:
[('fracture', 6.7153137609373204), ('permeability', 4.0932162327593637), ('drilling', 3.4545693312910117), ('recovery', 3.1793257561466328), ('pore', 3.1051685785184358), ('wells', 3.0661453406493031), ('fractures', 3.0107315510561921), ('foam', 2.8910909084000793), ('wellbore', 2.8322879954599474), ('injection', 2.7901308147195154)]
Topic 1:
[('vibrations', 0.34589383127904855), ('blocks', 0.3435578246546574), ('entrance', 0.3435521342626297), ('allowing', 0.3340677682738275), ('noted', 0.33099649622087596), ('inverted', 0.32848968083888885), ('differences', 0.32789144582529256), ('takes', 0.32551033421210424), ('suggestions', 0.32543373889946053), ('mitigating', 0.32475624971327888)]
Topic 2:
[('battery', 0.47977405891154123), ('multiscale', 0.46459274766555181), ('fracture', 0.4481020153878012), ('hybrid', 0.44354969611585648), ('reserves', 0.43237976894221625), ('pore', 0.41306412924555275), ('implicit', 0.39451667557558584), ('nonlinear', 0.38993545514310723), 

[('fracture', 0.3249153398419467), ('permeability', 0.17415927232499037), ('fractures', 0.14054805470635229), ('pore', 0.1247516812482815), ('recovery', 0.12243395433421446), ('drilling', 0.12217597382117212), ('wells', 0.12165391730929237), ('wellbore', 0.11371354786864064), ('injection', 0.11296516610600196), ('foam', 0.10983461246295145)]
Topic 1:
[('surfactant', 0.2577505696634087), ('recovery', 0.20114953906887603), ('foam', 0.17663763062516763), ('polymer', 0.14060278042965882), ('surfactants', 0.13530153593099264), ('permeability', 0.12394163080258837), ('steam', 0.10990829311802811), ('salinity', 0.10592756005456204), ('combustion', 0.10476482137003301), ('eor', 0.10384492783496463)]
Topic 2:
[('drilling', 0.25355417333961749), ('liquid', 0.2245519619750467), ('pipe', 0.2068082490258005), ('slug', 0.17556906842884654), ('foam', 0.11042314355358455), ('holdup', 0.10604900201698575), ('cement', 0.10435778067331958), ('wellbore', 0.099885613745927859), ('viscosity', 0.096005781588

In [5]:
# Sample abstract used as the query document for the similarity experiments below.
text = "Steam injection is a widely used oil-recovery method that has been commercially successful in many types of heavy-oil reservoirs, including the oil sands of Alberta, Canada. Steam is very effective in delivering heat that is the key to heavy-oil mobilization. In the distant past in California, and also recently in Alberta, solvents were/are being used as additives to steam for additional viscosity reduction. The current applications are in field projects involving steam-assisted gravity drainage (SAGD) and cyclic steam stimulation (CSS).The past and present projects using solvents alone or in combination with steam are reviewed and evaluated, including enhanced solvent SAGD (ES-SAGD) and liquid addition to steam for enhancing recovery (LASER). The use of solvent in other processes, such as effective solvent extraction incorporating electromagnetic heating (ESEIEH) and after cold-heavy-oil production with sand (CHOPS), are also reviewed. The theories behind the use of solvents with steam are outlined. These postulate additional heavy-oil/bitumen mobilization; oil mobilization ahead of the steam front; and oil mobilization by solvent dispersion caused by frontal instability. The plausibility of the different approaches and solvent availability and economics are also discussed."

# Vectorize the single document with the already-fitted vocabulary, then project
# it onto the NMF topics; [0] takes the one row that corresponds to `text`.
text_vectorized = vectorizer.transform([text])
x = nmf_model.transform(text_vectorized)[0]
print(x)

[ 0.          0.          0.          0.          0.          0.
  0.41162654  0.          0.          0.          0.          0.        ]


In [6]:
# Shape of the fitted NMF topic-term matrix (the first dimension matches NUM_TOPICS;
# the second is presumably the vocabulary size kept by the vectorizer).
nmf_model.components_.shape

(12, 5166)

In [7]:
# NOTE(review): the method is referenced, not called, so this cell only displays the
# bound-method repr (which includes the vectorizer's parameters). Call
# vectorizer.get_feature_names() if the vocabulary itself is wanted.
vectorizer.get_feature_names

<bound method CountVectorizer.get_feature_names of TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.8, max_features=None, min_df=5,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words={'be', 'you', 'to', 'll', 'd', "couldn't", 'hadn', 'just', 'above', 'most', 'they', 'wouldn', 'yours', 'for', 'if', 'won', "wouldn't", 'under', 'your', "won't", 'whom', "it's", 'before', 'both', 'which', 'couldn', 'where', 'theirs', 'isn', 's', "she's", 'doesn', 'yourselves', 'who', 'down... 'how', 'those', 'do', 'ma', "should've", 'out', 'we', 'myself', 'over', 'each', 'i', 'off', 'such'},
        strip_accents='unicode', sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)>

In [8]:
import numpy as np  # kept in this cell: later cells use `np` and no earlier cell imports it

TOP_N_WORDS = 10

# Fetch the vocabulary once instead of rebuilding it for every printed word.
# get_feature_names was removed from scikit-learn vectorizers in 1.2, so prefer
# get_feature_names_out when the installed version provides it.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# One list of words per NMF topic, highest weight first. The earlier version
# allocated an uninitialised float array for this and never filled it (a float
# array cannot hold strings), so a plain list of lists is used instead.
topic_words = [
    [feature_names[i] for i in topic.argsort()[:-TOP_N_WORDS - 1:-1]]
    for topic in nmf_model.components_
]

for words in topic_words:
    print(words)

['fracture', 'fractures', 'hydraulic', 'stress', 'fracturing', 'propagation', 'height', 'wells', 'multiple', 'geometry']
['surfactant', 'recovery', 'polymer', 'surfactants', 'foam', 'salinity', 'floods', 'flooding', 'eor', 'microemulsion']
['drilling', 'wellbore', 'cement', 'mud', 'bit', 'pipe', 'fluids', 'circulation', 'casing', 'annulus']
['basins', 'unconventional', 'resources', 'reserves', 'basin', 'frontier', 'uncertainty', 'decline', 'wells', 'recoverable']
['liquid', 'slug', 'pipe', 'holdup', 'stratified', 'velocity', 'loading', 'viscosity', 'foam', 'pipes']
['organic', 'kerogen', 'pore', 'rich', 'resistivity', 'nmr', 'rock', 'measurements', 'shales', 'porosity']
['combustion', 'steam', 'sagd', 'solvent', 'isc', 'bitumen', 'heavy', 'recovery', 'asphaltene', 'injection']
['optimization', 'history', 'seismic', 'wells', 'streamline', 'matching', 'grid', 'geologic', 'permeability', 'drainage']
['acid', 'conductivity', 'acidizing', 'stimulation', 'fracture', 'temperature', 'carbonate

In [9]:
from sklearn.metrics.pairwise import euclidean_distances
 
def most_similar(x, Z, top_n=5):
    """Return the ``top_n`` rows of ``Z`` closest to ``x`` in Euclidean distance.

    ``x`` is reshaped to a single row before the distances to every row of
    ``Z`` are computed. The result is a numpy array built from
    ``(row_index, distance)`` pairs, ordered by increasing distance.
    """
    # euclidean_distances returns one row of distances per query; there is one query.
    distances = euclidean_distances(x.reshape(1, -1), Z)[0]
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1])
    return np.array(ranked[:top_n])
 
final_df = pd.read_json('../data/final_database.json')

# Nearest documents to the query vector `x` in NMF topic space.
similarities = most_similar(x, nmf_Z)
# Re-order the (id, distance) rows by id so they line up with the index-sorted frame below.
similarities = similarities[similarities[:, 0].argsort()]
document_ids = list(map(int, similarities[:, 0]))

# Select the matching rows, sort them by index, and attach each row's distance.
# NOTE(review): assumes every id in document_ids exists in final_df's index; a
# missing id would make the column length mismatch - confirm against the data.
results_df = (
    final_df[final_df.index.isin(document_ids)]
    .sort_index()
    .assign(similarity=similarities[:, 1])
)

# Display closest first; the helper column is only used for ordering.
results_df.sort_values(by='similarity').drop(columns='similarity')

Unnamed: 0,abstracts,email,faculty_name,faculty_title,google_scholar_link,office,page,paper_titles,phone,predicted_cluster_num,predicted_research_areas,research_areas
74,Expanding-solvent steam-assisted gravity drai...,okuno@utexas.edu,Ryosuke Okuno,Assistant Professor,https://scholar.google.com/citations?user=zqr5...,CPE 5.118B,https://www.pge.utexas.edu/facultystaff/profil...,Mechanistic simulation study of expanding-sol...,(512) 471-3250,7,"[asphaltene, chamber, recovery, heavy, isc, so...",Phase Behavior; Thermodynamics; Enhanced Oil R...
48,The gas-mobility-control aspects of foamed ga...,margot.gerritsen@stanford.edu,Margot Gerritsen,Associate Professor of Energy Resources Engine...,,"GESB 088, M10 Huang",https://pangea.stanford.edu/people/margot-gerr...,Modeling Foam Displacement With the Local-Equ...,"(650) 725-3542, (650) 725-2727",7,"[asphaltene, chamber, recovery, heavy, isc, so...",I s p e c i a l i z e i n r e n e w a b ...
49,Shale reservoirs have a significant fraction ...,barrufet@tamu.edu,Maria A. Barrufet,Professor,https://scholar.google.com/citations?user=VWtd...,RICH 407C,https://engineering.tamu.edu/petroleum/profile...,Effects of Thermodynamic and Rock Properties ...,979-845-0314,7,"[asphaltene, chamber, recovery, heavy, isc, so...",Multicomponent thermodynamic and transport phe...
7,Performance predictions of the In-Situ Combus...,hascakir@tamu.edu,Berna Hascakir,Assistant Professor,https://scholar.google.com/citations?user=wdJY...,RICH 401N,https://engineering.tamu.edu/petroleum/profile...,Water and aromatics fraction interaction at e...,979-845-6614,7,"[asphaltene, chamber, recovery, heavy, isc, so...",Heavy oil and oil shale recovery with enhanced...
64,"In chemical product design, application o...",nimir.elbashir@qatar.tamu.edu,Nimir Elbashir,Professor,https://scholar.google.com/citations?user=mO2O...,204F,https://engineering.tamu.edu/petroleum/profile...,"Special issue (GPS, 2016): The Fifth Internat...",974-4423-0128,2,"[reforming, reactor, geological, tpwl, emissio...",Dr. Elbashir is the Director of Texas A&M Engi...


In [10]:
# Ids of the retrieved documents (in ascending id order, because the similarities
# were re-sorted by id before this list was built).
document_ids

[7, 48, 49, 64, 74]

In [6]:
import pickle
from topic_model import MyTopicModel
# Restore the previously saved topic model and its vectorizer.
# NOTE(review): pickle.load executes arbitrary code from the file - only load
# pickles produced by this project. Presumably MyTopicModel must be importable
# for the model pickle to load, hence the import above - confirm.
with open('../data/pickle/pge_sklearn_LDA.pkl', 'rb') as model_file:
    topic_model = pickle.load(model_file)

with open('../data/pickle/pge_sklearn_LDA_vectorizer.pkl', 'rb') as vectorizer_file:
    topic_vectorizer = pickle.load(vectorizer_file)

# Rebinds final_df to the database stored alongside the pickled LDA model.
final_df = pd.read_json('../data/json/final_sklearn_database_LDA.json')

In [13]:
# Project the same query `text` with the unpickled model/vectorizer pair.
# This rebinds `x`, replacing the vector computed earlier with nmf_model.
x = topic_model.transform(topic_vectorizer.transform([text]))[0]
print(x)

[ 0.          0.          0.          0.          0.          0.
  0.41681185  0.          0.          0.          0.          0.        ]


In [15]:
# Ask the project model for the 5 documents closest to `text`. Judging by the
# displayed result, each row is presumably (document id, distance) - confirm in topic_model.
topic_model.most_similar([text], topic_vectorizer, top_n=5)

array([[ 74.        ,   0.11711831],
       [ 48.        ,   0.18981306],
       [ 49.        ,   0.24327598],
       [  7.        ,   0.2675512 ],
       [ 64.        ,   0.27350828]])

In [None]:
# NOTE(review): this only references the `score` attribute without calling it, and
# the cell has no execution count - looks like leftover exploration; remove or call it.
topic_model.score

In [7]:
from topic_model import *

In [9]:
# Run the pipeline on the majors database. This rebinds `corpus` and `vectorizer`
# from the earlier cells. Options: TF-IDF weighting on, no stemming/lemmatization,
# unigrams only, document-frequency pruning with max_df=0.8 and min_df=5.
corpus = get_corpus('../data/json/majors_database.json')
vectorizer, matrix = vectorize_corpus(
    corpus,
    tf_idf=True,
    stem_lem=None,
    ngram_range=(1, 1),
    max_df=0.8,
    min_df=5,
    max_features=None,
)

# Fit the project's topic-model wrapper (12 topics, LDA) and keep what fit_transform returns.
model = MyTopicModel(n_topics=12, algorithm='LDA')
y_pred = model.fit_transform(matrix)

In [14]:
# NOTE(review): pyLDAvis is not imported explicitly anywhere visible; presumably it
# arrives through `from topic_model import *` - confirm and import it directly.
pyLDAvis.enable_notebook()
# Build the visualisation for the fitted model via the project wrapper, using t-SNE
# as the `mds` option.
vis = model.visualize_lda_model(matrix, vectorizer, mds='tsne')
# Last expression of the cell, so the notebook renders the visualisation object.
vis