# Topic Modeling using sklearn

In [8]:
# Source: http://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from topic_model import get_corpus, vectorize_corpus
from time import time
import pandas as pd

In [11]:
# Load the corpus from the JSON database, then vectorize it. vectorize_corpus
# returns both the vectorizer and the vectorized data, which later cells reuse.
# NOTE(review): get_corpus / vectorize_corpus come from the project's
# topic_model module; confirm the exact meaning of each option there.
corpus = get_corpus('../data/pge_database.json')
vectorizer, data_vectorized = vectorize_corpus(
    corpus,
    tf_idf=False,
    stem_lem='lem',
    ngram_range=(1, 1),
    max_df=0.8,
    min_df=5,
    max_features=None,
)

0 faculties have missing papers in ../data/pge_database.json
Running nlp-pipeline on faculties with non-missing papers...


In [34]:
NUM_TOPICS = 12
# Fixed seed so that Restart & Run All reproduces the same topics. Without it,
# online LDA and the randomized SVD solver can give different components
# (and therefore different topic numbering) on every run.
RANDOM_STATE = 42

# Build a Latent Dirichlet Allocation Model
lda_model = LatentDirichletAllocation(n_components=NUM_TOPICS, max_iter=10, learning_method='online',
                                      random_state=RANDOM_STATE)
# fit_transform returns the per-document topic representation for the training data.
lda_Z = lda_model.fit_transform(data_vectorized)

# Build a Non-Negative Matrix Factorization Model
nmf_model = NMF(n_components=NUM_TOPICS, random_state=RANDOM_STATE)
nmf_Z = nmf_model.fit_transform(data_vectorized)

# Build a Latent Semantic Indexing Model
lsi_model = TruncatedSVD(n_components=NUM_TOPICS, random_state=RANDOM_STATE)
lsi_Z = lsi_model.fit_transform(data_vectorized)

In [36]:
# Inspect the NMF output of fit_transform: one row per document, with
# NUM_TOPICS topic weights per row.
nmf_Z

array([[  9.44315742e-02,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          5.71839334e-01,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   2.75585116e-02,
          3.50217275e-01,   0.00000000e+00,   4.61852066e-02,
          7.98723143e-02,   9.52715419e-02,   9.33624053e-03,
          2.81229957e-02,   0.00000000e+00,   0.00000000e+00],
       [  0.00000000e+00,   0.00000000e+00,   5.98930106e-02,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   6.14613248e-01,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
       [  3.39448889e-02,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00,
          0.00000000e+00,   6.55497964e-01,   4.97246678e-04,
          0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
    

In [13]:
def print_topics(model, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted terms of every topic in ``model``.

    Parameters
    ----------
    model : fitted decomposition model
        Must expose ``components_``, an iterable of per-topic weight arrays
        (each supporting ``argsort`` and integer indexing).
    vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()`` or the legacy
        ``get_feature_names()``; position ``i`` names column ``i`` of a topic.
    top_n : int, default 10
        Number of terms to show per topic.

    For each topic, prints a ``Topic <idx>:`` header followed by a list of
    ``(term, weight)`` tuples ordered from highest to lowest weight.
    """
    # Resolve the vocabulary once instead of once per printed term.
    # get_feature_names() was removed in scikit-learn 1.2, so prefer its
    # replacement when the vectorizer provides it.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for idx, topic in enumerate(model.components_):
        print("Topic %d:" % (idx))
        # argsort is ascending; the reversed slice yields the top_n largest first.
        top_indices = topic.argsort()[:-top_n - 1:-1]
        # str() keeps the printed term a plain string even when the names
        # come back as NumPy string scalars.
        print([(str(feature_names[i]), topic[i]) for i in top_indices])
 
# Print the top terms per topic for each fitted model, with a rule after each.
for label, fitted_model in (
    ("LDA Model:", lda_model),
    ("NMF Model:", nmf_model),
    ("LSI Model:", lsi_model),
):
    print(label)
    print_topics(fitted_model, vectorizer)
    print("=" * 20)

LDA Model:
Topic 0:
[('combustion', 0.42704519739551583), ('foam', 0.39535921658975576), ('steam', 0.39518920160016641), ('experiments', 0.38267015079800526), ('seismic', 0.38206513222154637), ('co2', 0.35932842980317758), ('heavy', 0.35475778276488318), ('polymer', 0.35412610400808775), ('situ', 0.35194381626118576), ('gel', 0.34924924838275012)]
Topic 1:
[('co2', 0.38783858762222162), ('foam', 0.35396571437263791), ('significance', 0.34920984910591896), ('storage', 0.34234498728981871), ('occur', 0.33839258204200962), ('injection', 0.33668960441443613), ('heated', 0.33407901916456728), ('eor', 0.33038884002836139), ('bed', 0.32912536460671798), ('fracture', 0.3289242484020809)]
Topic 2:
[('rheology', 0.35086109969783569), ('daily', 0.34093329499096336), ('original', 0.33540312329046462), ('voids', 0.33456795169854403), ('focused', 0.33438277998441973), ('cluster', 0.33266110700439561), ('dts', 0.32959471344300234), ('updates', 0.32881588454942728), ('quantified', 0.32822518455652044)

[('drilling', 0.32594309471444177), ('cement', 0.23526878342870433), ('casing', 0.19242940593345456), ('wellbore', 0.16890892977718588), ('acid', 0.16482464302599611), ('bit', 0.13905227966145617), ('mud', 0.12188027910472765), ('temperature', 0.10661778740533862), ('circulation', 0.097232597016016123), ('practices', 0.091888075465116625)]
Topic 4:
[('pore', 0.22909663043055853), ('hydrate', 0.19312569503984145), ('organic', 0.14703255240007998), ('multiscale', 0.1384276232444715), ('porous', 0.13015102518966132), ('kerogen', 0.1231710936583989), ('media', 0.11524996151181251), ('shear', 0.095126754158068233), ('nmr', 0.090908843704422843), ('resistivity', 0.082168851886853675)]
Topic 5:
[('sagd', 0.26714291539971463), ('steam', 0.2480303571767862), ('combustion', 0.2457941983891748), ('bitumen', 0.14290985701441666), ('fracture', 0.13979821449499263), ('solvent', 0.12973097607170034), ('isc', 0.12588427695261359), ('heavy', 0.10624677932979569), ('temperature', 0.096706129826908821), 

In [17]:
# Sample abstract used as a new query document.
text = "Steam injection is a widely used oil-recovery method that has been commercially successful in many types of heavy-oil reservoirs, including the oil sands of Alberta, Canada. Steam is very effective in delivering heat that is the key to heavy-oil mobilization. In the distant past in California, and also recently in Alberta, solvents were/are being used as additives to steam for additional viscosity reduction. The current applications are in field projects involving steam-assisted gravity drainage (SAGD) and cyclic steam stimulation (CSS).The past and present projects using solvents alone or in combination with steam are reviewed and evaluated, including enhanced solvent SAGD (ES-SAGD) and liquid addition to steam for enhancing recovery (LASER). The use of solvent in other processes, such as effective solvent extraction incorporating electromagnetic heating (ESEIEH) and after cold-heavy-oil production with sand (CHOPS), are also reviewed. The theories behind the use of solvents with steam are outlined. These postulate additional heavy-oil/bitumen mobilization; oil mobilization ahead of the steam front; and oil mobilization by solvent dispersion caused by frontal instability. The plausibility of the different approaches and solvent availability and economics are also discussed."
# Vectorize the single document with the existing vectorizer, then map it
# onto the NMF topics; row 0 is the topic-weight vector for this abstract.
query_vectorized = vectorizer.transform([text])
query_topics = nmf_model.transform(query_vectorized)
x = query_topics[0]
print(x)

[ 0.          0.          0.          0.          0.          0.40032679
  0.          0.          0.          0.          0.          0.        ]


In [37]:
# Topic indices ordered by ascending weight; the last entry is the
# highest-weighted topic for the query abstract.
x.argsort()

[autoreload of topic_model failed: Traceback (most recent call last):
  File "/anaconda3/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 245, in check
    superreload(m, reload, self.old_objects)
  File "/anaconda3/lib/python3.6/site-packages/IPython/extensions/autoreload.py", line 368, in superreload
    module = reload(module)
  File "/anaconda3/lib/python3.6/imp.py", line 315, in reload
    return importlib.reload(module)
  File "/anaconda3/lib/python3.6/importlib/__init__.py", line 166, in reload
    _bootstrap._exec(spec, module)
  File "<frozen importlib._bootstrap>", line 618, in _exec
  File "<frozen importlib._bootstrap_external>", line 674, in exec_module
  File "<frozen importlib._bootstrap_external>", line 781, in get_code
  File "<frozen importlib._bootstrap_external>", line 741, in source_to_code
  File "<frozen importlib._bootstrap>", line 219, in _call_with_frames_removed
  File "/Users/Neha/Documents/GitHub/professor-profiler/src/topic_model.py", li

array([ 0,  1,  2,  3,  4,  6,  7,  8,  9, 10, 11,  5])

In [22]:
from sklearn.metrics.pairwise import euclidean_distances
 
def most_similar(x, Z, top_n=5):
    """Return the ``top_n`` rows of ``Z`` nearest to ``x`` by Euclidean distance.

    ``x`` is reshaped to a single row before being compared with every row of
    ``Z``. The result is a list of ``(row_index, distance)`` tuples sorted by
    increasing distance, so the first entry is the closest row.
    """
    query_row = x.reshape(1, -1)
    distances = euclidean_distances(query_row, Z)[0]
    ranked = sorted(enumerate(distances), key=lambda pair: pair[1])
    return ranked[:top_n]
 
# Look up the faculty profiles for the documents nearest to the query in NMF topic space.
# NOTE(review): rows are selected by position, which assumes final_df's row order
# matches the row order of nmf_Z — confirm, since the two come from different JSON files.
final_df = pd.read_json('../data/final_database.json')
similarities = most_similar(x, nmf_Z)
# Each pair is (row position, Euclidean distance); smaller means closer.
for document_id, similarity in similarities:
    faculty_row = final_df.iloc[document_id]
    print(faculty_row.loc[['faculty_name', 'research_areas']])

faculty_name                                             Sara Abedi
research_areas    Mechanics and physics of geomaterials Multisca...
Name: 65, dtype: object
faculty_name                                           Michael King
research_areas    3D reservoir modeling and characterization Pre...
Name: 48, dtype: object
faculty_name                                         Eduardo Gildin
research_areas    Model reduction of large scale dynamical syste...
Name: 15, dtype: object
faculty_name                                          Michael Pyrcz
research_areas    Integrated Reservoir Characterization; Unconve...
Name: 49, dtype: object
faculty_name                                    Dominique Guerillot
research_areas    Reservoir characterization and simulation Carb...
Name: 13, dtype: object


# Topic Modeling using Gensim