In [18]:
from cleaning import database_cleaner
from nlp_pipeline import feature_matrix
from model import MyModel

import string
import numpy as np
import pandas as pd
import pickle

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import log_loss, confusion_matrix
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy  import linkage, dendrogram
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples

In [2]:
tamu_df = database_cleaner('../data/tamu_database.json')
tamu_df.head()

Unnamed: 0,faculty_name,email,google_scholar_link,office,page,phone,faculty_title,paper_titles,abstracts,research_areas
0,A. Daniel Hill,danhill@tamu.edu,https://scholar.google.com/citations?user=EBnW...,RICH 1012,https://engineering.tamu.edu/petroleum/profile...,979-845-2244,Professor,Mechanism of wormholing and its optimal condi...,Acid stimulation is commonly used in carbonat...,Dr. Hill has five patents in oil recovery and ...
1,A. Rashid Hasan,rhasan@tamu.edu,https://scholar.google.com/citations?user=6lMX...,RICH 501E,https://engineering.tamu.edu/petroleum/profile...,979.847.8564,Professor,,,Wellbore Heat transferSystematic modeling of h...
2,Akhil Datta-Gupta,datta-gupta@tamu.edu,https://scholar.google.com/citations?user=Al-S...,RICH 401G,https://engineering.tamu.edu/petroleum/profile...,979-847-9030,University Distinguished Professor,Radius of Investigation and its Generalizatio...,The concept of radius of investigation is fun...,Dr. Datta-Gupta has research interests in rapi...
3,Albertus Retnanto,albertus.retnanto@qatar.tamu.edu,https://scholar.google.com/citations?user=kN7P...,204K,https://engineering.tamu.edu/petroleum/profile...,974-4423-0281,Associate Professor of the Practice,After-Closure Idiosyncrasies of Fracture C...,"Fracture Calibration Tests (FCT), are stra...",Field development and planning Production enha...
4,Aziz Rahman,aziz.rahman@qatar.tamu.edu,https://scholar.google.com/citations?user=PYRt...,204E,https://engineering.tamu.edu/petroleum/profile...,974-4423-0601,Assistant Professor,,,Flow assurance Multiphase pipe flow Wellbore h...


In [3]:
# For nlp, only retaining faculty_name, research_areas, paper_titles, abstracts
df = tamu_df[['faculty_name', 'research_areas', 'paper_titles', 'abstracts']]
df.head()

Unnamed: 0,faculty_name,research_areas,paper_titles,abstracts
0,A. Daniel Hill,Dr. Hill has five patents in oil recovery and ...,Mechanism of wormholing and its optimal condi...,Acid stimulation is commonly used in carbonat...
1,A. Rashid Hasan,Wellbore Heat transferSystematic modeling of h...,,
2,Akhil Datta-Gupta,Dr. Datta-Gupta has research interests in rapi...,Radius of Investigation and its Generalizatio...,The concept of radius of investigation is fun...
3,Albertus Retnanto,Field development and planning Production enha...,After-Closure Idiosyncrasies of Fracture C...,"Fracture Calibration Tests (FCT), are stra..."
4,Aziz Rahman,Flow assurance Multiphase pipe flow Wellbore h...,,


In [4]:
missing = df['paper_titles'] == ''
sum(missing)

16

In [5]:
# Working with non-missing entries i.e. 26 faculties
df_nlp = df[~missing]
len(df_nlp)

26

In [6]:
df_nlp.head()

Unnamed: 0,faculty_name,research_areas,paper_titles,abstracts
0,A. Daniel Hill,Dr. Hill has five patents in oil recovery and ...,Mechanism of wormholing and its optimal condi...,Acid stimulation is commonly used in carbonat...
2,Akhil Datta-Gupta,Dr. Datta-Gupta has research interests in rapi...,Radius of Investigation and its Generalizatio...,The concept of radius of investigation is fun...
3,Albertus Retnanto,Field development and planning Production enha...,After-Closure Idiosyncrasies of Fracture C...,"Fracture Calibration Tests (FCT), are stra..."
5,Berna Hascakir,Heavy oil and oil shale recovery with enhanced...,Water and aromatics fraction interaction at e...,Performance predictions of the In-Situ Combus...
6,David Schechter,Spraberry Trend Area Geological and petrophysi...,Gas Injection for EOR in Organic Rich Shales....,Laboratory experiments of gas injection in or...


# Clustering to find research topics
* Applying K-means to TF-IDF or bag-of-words features produces topic centroids. 
* For this sprint, we will perform topic modeling on abstracts using k-means and hierarchical clustering.

In [10]:
X = matrix
km = KMeans(10) 
y = km.fit_predict(X)

In [11]:
# The centroid to which the faculty maps to
y

array([1, 2, 1, 4, 7, 1, 5, 0, 2, 6, 5, 5, 1, 5, 9, 3, 5, 5, 0, 5, 3, 4,
       2, 5, 7, 8], dtype=int32)

In [12]:
# "topics" Kmeans has discovered i.e. the centroids
centroids = km.cluster_centers_
centroids

array([[0.00657833, 0.00238547, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.01207467, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.00213336, 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.06116011, 0.03058006,
        0.03058006],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [13]:
centroids.shape

(10, 10028)

In [14]:
# But for topics we are only really interested in the most present words, 
# i.e. features/dimensions with the greatest representation in the centroid. 
# Print out the top ten words for each centroid
indices = np.argsort(centroids, axis=1)
top_ten_indices = indices[:, -10:]
top_ten_indices # in ascending order of importance

array([[2550, 8093, 9435, 7770, 9447, 2502, 1143, 7739, 7173, 4050],
       [9942, 9318, 3860, 9029, 1542,  507, 8678, 2079, 3967,  503],
       [9177, 3850, 8337, 7173, 7099, 2502, 5947, 9847,  880, 7740],
       [4388, 3878, 9029, 9847, 6396, 7099, 1573, 4483, 3078, 1621],
       [4050, 9807,  966, 4348, 8450, 5072, 1903, 8644, 7955, 6396],
       [9847, 3969, 7741, 7740, 7173, 5947, 8224, 3850, 3967, 4050],
       [6752, 4063, 6441, 3074, 5411, 4456, 8668, 1312, 7016, 1268],
       [8006, 8887, 9807, 3967, 9864, 7536, 8558, 9862, 4606, 6396],
       [7129, 9817, 8415, 6963, 4117, 3831, 5178, 4368, 7757,  906],
       [7979,  818, 4525, 4203, 7376, 2502,  592, 6707, 4072, 7336]])

In [15]:
# Go back to your vectorizer object to find out what words each of these features corresponds to.
# reverse the vocab to look-up keys using values
reverse_vocab = {}
for key, value in vectorizer.vocabulary_.items():
    reverse_vocab[value] = key
    
top_ten_features = np.array([reverse_vocab[index] for row in top_ten_indices for index in row])
top_ten_features = top_ten_features.reshape(len(centroids), -1)
top_ten_features

array([['decline', 'sec', 'uncertainty', 'resources', 'unconventional',
        'data', 'basins', 'reserves', 'production', 'gas'],
       ['wormhole', 'treatment', 'fluid', 'temperature', 'carbonate',
        'acidizing', 'stimulation', 'conductivity', 'fracture', 'acid'],
       ['time', 'flow', 'simulation', 'production', 'pressure', 'data',
        'model', 'well', 'approach', 'reservoir'],
       ['high', 'foams', 'temperature', 'well', 'oil', 'pressure',
        'casing', 'hpht', 'drilling', 'cement'],
       ['gas', 'water', 'asphaltene', 'heavy', 'solvent', 'isc',
        'combustion', 'steam', 'sagd', 'oil'],
       ['well', 'fractures', 'reservoirs', 'reservoir', 'production',
        'model', 'shale', 'flow', 'fracture', 'gas'],
       ['performance', 'gauge', 'operations', 'drill', 'limiters',
        'hoop', 'sticking', 'borehole', 'practices', 'bit'],
       ['saturation', 'surfactants', 'water', 'fracture', 'wettability',
        'recovery', 'spontaneous', 'wet', 'imbibi

# Testing model.py code with tamu_database

In [23]:
def get_data(filename):
    """Load raw data from a file and return training data and responses.
    Parameters
    ----------
    filename: The path to a csv file containing the raw text data and response.
    Returns
    -------
    X: A numpy array containing the text fragments used for training.
    y: A numpy array containing labels, used for model response.
    """
    df_cleaned = database_cleaner(filename)

    # For nlp, only retaining faculty_name, research_areas, paper_titles, abstracts
    df_filtered = df_cleaned[['faculty_name', 'research_areas', 'paper_titles', 'abstracts']]
    missing = df_filtered['paper_titles'] == ''
    num_missing = sum(missing)
    print(f'{num_missing} faculties have missing papers in {filename}')
    print('Running nlp-pipeline on faculties with non-missing papers...')

    df_nlp = df_filtered[~missing]

    # Choosing abstracts to predict topics for a professor
    corpus = df_nlp['abstracts'].values
    vectorizer, matrix = feature_matrix(corpus, tf_idf=True, stem_lem=None, ngram_range=(1,1),
                                    max_df=1.0, min_df=1, max_features=None)

    return vectorizer, matrix

In [50]:
with open('../data/tamu_model.pkl', 'rb') as f:
    model = pickle.load(f)

In [51]:
vectorizer, matrix = get_data('../data/tamu_database.json')
model = MyModel(10)
y_pred = model.fit_predict(matrix)
y_pred

3 faculties have missing papers in ../data/tamu_database.json
Running nlp-pipeline on faculties with non-missing papers...


array([6, 7, 1, 6, 4, 4, 2, 6, 0, 5, 1, 3, 2, 4, 6, 4, 1, 7, 2, 2, 1, 2,
       4, 7, 4, 1, 2, 9, 4, 6, 2, 5, 1, 3, 8, 5, 1, 9, 1], dtype=int32)

In [52]:
model.top_n_features(vectorizer.vocabulary_, 10)

array([['al', 'geological', 'reservoirs', 'shale', 'mesh', 'gas',
        'hyperspectral', 'cliff', 'dolomite', 'bodies'],
       ['pressure', 'shale', 'wells', 'reservoirs', 'well', 'model',
        'data', 'production', 'gas', 'reservoir'],
       ['well', 'shale', 'reservoirs', 'reservoir', 'gas', 'production',
        'fractures', 'model', 'flow', 'fracture'],
       ['whirl', 'operations', 'torsional', 'well', 'laminated', 'wob',
        'practices', 'drillstring', 'drilling', 'bit'],
       ['model', 'fluid', 'steam', 'process', 'production', 'combustion',
        'sagd', 'phase', 'oil', 'gas'],
       ['basin', 'uruguay', 'fiscal', 'production', 'uruguayan',
        'frontier', 'unconventional', 'resources', 'basins', 'gas'],
       ['formation', 'temperature', 'proppant', 'carbonate', 'acidizing',
        'fluid', 'stimulation', 'conductivity', 'fracture', 'acid'],
       ['model', 'flow', 'well', 'gas', 'temperature', 'hpht',
        'drilling', 'casing', 'pressure', 'cement']

In [54]:
tamu_df = database_cleaner('../data/tamu_database.json')
missing = tamu_df['paper_titles'] == ''
tamu_df_not_missing = tamu_df[~missing]
tamu_df_not_missing = tamu_df_not_missing[['faculty_name', 'research_areas', 'paper_titles', 'abstracts']]
tamu_df_not_missing['predicted_research_areas'] = [top_ten_features[num] for num in y_pred]
tamu_df_not_missing[['faculty_name', 'research_areas', 'predicted_research_areas']]

Unnamed: 0,faculty_name,research_areas,predicted_research_areas
0,A. Daniel Hill,Dr. Hill has five patents in oil recovery and ...,"[reservoir, spacing, appraisal, drilling, obje..."
1,A. Rashid Hasan,Wellbore Heat transferSystematic modeling of h...,"[injection, gas, stresses, mechanical, rock, r..."
2,Akhil Datta-Gupta,Dr. Datta-Gupta has research interests in rapi...,"[recovery, pressure, gas, permeability, fractu..."
3,Albertus Retnanto,Field development and planning Production enha...,"[reservoir, spacing, appraisal, drilling, obje..."
4,Aziz Rahman,Flow assurance Multiphase pipe flow Wellbore h...,"[high, asp, reservoir, model, recovery, water,..."
5,Berna Hascakir,Heavy oil and oil shale recovery with enhanced...,"[high, asp, reservoir, model, recovery, water,..."
6,David Schechter,Spraberry Trend Area Geological and petrophysi...,"[data, rop, well, cement, model, pressure, fra..."
7,Ding Zhu,General production engineering Well stimulatio...,"[reservoir, spacing, appraisal, drilling, obje..."
8,Dominique Guerillot,Reservoir characterization and simulation Carb...,"[mnps, water, foams, adsorption, nanoparticles..."
9,Duane McVay,Risk and uncertainty assessment Unconventional...,"[responses, reservoir, connectivities, proxy, ..."


# Testing ut's model

In [34]:
with open('../data/ut_model.pkl', 'rb') as f:
    model = pickle.load(f)

In [35]:
vectorizer, matrix = get_data('../data/ut_database.json')
model = MyModel(10)
y_pred = model.fit_predict(matrix)
y_pred

0 faculties have missing papers in ../data/ut_database.json
Running nlp-pipeline on faculties with non-missing papers...


array([8, 1, 2, 4, 0, 6, 9, 9, 1, 2, 4, 1, 1, 0, 1, 5, 4, 9, 7, 4, 3, 8],
      dtype=int32)

In [36]:
top_ten_features = model.top_n_features(vectorizer.vocabulary_, 10)
top_ten_features

array([['mnps', 'water', 'foams', 'adsorption', 'nanoparticles',
        'permeability', 'model', 'methane', 'hydrate', 'pore'],
       ['recovery', 'pressure', 'gas', 'permeability', 'fracture',
        'reservoir', 'flow', 'model', 'oil', 'co2'],
       ['data', 'rop', 'well', 'cement', 'model', 'pressure', 'fracture',
        'circulation', 'wellbore', 'drilling'],
       ['behavior', 'es', 'oil', 'steam', 'edge', 'phase', 'chamber',
        'solvent', 'bitumen', 'sagd'],
       ['high', 'asp', 'reservoir', 'model', 'recovery', 'water', 'foam',
        'polymer', 'surfactant', 'oil'],
       ['responses', 'reservoir', 'connectivities', 'proxy', 'injector',
        'stacking', 'locations', 'channel', 'well', 'object'],
       ['reservoir', 'spacing', 'appraisal', 'drilling', 'objectives',
        'optimal', 'well', 'decisions', 'decision', 'geosteering'],
       ['injection', 'gas', 'stresses', 'mechanical', 'rock',
        'reservoir', 'stress', 'hydrate', 'co2', 'coal'],
       ['m

### Mapping back research interests for a faculty

In [45]:
ut_df = database_cleaner('../data/ut_database.json')
ut_df = ut_df[['faculty_name', 'research_areas', 'paper_titles', 'abstracts']]
ut_df['predicted_research_areas'] = [top_ten_features[num] for num in y_pred]
ut_df[['faculty_name', 'research_areas', 'predicted_research_areas']]

Unnamed: 0,faculty_name,research_areas,predicted_research_areas
0,Carlos Torres-Verdin,Static and Dynamic Formation Evaluation Boreho...,"[method, neutron, borehole, properties, logs, ..."
1,David DiCarlo,Chemical EOR Gas Enhanced Oil Recovery Geologi...,"[recovery, pressure, gas, permeability, fractu..."
2,Eric van Oort,Drilling Well Completions and Rock Mechanics; ...,"[data, rop, well, cement, model, pressure, fra..."
3,Gary Pope,Environmental Engineering; Natural Gas Enginee...,"[high, asp, reservoir, model, recovery, water,..."
4,Hugh Daigle,Drilling Well Completions and Rock Mechanics; ...,"[mnps, water, foams, adsorption, nanoparticles..."
5,J.Eric Bickel,Decision and risk analysis economics value of ...,"[reservoir, spacing, appraisal, drilling, obje..."
6,John Foster,Rock Mechanics; Fundamental Processes; Natural...,"[peridynamics, results, fluid, proppant, model..."
7,Jon Olson,Reservoir Geomechanics Hydraulic Fracturing In...,"[peridynamics, results, fluid, proppant, model..."
8,Kamy Sepehrnoori,Computational Methods Reservoir Simulation Dev...,"[recovery, pressure, gas, permeability, fractu..."
9,Kenneth Gray,MSE technologies and ROP models,"[data, rop, well, cement, model, pressure, fra..."


# Testing stanford's model

In [46]:
with open('../data/stanford_model.pkl', 'rb') as f:
    model = pickle.load(f)

In [47]:
vectorizer, matrix = get_data('../data/stanford_database.json')
model = MyModel(10)
y_pred = model.fit_predict(matrix)
y_pred

0 faculties have missing papers in ../data/stanford_database.json
Running nlp-pipeline on faculties with non-missing papers...


array([2, 1, 5, 8, 0, 2, 1, 9, 7, 4, 3, 6], dtype=int32)

In [48]:
model.top_n_features(vectorizer.vocabulary_, 10)

array([['transport', 'reactive', 'pe', 'manufacturing', 'continuum',
        'advection', 'multiscale', 'reaction', 'pore', 'scale'],
       ['simulation', 'injection', 'isc', 'reservoir', 'gas', 'recovery',
        'steam', 'foam', 'combustion', 'oil'],
       ['solar', 'operations', 'system', 'capture', 'model', 'gas',
        'tpwl', 'energy', 'optimization', 'co2'],
       ['isotope', 'mtds', 'slope', 'syncline', 'marine', 'climate',
        'songliao', 'terrestrial', 'basin', 'cretaceous'],
       ['fuel', 'energy', 'control', 'engine', 'orc', 'charge',
        'vehicle', 'model', 'aging', 'battery'],
       ['uncertainty', 'stochastic', 'computational', 'equations',
        'model', 'noise', 'diffusion', 'coupling', 'moments', 'random'],
       ['parameters', 'inverse', 'model', 'reservoir', 'stress', 'panel',
        'lapse', 'simpleware', 'data', 'seismic'],
       ['imbibition', 'pressure', 'storage', 'scale', 'injection',
        'core', 'leakage', 'capillary', 'permeability'

In [49]:
stanford_df = database_cleaner('../data/stanford_database.json')
stanford_df = stanford_df[['faculty_name', 'research_areas', 'paper_titles', 'abstracts']]
stanford_df['predicted_research_areas'] = [top_ten_features[num] for num in y_pred]
stanford_df[['faculty_name', 'research_areas', 'predicted_research_areas']]

Unnamed: 0,faculty_name,research_areas,predicted_research_areas
0,Adam Brandt,G r e e n h o u s e g a s e m i s s i o n ...,"[data, rop, well, cement, model, pressure, fra..."
1,Anthony Kovscek,I a m i n t e r e s t e d i n t h e ...,"[recovery, pressure, gas, permeability, fractu..."
2,Daniel Tartakovsky,E n v i r o n m e n t a l f l u i d m e c ...,"[responses, reservoir, connectivities, proxy, ..."
3,Hamdi Tchelepi,C u r r e n t r e s e a r c h a c t i v i ...,"[method, neutron, borehole, properties, logs, ..."
4,Ilenia Battiato,E n e r g y a n d e n v i r o n m e n t ...,"[mnps, water, foams, adsorption, nanoparticles..."
5,Louis Durlofsky,G e n e r a l r e s e r v o i r s i m u l ...,"[data, rop, well, cement, model, pressure, fra..."
6,Margot Gerritsen,I s p e c i a l i z e i n r e n e w a b ...,"[recovery, pressure, gas, permeability, fractu..."
7,Roland Horne,"W e l l T e s t i n g , O p t i m i s a t ...","[peridynamics, results, fluid, proppant, model..."
8,Sally Benson,M y r e s e a r c h i s f o c u s e d ...,"[injection, gas, stresses, mechanical, rock, r..."
9,Simona Onori,"M o d e l i n g , c o n t r o l a n d o ...","[high, asp, reservoir, model, recovery, water,..."


# Combining the three dfs:

In [108]:
from combine_databases import add_database
from model import get_data, MyModel

current_db_path = '../data/ut_database.json'
new_db_paths = ['../data/stanford_database.json', '../data/tamu_database.json']
combined_db_path = '../data/pge_database.json'
add_database(current_db_path, new_db_paths, combined_db_path)

In [100]:
vectorizer, matrix = get_data(combined_db_path)
model = MyModel(13)
y_pred = model.fit_predict(matrix)

0 faculties have missing papers in ../data/pge_database.json
Running nlp-pipeline on faculties with non-missing papers...


In [101]:
y_pred

array([ 7, 10,  5,  2,  7,  5, 10,  5,  0,  6,  1, 10,  7, 10, 10, 10,  4,
        4,  1, 10, 10,  8,  7, 10, 10,  6,  2,  7,  9,  4,  8, 10, 10,  2,
        3, 10,  3,  4,  1, 10,  8,  4,  4,  5,  5,  8, 10, 10,  2,  2, 10,
        1,  3,  8,  8,  5,  4, 10, 11,  1, 10, 10,  5,  8,  4,  0,  8,  4,
       10,  2, 10,  8, 12,  7,  0], dtype=int32)

In [102]:
top_ten_features = model.top_n_features(vectorizer.vocabulary_, 10)
top_ten_features

array([['rich', 'samples', 'logs', 'resistivity', 'measurements', 'rock',
        'properties', 'organic', 'nmr', 'kerogen'],
       ['phase', 'model', 'recovery', 'permeability', 'water', 'co2',
        'polymer', 'foam', 'surfactant', 'oil'],
       ['reservoirs', 'seismic', 'gas', 'geosteering', 'model',
        'approach', 'production', 'data', 'well', 'reservoir'],
       ['flow', 'propagation', 'fluid', 'fracturing', 'model',
        'proppant', 'stress', 'hydraulic', 'fractures', 'fracture'],
       ['mud', 'fluid', 'data', 'wellbore', 'well', 'casing', 'pressure',
        'bit', 'cement', 'drilling'],
       ['recovery', 'co2', 'solvent', 'bitumen', 'isc', 'gas', 'sagd',
        'combustion', 'steam', 'oil'],
       ['diffusion', 'continuum', 'transport', 'advection', 'multiscale',
        'moments', 'random', 'reaction', 'coupling', 'scale'],
       ['temperature', 'proppant', 'treatment', 'wormhole', 'carbonate',
        'acidizing', 'stimulation', 'conductivity', 'fracture',

In [103]:
pge_df = database_cleaner('../data/pge_database.json')
pge_df = pge_df[['faculty_name', 'research_areas', 'paper_titles', 'abstracts']]
pge_df['predicted_research_areas'] = [top_ten_features[num] for num in y_pred]
pge_df[['faculty_name', 'research_areas', 'predicted_research_areas']]

Unnamed: 0,faculty_name,research_areas,predicted_research_areas
0,A. Daniel Hill,Dr. Hill has five patents in oil recovery and ...,"[temperature, proppant, treatment, wormhole, c..."
1,A. Rashid Hasan,Wellbore Heat transferSystematic modeling of h...,"[reservoirs, pressure, oil, shale, production,..."
2,Adam Brandt,G r e e n h o u s e g a s e m i s s i o n ...,"[recovery, co2, solvent, bitumen, isc, gas, sa..."
3,Akhil Datta-Gupta,Dr. Datta-Gupta has research interests in rapi...,"[reservoirs, seismic, gas, geosteering, model,..."
4,Albertus Retnanto,Field development and planning Production enha...,"[temperature, proppant, treatment, wormhole, c..."
5,Anthony Kovscek,I a m i n t e r e s t e d i n t h e ...,"[recovery, co2, solvent, bitumen, isc, gas, sa..."
6,Aziz Rahman,Flow assurance Multiphase pipe flow Wellbore h...,"[reservoirs, pressure, oil, shale, production,..."
7,Berna Hascakir,Heavy oil and oil shale recovery with enhanced...,"[recovery, co2, solvent, bitumen, isc, gas, sa..."
8,Carlos Torres-Verdin,Static and Dynamic Formation Evaluation Boreho...,"[rich, samples, logs, resistivity, measurement..."
9,Daniel Tartakovsky,E n v i r o n m e n t a l f l u i d m e c ...,"[diffusion, continuum, transport, advection, m..."


In [124]:
from model import MyModel

with open('../data/pge_model.pkl', 'rb') as f:
        model = pickle.load(f)

with open('../data/pge_vectorizer.pkl', 'rb') as f:
    vectorizer = pickle.load(f)

In [125]:
model.top_n_features(vectorizer.vocabulary_, 10)

array([['bubble', 'flash', 'algorithm', 'asphaltene', 'shale',
        'placement', 'pore', 'confinement', 'nm', 'condensate'],
       ['permeability', 'propagation', 'proppant', 'conductivity',
        'fracturing', 'stress', 'hydraulic', 'acid', 'fractures',
        'fracture'],
       ['numerical', 'stimulation', 'hydraulic', 'unconventional',
        'reserves', 'permeability', 'fractures', 'wells', 'fracture',
        'shale'],
       ['object', 'connectivities', 'resistivity', 'porosity', 'pod',
        'co2', 'seismic', 'optimization', 'crm', 'archie'],
       ['stress', 'safety', 'temperature', 'mud', 'operations', 'bit',
        'wellbore', 'casing', 'cement', 'drilling'],
       ['imbibition', 'fracture', 'capillary', 'relative', 'pore',
        'foam', 'saturation', 'injection', 'permeability', 'co2'],
       ['adsorption', 'rich', 'nmr', 'samples', 'permeability',
        'multiscale', 'shale', 'kerogen', 'organic', 'pore'],
       ['product', 'electricity', 'design', 'sola

In [None]:
new_stopwords = ['commonly', 'used', 'either',  ]