In [105]:
from cleaning import database_cleaner
from nlp_pipeline import feature_matrix

import string
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer

from sklearn.utils import shuffle
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import log_loss, confusion_matrix
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

from scipy.spatial.distance import pdist, squareform
from scipy.cluster.hierarchy  import linkage, dendrogram
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples

In [90]:
# Load the TAMU faculty records through the project's cleaning helper and
# preview the first rows.
# NOTE(review): the rendered preview shows faculty emails and phone numbers;
# consider clearing this output before sharing the notebook.
tamu_df = database_cleaner('../data/tamu_database.json')
tamu_df.head()

Unnamed: 0,faculty_name,email,google_scholar_link,office,page,phone,faculty_title,paper_titles,abstracts,research_areas
0,A. Daniel Hill,danhill@tamu.edu,https://scholar.google.com/citations?user=EBnW...,RICH 1012,https://engineering.tamu.edu/petroleum/profile...,979-845-2244,Professor,Mechanism of wormholing and its optimal condi...,Acid stimulation is commonly used in carbonat...,Dr. Hill has five patents in oil recovery and ...
1,A. Rashid Hasan,rhasan@tamu.edu,https://scholar.google.com/citations?user=6lMX...,RICH 501E,https://engineering.tamu.edu/petroleum/profile...,979.847.8564,Professor,,,Wellbore Heat transferSystematic modeling of h...
2,Akhil Datta-Gupta,datta-gupta@tamu.edu,https://scholar.google.com/citations?user=Al-S...,RICH 401G,https://engineering.tamu.edu/petroleum/profile...,979-847-9030,University Distinguished Professor,Radius of Investigation and its Generalizatio...,The concept of radius of investigation is fun...,Dr. Datta-Gupta has research interests in rapi...
3,Albertus Retnanto,albertus.retnanto@qatar.tamu.edu,https://scholar.google.com/citations?user=kN7P...,204K,https://engineering.tamu.edu/petroleum/profile...,974-4423-0281,Associate Professor of the Practice,After-Closure Idiosyncrasies of Fracture C...,"Fracture Calibration Tests (FCT), are stra...",Field development and planning Production enha...
4,Aziz Rahman,aziz.rahman@qatar.tamu.edu,https://scholar.google.com/citations?user=PYRt...,204E,https://engineering.tamu.edu/petroleum/profile...,974-4423-0601,Assistant Professor,,,Flow assurance Multiphase pipe flow Wellbore h...


In [91]:
# Keep only the columns the NLP steps read: the faculty name plus the three
# free-text fields (research areas, paper titles, abstracts).
df = tamu_df.loc[
    :, ['faculty_name', 'research_areas', 'paper_titles', 'abstracts']
]
df.head()

Unnamed: 0,faculty_name,research_areas,paper_titles,abstracts
0,A. Daniel Hill,Dr. Hill has five patents in oil recovery and ...,Mechanism of wormholing and its optimal condi...,Acid stimulation is commonly used in carbonat...
1,A. Rashid Hasan,Wellbore Heat transferSystematic modeling of h...,,
2,Akhil Datta-Gupta,Dr. Datta-Gupta has research interests in rapi...,Radius of Investigation and its Generalizatio...,The concept of radius of investigation is fun...
3,Albertus Retnanto,Field development and planning Production enha...,After-Closure Idiosyncrasies of Fracture C...,"Fracture Calibration Tests (FCT), are stra..."
4,Aziz Rahman,Flow assurance Multiphase pipe flow Wellbore h...,,


In [92]:
# Flag the rows whose paper_titles field is an empty string, then count them.
missing = df['paper_titles'].eq('')
int(missing.sum())

16

In [93]:
# Keep only the rows that have paper titles (26 faculty members here).
df_nlp = df.loc[~missing]
df_nlp.shape[0]

26

In [94]:
# Preview the filtered frame; the original row labels are kept, so the index has gaps.
df_nlp.head()

Unnamed: 0,faculty_name,research_areas,paper_titles,abstracts
0,A. Daniel Hill,Dr. Hill has five patents in oil recovery and ...,Mechanism of wormholing and its optimal condi...,Acid stimulation is commonly used in carbonat...
2,Akhil Datta-Gupta,Dr. Datta-Gupta has research interests in rapi...,Radius of Investigation and its Generalizatio...,The concept of radius of investigation is fun...
3,Albertus Retnanto,Field development and planning Production enha...,After-Closure Idiosyncrasies of Fracture C...,"Fracture Calibration Tests (FCT), are stra..."
5,Berna Hascakir,Heavy oil and oil shale recovery with enhanced...,Water and aromatics fraction interaction at e...,Performance predictions of the In-Situ Combus...
6,David Schechter,Spraberry Trend Area Geological and petrophysi...,Gas Injection for EOR in Organic Rich Shales....,Laboratory experiments of gas injection in or...


# NLP pipeline

In [110]:
# One document per remaining faculty member: the raw abstracts column.
corpus = df_nlp.loc[:, 'abstracts'].values

In [111]:
# Build the feature matrix from the abstracts with the project helper.
# NOTE(review): the flags below presumably select TF-IDF weighting on unigrams
# with no stemming/lemmatization and no vocabulary pruning — confirm against
# nlp_pipeline.feature_matrix.
vectorizer, matrix = feature_matrix(
    corpus,
    tf_idf=True,
    stem_lem=None,
    ngram_range=(1, 1),
    max_df=1.0,
    min_df=1,
    max_features=None,
)

# Clustering to find research topics
* Applying k-means to TF-IDF or bag-of-words features produces topic centroids.
* For this sprint, we will perform topic modeling on the abstracts using k-means and hierarchical clustering.

In [112]:
# Cluster the document vectors into 10 groups with k-means.
# random_state pins the centroid initialisation so that Restart & Run All
# reproduces the same labels and centroids; n_init=10 is spelled out because
# the library default for it differs between scikit-learn releases.
X = matrix
km = KMeans(n_clusters=10, n_init=10, random_state=42)
y = km.fit_predict(X)

In [120]:
# Cluster label (centroid index) assigned to each faculty member
y

array([3, 4, 3, 2, 9, 3, 5, 0, 4, 6, 1, 1, 3, 5, 7, 6, 1, 5, 0, 5, 6, 2,
       4, 5, 9, 8], dtype=int32)

In [113]:
# The "topics" k-means has discovered, i.e. the cluster centroids:
# one row per cluster, one column per feature of the matrix.
centroids = km.cluster_centers_
centroids

array([[0.00657833, 0.00238547, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.0022097 , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.00738742, 0.        , 0.00353283, ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.06116011, 0.03058006,
        0.03058006],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [114]:
# (number of clusters, number of feature columns)
centroids.shape

(10, 10028)

In [115]:
# A topic is best described by its heaviest terms, i.e. the feature
# dimensions carrying the largest weight in the centroid.
# Sort the feature indices by weight within each centroid and keep the last ten.
indices = centroids.argsort(axis=1)
top_ten_indices = indices[:, -10:]
top_ten_indices  # ascending: the last column is the heaviest term

array([[2550, 8093, 9435, 7770, 9447, 2502, 1143, 7739, 7173, 4050],
       [7741, 4115, 3860, 8224, 7740, 3850, 9847, 4512, 7173, 4050],
       [4050, 9807,  966, 4348, 8450, 5072, 1903, 8644, 7955, 6396],
       [9942, 9318, 3860, 9029, 1542,  507, 8678, 2079, 3967,  503],
       [9177, 3850, 8337, 7173, 7099, 2502, 5947, 9847,  880, 7740],
       [9777, 5672, 7740, 7741, 3969, 3850, 8224, 5947, 4050, 3967],
       [9610, 9847, 6441, 1573, 7016, 7099, 1268, 4483, 1621, 3078],
       [7979,  818, 4525, 4203, 7376, 2502,  592, 6707, 4072, 7336],
       [7129, 9817, 8415, 6963, 4117, 3831, 5178, 4368, 7757,  906],
       [8006, 8887, 9807, 3967, 9864, 7536, 8558, 9862, 4606, 6396]])

In [117]:
# Translate the column indices back into words. The vectorizer stores a
# term -> column mapping, so invert it to look terms up by column.
reverse_vocab = {column: term for term, column in vectorizer.vocabulary_.items()}

# Look up every index in row-major order, then restore one row per centroid.
top_ten_features = np.array(
    [reverse_vocab[column] for column in top_ten_indices.ravel()]
).reshape(len(centroids), -1)
top_ten_features

array([['decline', 'sec', 'uncertainty', 'resources', 'unconventional',
        'data', 'basins', 'reserves', 'production', 'gas'],
       ['reservoirs', 'geomechanics', 'fluid', 'shale', 'reservoir',
        'flow', 'well', 'hydrate', 'production', 'gas'],
       ['gas', 'water', 'asphaltene', 'heavy', 'solvent', 'isc',
        'combustion', 'steam', 'sagd', 'oil'],
       ['wormhole', 'treatment', 'fluid', 'temperature', 'carbonate',
        'acidizing', 'stimulation', 'conductivity', 'fracture', 'acid'],
       ['time', 'flow', 'simulation', 'production', 'pressure', 'data',
        'model', 'well', 'approach', 'reservoir'],
       ['vug', 'matrix', 'reservoir', 'reservoirs', 'fractures', 'flow',
        'shale', 'model', 'gas', 'fracture'],
       ['used', 'well', 'operations', 'casing', 'practices', 'pressure',
        'bit', 'hpht', 'cement', 'drilling'],
       ['sand', 'anionic', 'hydrogels', 'grains', 'quartz', 'data',
        'adsorption', 'pec', 'gel', 'pvs'],
       ['princ

In [None]:

# NOTE(review): get_data and MyModel are neither imported nor defined in the
# visible part of this notebook, and the cell has no execution count, so this
# looks like a sketch of an intended interface rather than runnable code — confirm.
# If run, it would also rebind `vectorizer` and `matrix` from the cells above.
vectorizer, matrix = get_data('../data/tamu_database.json')
model = MyModel(n=10)
y_pred = model.fit_predict(matrix)