In [1]:
import pickle as pkl
from gensim.models import LdaModel
import numpy as np

In [2]:
# Optimal number of topics
nTopics = 45

# Load Data

In [3]:
# Load model
lda_model = LdaModel.load(f'trained_models/trained_lda_model_new_{nTopics}')

In [4]:
# Load topic distributions
topic_distributions = np.load(f'data/topic_distributions_broad_{lda_model.num_topics}.npy')

In [5]:
# Load raw corpus dataframe
with open('data/cleaned_corpus_broad.pkl', 'rb') as f:
    corpus = pkl.load(f)

In [6]:
# load raw corpus dataframe
with open('data/raw_corpus_broad.pkl', 'rb') as f:
    corpus_df = pkl.load(f)

# Subjective Topic Naming

In [7]:
# look at top key words in each topic
topics = lda_model.show_topics(formatted=False, num_topics=nTopics, num_words=20)
topics

[(0,
  [('sample', 0.074252866),
   ('water', 0.06926924),
   ('quality', 0.023564577),
   ('analysis', 0.020014973),
   ('site', 0.019820834),
   ('detect', 0.011719235),
   ('different', 0.01171241),
   ('identify', 0.011492041),
   ('indicator', 0.011341908),
   ('collect', 0.011238112),
   ('use', 0.010611661),
   ('type', 0.010336199),
   ('reserve', 0.010050453),
   ('group', 0.009650041),
   ('right', 0.008674019),
   ('source', 0.008312369),
   ('datum', 0.007989801),
   ('contamination', 0.007851664),
   ('high', 0.007746825),
   ('level', 0.0075760307)]),
 (1,
  [('sediment', 0.18346079),
   ('particle', 0.062737994),
   ('size', 0.034225166),
   ('transport', 0.033228543),
   ('suspend', 0.028920319),
   ('load', 0.020857766),
   ('sand', 0.01915368),
   ('bed', 0.017501064),
   ('deposition', 0.015249698),
   ('fine', 0.0123372795),
   ('concentration', 0.011736919),
   ('material', 0.010564002),
   ('rate', 0.010298336),
   ('distribution', 0.00928985),
   ('erosion', 0.00

In [8]:
# Look at top papers in each topic - must be done outside jupyter
for topic in range(nTopics):
    topic_idx = (-topic_distributions[:,topic]).argsort()
    corpus_df.loc[topic_idx[:100],'Title'].to_csv(f'investigate_topics/30topics_{nTopics}_topic_{topic}.csv')

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]
  after removing the cwd from sys.path.


In [9]:
# Topic 0: Seasonal Variability
# Topic 1: Precipitation Observations
# Topic 2: Hydrological Processes
# Topic 3: Land Surface Fluxes
# Topic 4: Systems Hydrology
# Topic 5: Water Scarcity & Impacts
# Topic 6: Floods
# Topic 7: Water Resources Management
# Topic 8: Ecohydrology
# Topic 9: Rainfall-Runoff
# Topic 10: Climate Change & Impacts
# Topic 11: Hydrological Modeling
# Topic 12: Water Quality
# Topic 13: Soil Moisture
# Topic 14: Solute Transport
# Topic 15: Estuarine & Coastal Processes
# Topic 16: Watershed Hydrology
# Topic 17: Forecasting
# Topic 18: Sediment Erosion & Transport
# Topic 19: Spatial & Temporal Variability
# Topic 20: Statistical Methods
# Topic 21: Hydrogeology
# Topic 22: Vadose Zone
# Topic 23: Modeling & Optimization
# Topic 24: Snowmelt Hydrology
# Topic 25: Land Cover
# Topic 26: Glaciology 
# Topic 27: Stochastic Processes
# Topic 28: Groundwater Storage & Recharge
# Topic 29: Quantitative Methods

In [None]:
#30 topics
#[
'Droughts & Impacts',
'Precipitation Observations',
'Hydrological Processes',
'Land Surface Fluxes',
'Water Resources Management',
'Groundwater & Aquifers',
'Rainfall-Runoff',
'Agriculture & Irrigation',
'Ecohydrology',
'Rivers and Streamflow',
'Climate Change & Impacts',
'Numerical Methods & Solutions',
'Water Quality',
'Soil Moisture',
'Solute Transport',
'Estuarine & Coastal Processes',
'Watershed Hydrology',
'Forecasting',
'Sediment Erosion & Transport',
'Spatial & Temporal Variability',
'Data Sampling & Collection',
'Surface-GW Interactions',
'Slopes & Drainage',
'Temperature & Heat Fluxes',
'Snowmelt Hydrology',
'Land Cover',
'Glaciology', 
'Stochastic Processes',
'Groundwater Tracing & Recharge',
'Quantitative Methods & Modeling',
]

In [None]:
#45 topics
#topic_names = [
'Water Quality',
'Sediment Transport',
'Wastewater Treatment',
'Flood Risk & Assessment',
'Hydrogeology',
'Coastal Hydrology', 
'River Flow',
'Wetland & Ecology',
'Runoff Quality',
'Rainfall-Runoff',
'Urban Drainage',
'Systems Hydrology',
'Surface-GW Interactions',
'Irrigation Water Management',
'Drought & Water Scarcity',
'Climate Change Impacts',
'Gauginng & Monitoring',
'Forecasting',
'Glaciology',
'Salinity',
'Peatlands Mapping & Monitoring',
'Spatial Variability',
'Land Surface Flux',
'Solute Transport',
'Water Resources Management',
'Numerical Modeling',
'Hydrochemistry',
'Pollutant Removal',
'Groundwater Recharge',
'Uncertainty',
'Land Cover',
'Modeling & Calibration',
'Soil Moisture',
'Water Storage & Budgeting',
'Aquifers & Abstraction',
'Microbiology',
'Streamflow',
'Erosion',
'Dynamic Processes',
'Temporal Variability',
'Spatial Variability of Precipitation',
'Rainfall Intensity & Measurement',
'Watershed Hydrology',
'Hydraulics',
'Quantitative Analysis',
]
