In [1]:
from gensim.models import LdaModel
import pickle as pkl
import numpy as np
# # import matplotlib.colors as mcolors
from nltk.corpus import stopwords
from wordcloud import WordCloud #, STOPWORDS
from matplotlib import pyplot as plt
# # import matplotlib.colors as mcolors
# # from tqdm import tqdm
# # # import pandas as pd
# # # import matplotlib.pyplot as plt

In [2]:
# Optimal number of topics.
# This value selects which trained model file is loaded and how many topics
# are pulled from the model and plotted further down.
nTopics = 30

In [3]:
# Named palette of hex colors to associate with the topics (looked up by position).
custom_colors = {
    'burlywood': '#DEB887',
    'chocolate': '#D2691E',
    'crimson': '#DC143C',
    'darkgreen': '#006400',
    'darkorange': '#FF8C00',
    'darkslategrey': '#2F4F4F',
    'deepskyblue': '#00BFFF',
    'dimgray': '#696969',
    'firebrick': '#B22222',
    'gold': '#FFD700',
    'goldenrod': '#DAA520',
    'lawngreen': '#7CFC00',
    'lightcoral': '#F08080',
    'lightpink': '#FFB6C1',
    'mediumvioletred': '#C71585',
    'orangered': '#FF4500',
    'orchid': '#DA70D6',
    'royalblue': '#4169E1',
    'slateblue': '#6A5ACD',
    'springgreen': '#00FF7F',
    'steelblue': '#4682B4',
    'teal': '#008080',
    'turquoise': '#40E0D0',
    'yellow': '#FFFF00',
    'blueviolet': '#8A2BE2',
    'yellowgreen': '#9ACD32',
}


def _hex_to_rgb(hex_code):
    """Convert a '#RRGGBB' string into an (R, G, B) tuple of 0-255 integers."""
    return tuple(bytes.fromhex(hex_code.lstrip('#')))


# Same palette as a list of integer RGB tuples, in the dict's insertion order.
colorlist = [_hex_to_rgb(hex_code) for hex_code in custom_colors.values()]

# Load Data

In [4]:
# Restore the trained LDA model that was saved for the chosen number of topics
model_path = f'trained_models/trained_lda_model_{nTopics}'
lda_model = LdaModel.load(model_path)

In [5]:
# Read the saved topic distributions; the file name is keyed by the
# loaded model's topic count
distributions_path = f'data/topic_distributions_{lda_model.num_topics}.npy'
topic_distributions = np.load(distributions_path)

In [6]:
# Load raw corpus dataframe
# NOTE: unpickling can execute arbitrary code embedded in the file, so only
# load a raw_corpus.pkl that was produced by a trusted source.
with open('data/raw_corpus.pkl', 'rb') as f:
    corpus_df = pkl.load(f)

In [7]:
# Pull the 20 highest-ranked words of each topic in unformatted form
# (the per-topic word entries are turned into a dict for the word clouds later)
topics = lda_model.show_topics(num_topics=nTopics, num_words=20, formatted=False)

In [8]:
# Define topic names
# These are hand-written labels, indexed by topic number when the figures are titled.
# NOTE(review): this list holds 21 names while nTopics is set to 30 and the
# plotting loop indexes it for every topic -- presumably the names were written
# for an earlier model with fewer topics; confirm and extend/relabel.
# NOTE(review): '????' looks like a placeholder for a topic that was never named.
topic_names = ['Reservoir Modeling',
               'Quantitative Methods',
               'Water Quality Monitoring',
               'Water Resource Management',
               'Climate Change',
               'Transport & Tracers',
               'Sediment and Erosion',
               'Land Surface',
               'Hydrogeology',
               'Soil Moisture',
               'Snow Hydrology',
               'Water Quality Remediation',
               'Groundwater',
               'Channel Flow',
               'Vadose Zone',
               'Watershed Hydrology',
               'Modeling',
               '????',
               'Biochemical',
               'Floods',
               'Pollutant runoff'
              ]

# Create WordClouds and Trend Plots

In [9]:
# Build the yearly time series: for each distinct year, sum the topic
# distributions of that year's documents and normalise the row by the
# year's grand total.
years = np.unique(corpus_df['Year'])
topic_distributions_by_year = np.zeros((len(years), lda_model.num_topics))
for row, yr in enumerate(years):
    docs_in_year = topic_distributions[corpus_df['Year'] == yr, :]
    per_topic_total = docs_in_year.sum(axis=0)
    topic_distributions_by_year[row, :] = per_topic_total / docs_in_year.sum()

In [10]:
# Create the WordCloud renderer that is reused for every topic panel.
stop_words = stopwords.words('english')
cloud = WordCloud(stopwords=stop_words,
                  background_color='white',
                  width=3500,
                  height=3500,
                  max_words=15,
                  # Every word is drawn in the colour of the topic currently being
                  # plotted. `t` is NOT defined in this cell: it is the global loop
                  # index of the plotting loop and is looked up when the cloud is
                  # generated. The index wraps around the palette so that having
                  # more topics than colours reuses colours instead of raising
                  # IndexError.
                  color_func=lambda *args, **kwargs: colorlist[t % len(colorlist)],
                  prefer_horizontal=1.0)

In [None]:
# Plot wordclouds and scaled time series.
# One grid row per topic: the word cloud takes the first column and the yearly
# trend line spans the remaining two columns.
palette = list(custom_colors.values())  # built once instead of on every iteration

fig = plt.figure(figsize=(12, 3*nTopics))
gs = fig.add_gridspec(nTopics, 3)
# NOTE: the loop index must keep the name `t`; the WordCloud color_func reads
# the global `t` to pick the colour of the cloud being generated.
for t in range(nTopics):

    # There may be more topics than hand-written names or palette entries:
    # fall back to a generic title and reuse colours rather than raising IndexError.
    title = topic_names[t] if t < len(topic_names) else f'Topic {t}'
    line_color = palette[t % len(palette)]

    # plot time series
    ax1 = fig.add_subplot(gs[t, 1:])
    ax1.plot(years, topic_distributions_by_year[:, t], color=line_color, linewidth=8)
    ax1.set_title(title)
    ax1.set_xticks(years[4::5])  # label every fifth year, starting at the fifth one
    ax1.set_ylabel('Popularity')
    ax1.grid()

    # plot wordclouds
    ax0 = fig.add_subplot(gs[t, 0])
    topic_words = dict(topics[t][1])  # word entries of topic t as a frequency dict
    cloud.generate_from_frequencies(topic_words, max_font_size=1500)
    ax0.imshow(cloud)
    ax0.axis('off')

fig.tight_layout()

# Save figure
fig.savefig('figures/wordclouds_and_trends.png')

In [None]:
# Plot relative time series: every topic's yearly share on one shared axis.
# plt.subplots (not plt.figure) is what returns the (figure, axes) pair.
fig, ax = plt.subplots(figsize=(10, 5), dpi=80, facecolor='w', edgecolor='k')

# NOTE(review): this cell originally referenced `tableau20`,
# `popularity_all_journals` and `labels`, none of which are defined in this
# notebook. They are replaced with the closest equivalents that are defined
# (`colorlist`, `topic_distributions_by_year`, `topic_names`) -- confirm that
# these are the intended inputs.
for i in range(lda_model.num_topics):
    # colorlist holds 0-255 integer RGB tuples; matplotlib expects 0-1 floats.
    # Wrap around the palette when there are more topics than colours.
    r, g, b = colorlist[i % len(colorlist)]
    pltcolor = (r / 255., g / 255., b / 255.)
    # Fall back to a generic label for topics without a hand-written name.
    label = topic_names[i] if i < len(topic_names) else f'Topic {i}'
    ax.plot(years, topic_distributions_by_year[:, i],
            color=pltcolor, linewidth=3, label=label)

# Take the x-range from the data instead of the hard-coded [1, 47].
ax.set_xlim([years[0], years[-1]])
ax.set_xticks(years[5::5])
# Tick label size is set through tick_params; set_xticks/set_yticks only accept
# text properties together with explicit labels.
ax.tick_params(axis='both', labelsize=14)
# ax.set_xlabel('Year of Publication', fontsize=20)
ax.legend()
ax.set_ylabel('Popularity', fontsize=20)
ax.set_title('Relative Popularity of Topics', fontsize=28)
ax.grid()

# Save figure
fig.savefig('figures/relative_topic_trends.png')