## Preprocess

In [None]:
%run 04-preprocessing.ipynb

In [None]:
# Words handed to preprocess() through `to_ignore` in the next cell.
ignore_words = [
    "finding",
    "research",
    "purpose",
    "study",
    "methodology",
    "result",
    "analysis",
    "method",
    "paper",
    "literature",
    "innovation",
    "also",
    "within",
    "whereas",
    "would",
    "br",
    "elsevier",
    "data",
]

In [None]:
# Load the Scopus export. The path is passed directly (instead of an already
# opened file object) so that pandas opens the file with the requested
# encoding and closes it again; the previous open() handle was never closed
# and was opened with the platform default encoding.
data = pd.read_json('data/scopus_data_2019-09-05-093052.json', encoding='utf-8')

# preprocess() is not defined in this notebook (it is expected to come from the
# %run of 04-preprocessing.ipynb). It returns the token lists, the preprocessed
# frame and a bigram lexicon; `save_to_file` presumably also writes the
# preprocessed frame to that CSV -- TODO confirm.
documents_tokens, data_preprocessed, bigram_lexicon = preprocess(data, to_ignore=ignore_words, save_to_file='tmp_preprocessed.csv')
print('Documents kept after preprocessing: {}'.format(len(documents_tokens)))

# Persist the tokens so that they can be reused without preprocessing again.
with open('Preprocess/tokens.pkl', 'wb') as handle:
    pickle.dump(documents_tokens, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Basic statistics

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

### Word Frequencies

Find most frequent words in the corpus.

In [None]:
# word_frequencies() is not defined in this notebook (presumably it is loaded by
# one of the %run notebooks). Its result is used in the next cell through
# most_common().
fdist = word_frequencies(documents_tokens)

Change `show_top_n` to adjust the number of words to display.

In [None]:
show_top_n = 30

# The `show_top_n` most frequent words, reversed so that the most frequent
# word is drawn as the topmost bar.
mc = fdist.most_common(show_top_n)
mc = mc[::-1]
ws, fs = zip(*mc)
# Bar labels have the form "word (count)".
ws = ['{} ({})'.format(w, f) for w, f in mc]

# The figure height grows with the number of displayed words.
plt.figure(figsize=(3, 6*show_top_n/30))
plt.box(False)
plt.tick_params(top=False, bottom=True, left=False, right=False, labelleft=True, labelbottom=True)
plt.barh(range(len(ws)), fs)
plt.yticks(range(len(ws)), ws)
plt.ylim(-1, len(ws));

### Context of words

Find common contexts (co-occurring words) where the words from the list appear.

In [None]:
# Words whose contexts are looked up.
word_list = ['hotel', 'technology']

# word_contexts() is defined outside this notebook; `num=20` presumably limits
# the number of contexts reported per word -- TODO confirm.
context = word_contexts(documents_tokens, word_list, num=20)

## Topic discovery

In [None]:
%run "Topic Modeling/pyldavis.ipynb"

### Show topics and most frequently used words in each topic
Set `num_topics` to the expected number of topics in the corpus.

Set `num_words` to control the number of ***most frequent*** words listed for each topic.

In [None]:
# Settings for the topic model of the whole corpus; `num_topics` is reused by
# later cells (yearly counts, topic keywords, authors per topic).
num_topics = 7
num_words = 10

# pyldavis_prep() is defined outside this notebook (see the %run above). The
# returned model is used below through show_topic() and item access on a corpus.
corpus, dictionary, ldamodel = pyldavis_prep(documents_tokens, num_topics=num_topics, num_words=num_words)

### Interactive visualization of topics

The visualization shows topics as circles in a 2D plot. This is an approximation of topic similarity. The more similar two topics are, the closer they will be in the plot. The size of the circle corresponds to the presence of the topic in the corpus.

The visualization also shows the top 30 ***most relevant*** terms (words) for each topic. If a word is frequent in a topic, but also in the entire corpus, it will get a lower relevance score than a word that is frequent in a topic alone. 

***Relevance*** of a word in a topic is a weighted measure of the word probability within the topic and the word lift (the ratio of the word probability within the topic to its probability in the entire corpus). 

***Saliency*** refers to the importance of each word for a topic.


How to interact with the visualization:
1. Select a topic by clicking on a circle in the plot or by selecting a topic number in the control area at the top.

2. On the right, you see the most relevant terms for the selected topic. Adjust the relevance slider to change the ranking: for `lambda = 0`, the relevance is equal to the lift of the word; for `lambda = 1`, it is equal to the probability of the word within the topic.

3. If you click on a word in the histogram on the right, topic circles will resize according to the ***saliency*** of the term in the topic.

In [None]:
# pyldavis_vis() is defined outside this notebook; `save_to_html` presumably
# also stores the visualization in that file -- TODO confirm.
pyldavis_vis(corpus, dictionary, ldamodel, save_to_html='tmp.html')

## Cluster documents and topics

Add topic vectors and generate a clustering of documents.

In [None]:
%run "Topic Modeling/visualization.ipynb"

In [None]:
# add_topics_vector() is defined outside this notebook (see the %run above).
topic_vectors = add_topics_vector(corpus, ldamodel)
# Put the topic vectors next to the preprocessed documents. concat(axis=1)
# aligns on the index, so both frames are assumed to share the same index --
# TODO confirm.
data_preprocessed_vectors = pd.concat([data_preprocessed, topic_vectors], axis=1)
data_preprocessed_vectors.to_csv('tmp_preprocessed_vectors.csv', index=False)

Visualize with heatmaps.

In [None]:
# visualize() is defined outside this notebook; judging by the names, `hm` and
# `cm` are presumably a heatmap and a clustermap -- TODO confirm.
hm, cm = visualize(topic_vectors)

Visualize by time.

In [None]:
# get_dominant_topic() is defined outside this notebook. Later cells read the
# 'Dominant_Topic' column of the result and match its index against `data`.
# NOTE(review): `corpus` was returned by pyldavis_prep() for the preprocessed
# tokens, while the raw `data` frame is passed here -- confirm that both
# describe the same documents in the same order.
dominant_topic = get_dominant_topic(ldamodel, corpus, data)
dominant_topic

In [None]:
# get_representative_doc() is defined outside this notebook; judging by its
# name it selects representative documents from the dominant-topic table --
# TODO confirm.
representative_docs = get_representative_doc(dominant_topic)
representative_docs

In [None]:
# get_topic_distribution() is defined outside this notebook; what exactly it
# aggregates cannot be seen from here -- TODO confirm against its definition.
topic_distribution = get_topic_distribution(dominant_topic, representative_docs)
topic_distribution

## Time-based visualizations

Visualizing which topics were the most popular throughout the years. We first plot the distribution through time of the topics generated on the whole database. 

In [None]:
import plotly.graph_objects as go
import plotly.express as px

In [None]:
# For every year, count the documents per dominant topic (`totals`) and the
# share of that year's documents they represent (`means`). After the final
# transpose both arrays have one row per topic and one column per year in `x`.
totals = []
means = []
x = []

start = int(data['Date'].min())
end = int(data['Date'].max())

# NOTE(review): range() stops before `end`, so the last year in the data is not
# included -- presumably on purpose because that year is incomplete; confirm.
for year in range(start, end):
    indices = data.index[data['Date'] == year].tolist()

    total = len(indices)

    year_dominant_topic = dominant_topic[dominant_topic.index.isin(indices)]
    topic_count = year_dominant_topic['Dominant_Topic'].value_counts()

    t = []
    m = []

    for i in range(num_topics):
        # A topic that is never dominant in this year counts as zero. The
        # explicit lookup replaces a bare `except:` that hid every kind of error.
        n_docs = topic_count.get(i, 0)
        t.append(n_docs)
        # Years without any document get a share of zero instead of a division by zero.
        m.append(n_docs / total if total else 0)

    totals.append(t)
    means.append(m)
    x.append(year)

totals = np.array(totals).transpose()
means = np.array(means).transpose()

Create a list containing topic keywords for the visualization.

In [None]:
# Keyword list per topic, used as legend labels in the plots below:
# the first element of every entry returned by show_topic().
topic_text = [
    [entry[0] for entry in ldamodel.show_topic(i)]
    for i in range(num_topics)
]

In [None]:
fig = go.Figure()

# One line per topic, labelled with the keywords of the topic.
for i, topic_totals in enumerate(totals):
    trace = go.Scatter(x=x, y=topic_totals, mode='lines', name=str(topic_text[i]))
    fig.add_trace(trace)

# Optional fixed figure size:
#fig.update_layout(width=1200, height=500)

fig.update_layout(
    title=dict(
        text="Total number of documents by topic",
        font=dict(size=22),
    ),
    xaxis=dict(title=dict(text="Year")),
    yaxis=dict(title=dict(text="Number of documents")),
    # The legend is placed below the plot area.
    legend=dict(
        x=0,
        y=-0.8,
        font=dict(family="sans-serif", size=12, color="black"),
    ),
)

fig.show()

In [None]:
fig = go.Figure()

# One line per topic, labelled with the keywords of the topic. `means` holds
# the share (0..1) of a year's documents whose dominant topic is that topic.
for i, topic_means in enumerate(means):
    fig.add_trace(go.Scatter(x=x, y=topic_means,
                    mode='lines',
                    name=str(topic_text[i])))

# Optional fixed figure size:
#fig.update_layout(width=1200, height=500)

fig.update_layout(
    title=go.layout.Title(
        text="Percentage of documents representing a topic in a year",
        font=dict(size=22)
    ),
    xaxis=go.layout.XAxis(
        title=go.layout.xaxis.Title(
            text="Year",
        )
    ),
    yaxis=go.layout.YAxis(
        title=go.layout.yaxis.Title(
            text="Percentage",
        ),
        # The plotted values are fractions; show them as percentages so that
        # the ticks agree with the axis title.
        tickformat=".0%"
    ),
    # The legend is placed below the plot area.
    legend=go.layout.Legend(
        x=0,
        y=-0.8,
        font=dict(
            family="sans-serif",
            size=12,
            color="black"
        )
    )
)

fig.show()

It would be more interesting to see how topics generated from a shorter time period are represented in the whole database. For that we need a function that finds the topics of an unseen document or a group of unseen documents.

`find_doc_topic` calculates the weights of the topics for every document in the dataframe `preprocessed_data`. The `no_outputs` parameter sets how many topics are returned per document; the default is one topic, which is the one the model gave the highest weight. To discard dominant topics whose weight is lower than some value, set that value with the `threshold` parameter; note that the threshold is only applied when `no_outputs` is 1.

In [None]:
def find_doc_topic(ldamodel, dictionary, preprocessed_data, no_outputs=1, threshold=None):
    """Assign topics of an existing model to (possibly unseen) documents.

    Parameters
    ----------
    ldamodel
        Topic model; indexing it with a bag-of-words corpus must yield, per
        document, a list of ``(topic_id, weight)`` pairs.
    dictionary
        Dictionary the model was built with; only its ``doc2bow`` method is used.
    preprocessed_data : pandas.DataFrame
        One row per document with a ``'tokens'`` column. The ``'Title'``,
        ``'Abstract'`` and ``'Date'`` columns are copied to the result when
        present. The frame is not modified.
    no_outputs : int, default 1
        Maximum number of topics kept per document, best first.
    threshold : float, optional
        Only applied when ``no_outputs == 1``: a dominant topic whose weight is
        below the threshold is replaced by ``(None, None)``.

    Returns
    -------
    df : pandas.DataFrame
        Columns ``'Topic'`` and ``'Topic_Weigth'`` (dominant topic), the copied
        metadata columns and ``'Topic_Vectors'``. The rows carry the index of
        ``preprocessed_data``.
    topic_vectors : list
        Per document, the kept ``(topic_id, weight)`` pairs sorted by
        descending weight.
    """
    new_df = preprocessed_data.filter(['Title', 'Abstract', 'Date'], axis=1)

    # No token filtering is done up front: with a gensim Dictionary, doc2bow()
    # skips tokens that are not in the dictionary. The previous row-wise
    # filtering either had no effect (iterrows() yields copies) or replaced the
    # caller's token lists by sets, which loses the term counts.
    corpus = [dictionary.doc2bow(text) for text in preprocessed_data['tokens']]

    topic_vectors = []
    dominant_topics = []

    for doc_topics in ldamodel[corpus]:
        ranked = sorted(doc_topics, key=lambda pair: pair[1], reverse=True)
        kept = ranked[:max(no_outputs, 0)]

        topic_vectors.append(kept)
        # A document without any reported topic gets an empty placeholder
        # instead of raising an IndexError.
        dominant_topics.append(kept[0] if kept else (None, None))

    if no_outputs == 1 and threshold is not None:
        dominant_topics = [
            (None, None) if weight is None or weight < threshold else (topic, weight)
            for topic, weight in dominant_topics
        ]

    # Reuse the index of the input so that concat() lines the rows up even
    # when that index is not 0..n-1 (e.g. after documents were dropped).
    df = pd.DataFrame(dominant_topics, columns=['Topic', 'Topic_Weigth'], index=new_df.index)
    topic_df = pd.Series(topic_vectors, index=new_df.index, name='Topic_Vectors')
    df = pd.concat([df, new_df, topic_df], axis=1)

    return df, topic_vectors

First we need to select the time period which will be used to generate topics. Then we select the data from the database, which falls into the selected time period and generate the topics with the LDA model.

In [None]:
# Time period (inclusive) whose documents are used to train the period model.
startYear = 1995
endYear = 2000

topic_data = data.loc[(data['Date'] >= startYear) & (data['Date'] <= endYear)]

topic_documents_tokens, topic_data_preprocessed, topic_bigram_lexicon = preprocess(topic_data, to_ignore=ignore_words)

# The period model uses 8 topics; the cells below that loop over its topics
# rely on the same number.
topic_corpus, topic_dictionary, topic_ldamodel = pyldavis_prep(topic_documents_tokens, num_topics=8, num_words=num_words)

# NOTE: this overwrites `topic_vectors` and `data_preprocessed_vectors` from the
# whole-corpus analysis above.
topic_vectors = add_topics_vector(topic_corpus, topic_ldamodel)
# The topic vectors describe the documents of the period, so they are joined
# with the preprocessed period data (previously the whole-corpus frame was
# used, whose rows do not correspond to these vectors).
data_preprocessed_vectors = pd.concat([topic_data_preprocessed, topic_vectors], axis=1)

with open('Preprocess/tokens_period.pkl', 'wb') as handle:
    pickle.dump(topic_documents_tokens, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Assign every document of the whole corpus to the topics of the period model.
dataFrame, topics = find_doc_topic(topic_ldamodel, topic_dictionary, data_preprocessed, no_outputs=num_topics)
year_groups = dataFrame.groupby(['Date', 'Topic']).size().reset_index(name='counts')
# Number of documents per year in the raw data, used as the denominator below.
count = dict(data.groupby('Date').size())

# (year, topic) -> number of documents. A plain lookup table avoids filtering
# the frame for every year/topic pair and the deprecated int(<Series>) call.
pair_counts = {
    (row.Date, row.Topic): row.counts
    for row in year_groups.itertuples(index=False)
}

# NOTE: `totals`, `means` and `start` overwrite the variables of the
# whole-corpus plots above; re-run those cells before re-plotting them.
totals = []
means = []

start = 1995
stop = 2018

for year in range(start, stop + 1):
    t = []
    m = []

    # 8 is the number of topics of the period model.
    for i in range(8):
        n_docs = int(pair_counts.get((year, i), 0))
        t.append(n_docs)
        # Only divide when the year actually has documents for this topic.
        m.append(n_docs / count[year] if n_docs else 0)

    totals.append(t)
    means.append(m)

# One row per topic, one column per year.
totals = np.array(totals).transpose()
means = np.array(means).transpose()

Get descriptions of the topics for the graph labels.

In [None]:
# Keyword list for each of the 8 topics of the period model: the first element
# of every entry returned by show_topic().
topic_text_small = [
    [entry[0] for entry in topic_ldamodel.show_topic(i)]
    for i in range(8)
]

The highlighted zone on the plot shows the time period the data was taken from to generate the topics.

In [None]:
fig = go.Figure()

# One line per topic of the period model, labelled with the topic keywords.
# `means` holds the share (0..1) of a year's documents assigned to the topic.
years = list(range(start, stop+1))
for i, topic_means in enumerate(means):
    fig.add_trace(go.Scatter(x = years, y=topic_means,
                    mode='lines',
                    name=str(topic_text_small[i])))

# The legend is placed below the plot area.
fig.update_layout(legend=dict(x=0, y=-0.7))
#fig.update_layout(showlegend=False)

fig.update_layout(
    title=go.layout.Title(
        text="Distribution of a topic over the documents through years",
    ),
    xaxis=go.layout.XAxis(
        title=go.layout.xaxis.Title(
            text="Year",
        )
    ),
    yaxis=go.layout.YAxis(
        title=go.layout.yaxis.Title(
            text="Percentage of documents",
        ),
        # The plotted values are fractions; show them as percentages so that
        # the ticks agree with the axis title.
        tickformat=".0%"
    )
)

# Highlight the period whose documents were used to generate the topics.
fig.update_layout(
    shapes=[
        go.layout.Shape(
            type="rect",
            # x-reference is assigned to the x-values
            xref="x",
            # y-reference is assigned to the plot paper [0,1]
            yref="paper",
            x0=startYear,
            y0=0,
            x1=endYear,
            y1=1,
            fillcolor="LightSalmon",
            opacity=0.5,
            layer="below",
            line_width=0,
        )
    ]
)

#fig.update_layout(width=1200, height=600)

fig.show()

## Authors
We have now discovered the research topics. We would like to know which authors are publishing the articles with certain topics.

In [None]:
import operator

def get_topic_authors(data, dominant_topic, min_articles=10, n_topics=None):
    """List, per topic, the authors with the most articles on that topic.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an ``'Authors'`` column; each cell is iterated to obtain the
        individual authors of a document.
    dominant_topic : pandas.DataFrame
        Frame with a ``'Dominant_Topic'`` column, aligned with ``data`` on the
        index.
    min_articles : int, default 10
        Authors with fewer articles on a topic are left out.
    n_topics : int, optional
        Number of topics to report. Defaults to the notebook-level
        ``num_topics``.

    Returns
    -------
    list
        One entry per topic: a list of ``(author, article_count)`` pairs sorted
        by descending count.
    """
    from collections import Counter

    if n_topics is None:
        # Fall back to the notebook-level setting used for the topic model.
        n_topics = num_topics

    df = pd.concat([dominant_topic['Dominant_Topic'], data['Authors']], axis=1)

    topic_authors = []

    for i in range(n_topics):
        # Documents without an author list (missing values) are skipped.
        author_lists = df.loc[df['Dominant_Topic'] == i, 'Authors'].dropna()
        counts = Counter(
            author
            for author_list in author_lists
            for author in author_list
        )
        # most_common() sorts by descending count and keeps first-seen order on ties.
        topic_authors.append(
            [(author, n) for author, n in counts.most_common() if n >= min_articles]
        )

    return topic_authors

In [None]:
# Dominant topic per document for the whole-corpus model.
dominant_topic = get_dominant_topic(ldamodel, corpus, data)

# Keyword list per topic (first element of every show_topic() entry), used in
# the plot title below.
topic_text = [
    [entry[0] for entry in ldamodel.show_topic(i)]
    for i in range(num_topics)
]

topic_authors = get_topic_authors(data, dominant_topic)

In [None]:
# Index of the topic whose authors are plotted.
topic = 0

# calc
mc = topic_authors[topic]

# plot
if not mc:
    # Without any author there is nothing to unpack from zip(*mc) and the
    # figure would have a height of zero, so report it instead of failing.
    print('No authors to plot for topic {}.'.format(topic))
else:
    # Reverse so that the author with the most articles is the topmost bar.
    mc = mc[::-1]
    ws, fs = zip(*mc)
    # Bar labels have the form "author (count)".
    ws = ['{} ({})'.format(w, f) for w, f in mc]
    fig = plt.figure(figsize=(3, len(mc)/2))
    plt.box(False)
    plt.tick_params(top=False, bottom=True, left=False, right=False, labelleft=True, labelbottom=True)
    plt.barh(range(len(ws)), fs)
    plt.yticks(range(len(ws)), ws)
    plt.ylim(-1, len(ws))
    fig.suptitle("Number of articles published by author for theme:\n" + str(topic_text[topic]));