# Summarizing Results 

In this notebook we will summarize the results of the unsupervised LDA topic models. We will review the coherence scores and determine whether we need to train additional models. Finally, we will select results to visualize and summarize. 

**Print Ranked Coherence for Each Corpus:**

In [18]:
import pandas as pd 
import glob 

# Gather every ranked-coherence CSV and stack them into a single frame.
# The frames are read into a list and concatenated once: DataFrame.append was
# removed in pandas 2.0, and appending inside a loop re-copies the data each time.
ranked_coherence_files = glob.glob('D:/Student_Voices_Database/s3mirror/ranked_coherence_*.csv')
ranked_coherence_frames = [pd.read_csv(file) for file in ranked_coherence_files]

# pd.concat raises on an empty list, so fall back to an empty frame when no
# files match. Row indices from the individual files are kept as-is.
ranked_coherence = pd.concat(ranked_coherence_frames) if ranked_coherence_frames else pd.DataFrame()

In [32]:
from IPython.display import HTML, display_html

# For each corpus (review score range), render a table of the top-N models
# ordered by average coherence score, highest first.
top_n = 10
score_ranges = list(ranked_coherence['range'].unique())
for rng in score_ranges:
    print(f'The top {top_n} ave coherence scores for range {rng}')
    in_range = ranked_coherence['range'] == rng
    best_models = (
        ranked_coherence[in_range]
        .sort_values('ave_coherence_score', ascending=False)
        .head(top_n)
    )
    display_html(HTML(best_models.to_html()))

The top 10 ave coherence scores for range [0, 35)


Unnamed: 0.1,Unnamed: 0,num_topics,ave_coherence_score,range,setting,config
0,0,27,0.528659,"[0, 35)",LDA1,D1
0,0,27,0.521769,"[0, 35)",LDA2,D1
0,0,21,0.520367,"[0, 35)",LDA3,D1
1,1,24,0.520062,"[0, 35)",LDA1,D1
1,1,27,0.516046,"[0, 35)",LDA3,D1
1,1,21,0.51466,"[0, 35)",LDA2,D1
2,2,24,0.514549,"[0, 35)",LDA2,D1
3,3,18,0.513267,"[0, 35)",LDA2,D1
2,2,18,0.513068,"[0, 35)",LDA3,D1
2,2,21,0.512183,"[0, 35)",LDA1,D1


The top 10 ave coherence scores for range [35, 60)


Unnamed: 0.1,Unnamed: 0,num_topics,ave_coherence_score,range,setting,config
9,0,27,0.503327,"[35, 60)",LDA1,D1
10,1,24,0.502673,"[35, 60)",LDA1,D1
9,0,27,0.501724,"[35, 60)",LDA2,D1
11,2,18,0.498075,"[35, 60)",LDA1,D1
9,0,24,0.497871,"[35, 60)",LDA4,D1
10,1,21,0.497436,"[35, 60)",LDA2,D1
10,1,18,0.495984,"[35, 60)",LDA4,D1
9,0,18,0.494494,"[35, 60)",LDA3,D1
11,2,24,0.493676,"[35, 60)",LDA2,D1
12,3,18,0.49181,"[35, 60)",LDA2,D1


The top 10 ave coherence scores for range [60, 65)


Unnamed: 0.1,Unnamed: 0,num_topics,ave_coherence_score,range,setting,config
18,0,21,0.480373,"[60, 65)",LDA2,D1
18,0,24,0.47547,"[60, 65)",LDA1,D1
19,1,27,0.470289,"[60, 65)",LDA2,D1
18,0,21,0.467879,"[60, 65)",LDA1,A1
20,2,24,0.467842,"[60, 65)",LDA2,D1
19,1,15,0.46457,"[60, 65)",LDA1,D1
20,2,27,0.463114,"[60, 65)",LDA1,D1
21,3,21,0.46301,"[60, 65)",LDA1,D1
19,1,18,0.462431,"[60, 65)",LDA1,A1
20,2,27,0.462167,"[60, 65)",LDA1,A1


The top 10 ave coherence scores for range [65, 75)


Unnamed: 0.1,Unnamed: 0,num_topics,ave_coherence_score,range,setting,config
27,0,18,0.504137,"[65, 75)",LDA2,D1
28,1,21,0.49601,"[65, 75)",LDA2,D1
29,2,15,0.494011,"[65, 75)",LDA2,D1
30,3,24,0.487281,"[65, 75)",LDA2,D1
27,0,27,0.487052,"[65, 75)",LDA1,D1
28,1,24,0.486643,"[65, 75)",LDA1,D1
27,0,21,0.485034,"[65, 75)",LDA2,A1
31,4,12,0.482225,"[65, 75)",LDA2,D1
27,0,27,0.481481,"[65, 75)",LDA1,B1
27,0,18,0.480094,"[65, 75)",LDA3,D1


The top 10 ave coherence scores for range [75, 85)


Unnamed: 0.1,Unnamed: 0,num_topics,ave_coherence_score,range,setting,config
36,0,18,0.482798,"[75, 85)",LDA1,D1
36,0,21,0.474205,"[75, 85)",LDA3,D1
37,1,27,0.474116,"[75, 85)",LDA1,D1
36,0,24,0.472662,"[75, 85)",LDA1,A1
38,2,21,0.471583,"[75, 85)",LDA1,D1
36,0,21,0.470713,"[75, 85)",LDA2,D1
37,1,18,0.469161,"[75, 85)",LDA1,A1
39,3,15,0.467685,"[75, 85)",LDA1,D1
36,0,21,0.467669,"[75, 85)",LDA1,B1
40,4,24,0.465394,"[75, 85)",LDA1,D1


The top 10 ave coherence scores for range [85, 95)


Unnamed: 0.1,Unnamed: 0,num_topics,ave_coherence_score,range,setting,config
45,0,27,0.521133,"[85, 95)",LDA1,D1
46,1,24,0.520253,"[85, 95)",LDA1,D1
47,2,21,0.512559,"[85, 95)",LDA1,D1
48,3,18,0.509295,"[85, 95)",LDA1,D1
45,0,27,0.504825,"[85, 95)",LDA3,D1
45,0,27,0.504288,"[85, 95)",LDA2,A1
45,0,21,0.503739,"[85, 95)",LDA2,D1
46,1,27,0.502719,"[85, 95)",LDA2,D1
47,2,18,0.501963,"[85, 95)",LDA2,D1
46,1,15,0.500625,"[85, 95)",LDA3,D1


The top 10 ave coherence scores for range [95, 101)


Unnamed: 0.1,Unnamed: 0,num_topics,ave_coherence_score,range,setting,config
54,0,24,0.527441,"[95, 101)",LDA1,D1
54,0,24,0.514978,"[95, 101)",LDA3,D1
55,1,21,0.513268,"[95, 101)",LDA1,D1
56,2,18,0.505359,"[95, 101)",LDA1,D1
57,3,27,0.504959,"[95, 101)",LDA1,D1
54,0,24,0.493858,"[95, 101)",LDA2,D1
58,4,15,0.493025,"[95, 101)",LDA1,D1
55,1,27,0.491292,"[95, 101)",LDA3,D1
56,2,18,0.489749,"[95, 101)",LDA3,D1
57,3,21,0.489618,"[95, 101)",LDA3,D1


### Looking at Coherence Scores in More Detail

Here we take a look at the coherence scores for each topic in each analysis. With this we can review coherence based on more than just average topic coherence. 

**Append all the coherence scores for review**:

In [100]:
%%time

import re 
import numpy as np

# Get all the files with the full (per-topic) coherence scores.
# NOTE(review): `bn` is not imported anywhere in the visible notebook; it must
# already be in the kernel namespace (presumably a project helper module) for
# this cell to survive Restart & Run All -- confirm and add the import.
full_coherence_files = glob.glob('D:/Student_Voices_Database/s3mirror/full_coherence_*.pbz2')

# Cut-offs for the "share of topics scoring above X" columns (named CS_<cut-off>).
coherence_thresholds = [0.5,0.54,0.59,0.64,0.69]


def threshold_shares(scores, thresholds):
    """Return the share of topic scores strictly above each threshold.

    Parameters
    ----------
    scores : sequence of float
        Per-topic coherence scores for one model (list or array).
    thresholds : iterable of float
        Cut-offs to evaluate.

    Returns
    -------
    dict
        Maps 'CS_<threshold>' to the fraction of scores greater than that
        threshold. An empty `scores` gives NaN for every threshold.
    """
    values = np.asarray(scores, dtype=float)
    if values.size == 0:
        return {'CS_'+str(ct): float('nan') for ct in thresholds}
    return {'CS_'+str(ct): float(np.mean(values > ct)) for ct in thresholds}


# One list per output column; column order here is the order in the final frame.
data = {name: [] for name in ['Range', 'Setting', 'Config', 'N_Topics', 'CS_Ave', 'CS_Med']}
for ct in coherence_thresholds: data['CS_'+str(ct)]=[]

for file in full_coherence_files:
    dta = bn.decompress_pickle(file)
    # The LDA setting and the config code are parsed from the file path itself.
    setting = re.findall(r'LDA[0-9]', file)[0]
    config = re.findall(r'_([A-Z][0-9])\.', file)[0]

    for rng in list(dta.keys()):
        # Each entry unpacks to (number of topics, (per-topic scores, average score)).
        for tn, css in dta[rng][config][setting]:
            scores, ave_score = css
            data['Range'].append(str(rng))
            data['Setting'].append(setting)
            data['Config'].append(config)
            data['N_Topics'].append(tn)
            data['CS_Ave'].append(ave_score)
            data['CS_Med'].append(np.median(scores))
            for column, share in threshold_shares(scores, coherence_thresholds).items():
                data[column].append(share)

full_coherence_data = pd.DataFrame(data)

Wall time: 65.9 ms


In [118]:
# Inspect the range keys of one coherence archive. `dta` is the loop variable
# left over from the loading cell, i.e. the last file that was decompressed.
dta.keys()

dict_keys(['[0, 35)', '[35, 60)', '[60, 65)', '[65, 75)', '[75, 85)', '[85, 95)', '[95, 101)'])

**Exploring topic coherence distribution**: 

Average topic coherence is one way to check our unsupervised models. Another is to check what proportion of the topics have coherence scores that suggest high topic coherence. This method has not been approached in the literature, but it may help provide clarity on how to proceed. 

In [132]:
# Compare model rankings for one corpus under several coherence summaries:
# the mean, the median, and the share of topics above selected thresholds.
rng = '[95, 101)'
top_n = 5
ctd = ['CS_Ave', 'CS_Med', 'CS_0.5','CS_0.54', 'CS_0.59']

# The corpus filter does not depend on the sort column, so apply it once.
models_in_range = full_coherence_data[full_coherence_data['Range']==rng]

for sort_column in ctd:
    print("Sort by", sort_column)
    ranked = models_in_range.sort_values(by=sort_column, ascending=False)
    top_models = ranked.head(top_n)
    display_html(HTML(top_models.to_html()))

Sort by CS_Ave


Unnamed: 0,Range,Setting,Config,N_Topics,CS_Ave,CS_Med,CS_0.5,CS_0.54,CS_0.59,CS_0.64,CS_0.69
187,"[95, 101)",LDA1,D1,24,0.527441,0.527383,0.666667,0.458333,0.25,0.041667,0.0
565,"[95, 101)",LDA3,D1,24,0.514978,0.512908,0.541667,0.375,0.166667,0.083333,0.041667
186,"[95, 101)",LDA1,D1,21,0.513268,0.503492,0.52381,0.333333,0.095238,0.095238,0.0
185,"[95, 101)",LDA1,D1,18,0.505359,0.482731,0.333333,0.166667,0.166667,0.111111,0.0
188,"[95, 101)",LDA1,D1,27,0.504959,0.493085,0.407407,0.296296,0.148148,0.074074,0.0


Sort by CS_Med


Unnamed: 0,Range,Setting,Config,N_Topics,CS_Ave,CS_Med,CS_0.5,CS_0.54,CS_0.59,CS_0.64,CS_0.69
187,"[95, 101)",LDA1,D1,24,0.527441,0.527383,0.666667,0.458333,0.25,0.041667,0.0
565,"[95, 101)",LDA3,D1,24,0.514978,0.512908,0.541667,0.375,0.166667,0.083333,0.041667
186,"[95, 101)",LDA1,D1,21,0.513268,0.503492,0.52381,0.333333,0.095238,0.095238,0.0
188,"[95, 101)",LDA1,D1,27,0.504959,0.493085,0.407407,0.296296,0.148148,0.074074,0.0
564,"[95, 101)",LDA3,D1,21,0.489618,0.487389,0.380952,0.190476,0.095238,0.095238,0.047619


Sort by CS_0.5


Unnamed: 0,Range,Setting,Config,N_Topics,CS_Ave,CS_Med,CS_0.5,CS_0.54,CS_0.59,CS_0.64,CS_0.69
187,"[95, 101)",LDA1,D1,24,0.527441,0.527383,0.666667,0.458333,0.25,0.041667,0.0
565,"[95, 101)",LDA3,D1,24,0.514978,0.512908,0.541667,0.375,0.166667,0.083333,0.041667
186,"[95, 101)",LDA1,D1,21,0.513268,0.503492,0.52381,0.333333,0.095238,0.095238,0.0
377,"[95, 101)",LDA2,D1,27,0.486953,0.480745,0.481481,0.259259,0.111111,0.037037,0.0
566,"[95, 101)",LDA3,D1,27,0.491292,0.480665,0.444444,0.222222,0.111111,0.111111,0.037037


Sort by CS_0.54


Unnamed: 0,Range,Setting,Config,N_Topics,CS_Ave,CS_Med,CS_0.5,CS_0.54,CS_0.59,CS_0.64,CS_0.69
187,"[95, 101)",LDA1,D1,24,0.527441,0.527383,0.666667,0.458333,0.25,0.041667,0.0
565,"[95, 101)",LDA3,D1,24,0.514978,0.512908,0.541667,0.375,0.166667,0.083333,0.041667
186,"[95, 101)",LDA1,D1,21,0.513268,0.503492,0.52381,0.333333,0.095238,0.095238,0.0
188,"[95, 101)",LDA1,D1,27,0.504959,0.493085,0.407407,0.296296,0.148148,0.074074,0.0
376,"[95, 101)",LDA2,D1,24,0.493858,0.475367,0.375,0.291667,0.125,0.0,0.0


Sort by CS_0.59


Unnamed: 0,Range,Setting,Config,N_Topics,CS_Ave,CS_Med,CS_0.5,CS_0.54,CS_0.59,CS_0.64,CS_0.69
187,"[95, 101)",LDA1,D1,24,0.527441,0.527383,0.666667,0.458333,0.25,0.041667,0.0
182,"[95, 101)",LDA1,D1,9,0.475901,0.457418,0.222222,0.222222,0.222222,0.0,0.0
181,"[95, 101)",LDA1,D1,6,0.448557,0.435453,0.166667,0.166667,0.166667,0.0,0.0
185,"[95, 101)",LDA1,D1,18,0.505359,0.482731,0.333333,0.166667,0.166667,0.111111,0.0
565,"[95, 101)",LDA3,D1,24,0.514978,0.512908,0.541667,0.375,0.166667,0.083333,0.041667


In [140]:
# Topic counts 3, 6, ..., 27 (multiples of three below 30).
run_1 = [*range(3, 30, 3)]

In [142]:
# Topic counts from 16 to 26 that are not in run_1, in ascending order.
sorted(set(range(16, 27)) - set(run_1))

[16, 17, 19, 20, 22, 23, 25, 26]