# Load Data.

In [1]:
# Load data.
import pickle

# NOTE: pickle.load can execute arbitrary code while unpickling; only open
# pickle files that come from a trusted source.
with open('OS_all_M_T_title.p','rb') as f:
    data = pickle.load(f)

# The pickled object is indexed by position; its first four entries are used.
M_OS, T_OS, OS_titles, corpus_category = data[0], data[1], data[2], data[3]
assert len(corpus_category)==len(OS_titles)

# Prefix every title (in place) with the tag of its corpus; the category
# value of each title is used as an index into corpus_type.
corpus_type = ['PHYS','CHEM','BIOL']
for idx, category in enumerate(corpus_category):
    OS_titles[idx] = corpus_type[int(category)] + ' ' + OS_titles[idx]

import numpy as np
import matplotlib.pyplot as plt

# Build Dataset with Good Sequences (for training).
Because a textbook is written to be read more or less sequentially, we can assume that sections 1, 2, and 3 are followed by section 4.  Hence, we can create good (reasonable) sequences by taking consecutive sections from each chapter.  Bad sequences are sets of sections chosen at random.

Furthermore, the ordering of the features does not carry any meaning.  What might be more meaningful is how the largest tf-idf values change over the next few sections.  In other words, the sorted values (sorted identically across a sequence) would be more meaningful.  The functions `sort_matrix` and `package_seq` re-order the features of each sequence accordingly.

The good and bad sequences can be used to train and validate an RNN.

In [4]:
# Make a list of good sequences, using section numbers.
# Assume that the first three sections are good sequences.

#print(OS_titles)

good_seq = list()
num_titles = len(OS_titles)
for i, title in enumerate(OS_titles):
    # Titles are assumed to look like "<CORPUS> <chapter>.<section> <name>",
    # so the text after the first '.' starts with the section number.
    after_dot = title.split('.')[1]
    # Read ALL leading digits of the section number. Looking only at the
    # first character would mistake sections 10, 11, ... for section 1.
    num_digits = len(after_dot) - len(after_dot.lstrip('0123456789'))
    section_num = int(after_dot[:num_digits])
    # The first section of a chapter starts a good sequence made of three
    # consecutive titles. Skip it when fewer than two titles follow, so that
    # every stored index is a valid position in OS_titles.
    if section_num == 1 and i + 2 < num_titles:
        good_seq.append((i, i+1, i+2))

# Examine. 
#for i in good_seq:
#    print(OS_titles[i[0]])
#    print(OS_titles[i[1]])
#    print(OS_titles[i[2]])

# Build the bad sequences (as many as there are good ones).
# Assumption: sections that are far apart are likely to be bad sequences.
# NOTE(review): no random seed is set in this cell, so the triples drawn
# below differ from run to run.
num_seq = len(good_seq)
# NOTE(review): the minimum index gap is scaled by the number of good
# sequences, not by the number of titles - confirm this is intended.
min_diff = 0.25*len(good_seq)
bad_seq = []
index_pairs = ((0, 1), (1, 2), (2, 0))
while len(bad_seq) < num_seq:
    # Three distinct title indices: the head of a fresh random permutation.
    seq = np.random.permutation(range(len(OS_titles)))[0:3]
    # Keep the triple only if every pair of indices is more than min_diff apart.
    if all(abs(seq[a]-seq[b]) > min_diff for a, b in index_pairs):
        bad_seq.append(seq)

print("Length of bad and good sequences")
print(len(bad_seq))
print(len(good_seq))

Length of bad and good sequences
112
112


In [None]:
def sort_matrix (M):
    """Reorder the columns of M by the values of its first row, largest first.

    M is a data matrix (Nsample x Nfeature). The column order is computed
    from row 0 only and then applied to every row, so all samples stay
    aligned on the same features.
    """
    first_row = M[0,:]
    # argsort is ascending; negating the values yields a descending order.
    column_order = np.argsort(-first_row)
    return M[:,column_order]

def package_seq(seq,M):
    """Return one column-sorted sub-matrix of M per sequence in seq.

    Each element of seq is used to select rows of M; the selected rows are
    then passed through sort_matrix, which reorders their columns by the
    values of the first selected row. The results are returned as a list,
    in the same order as seq.
    """
    return [sort_matrix(M[row_idx,:]) for row_idx in seq]

def barplot_group(M):
    """Draw a grouped bar chart of a 2-D array and show it.

    M has shape (num groups, num bars). Each row is drawn as one series of
    bars; the series are placed side by side, so x position j holds one bar
    from every row.
    """
    Ngroup, Nbar = M.shape

    # Ngroup bars per cluster plus the width of two bars left empty.
    barwidth = 1/(Ngroup+2)
    for i in range(Ngroup):
        # Shift row i to the right by i bar widths so the rows do not overlap.
        r = np.arange(Nbar)+i*barwidth
        plt.bar(r,M[i,:],width=barwidth,edgecolor='white')
    plt.xlabel('group', fontweight='bold')
    # One blank label per tick. (''*Nbar is just the empty string, not a
    # list of Nbar labels, so it is spelled out as a list here.)
    plt.xticks(range(Nbar),labels=['']*Nbar)
    plt.show()

# Example bar plot
#bars1 = [12, 30, 1, 8, 22]
#bars2 = [28, 6, 16, 5, 10]
#bars3 = [29, 3, 24, 25, 17]
#bars = np.array([bars1,bars2,bars3])
#barplot_group(bars)


# Package the sequences from the matrix loaded in the first cell. That matrix
# is named M_OS; no variable called M is defined anywhere above, so using M
# raises NameError on a fresh kernel.
# NOTE(review): assumes the rows of M_OS line up with OS_titles - confirm.
Mseq_g = package_seq(good_seq,M_OS)
Mseq_b = package_seq(bad_seq,M_OS)

# Plot the first 10 (sorted) features of the first bad sequence.
barplot_group(sort_matrix(Mseq_b[0][:,:10]))
print(Mseq_g[0].shape)


In [9]:
# Key functions for generating curriculum.

def find_pdist (M):
    """Return the Euclidean distances between all pairs of rows of M.

    M holds one point per row and one dimension per column. Entry [i, j] of
    the result is the distance between point i and point j.
    """
    # Broadcasting the two views against each other gives every pairwise
    # difference of rows.
    diff = M[None, :] - M[:, None]
    squared_dist = np.sum(diff**2, -1)
    return np.sqrt(squared_dist)


# Explore data.

Perform initial data exploration.