In [None]:
#importing libraries
import gensim
import pickle
import numpy
import matplotlib.pyplot as plt
import gensim.models.doc2vec as d2v
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


In [None]:
# Load the trained Doc2Vec model and pull out its document vectors as one matrix
# (one row per segment; the original note says 110,701 rows).
f = r'ufSeg_dm.model'
model = d2v.Doc2Vec.load(f)
# gensim >= 4 exposes the document vectors as `model.dv.vectors`; older releases
# only provide the legacy `model.docvecs.doctag_syn0` accessor. Support both so the
# notebook keeps running after a gensim upgrade.
if hasattr(model, 'dv'):
    segments_matrix = model.dv.vectors
else:
    segments_matrix = model.docvecs.doctag_syn0
segments_matrix.shape

In [None]:
# Fixed seed: later cells refer to specific cluster ids (e.g. 0, 4, 29-33), so the
# cluster numbering has to be the same on every Restart & Run All.
RANDOM_STATE = 0

# Project the document vectors onto their first two principal components.
pca = PCA(n_components=2, random_state=RANDOM_STATE)
X = pca.fit_transform(segments_matrix)

# Cluster the 2-D projection.
kmeans = KMeans(n_clusters=60, random_state=RANDOM_STATE)
kmeans.fit(X)
y_kmeans = kmeans.predict(X)  # one label per row of X, in 0 .. n_clusters - 1
centers = kmeans.cluster_centers_

# Percentage of variance explained by each of the two components
print('explained variance ratio (first two components): %s'
      % str(pca.explained_variance_ratio_))

In [None]:
# Scatter plot of the 2-D PCA projection, coloured by K-Means cluster.
fig, ax = plt.subplots()

ax.scatter(X[:, 0], X[:, 1], c=y_kmeans)  # every point in X, coloured by its cluster label
ax.scatter(centers[:, 0], centers[:, 1], c='black', s=5, alpha=0.5)  # cluster centroids

# No legend: none of the artists carries a label, so a legend call would only emit a
# "no handles with labels" warning and draw nothing.
ax.set_xlabel('principal component 1')
ax.set_ylabel('principal component 2')
ax.set_title('PCA of UF d2v segments with K-Means')
plt.show()


In [None]:
from collections import Counter

# Pickled object: list of tuples, each describing one segment as
# (book.title, book.code, book.date, toks, seg_pos)
inp = r'premodel/d2v_UFSeg.txt'
# NOTE: pickle.load can execute arbitrary code stored in the file; only load files
# produced by a trusted pipeline.
with open(inp, 'rb') as fh:  # context manager so the file handle is always closed
    objs = pickle.load(fh)

# Number of segments per book code (position 1 of every tuple).
code_list = [x[1] for x in objs]
seg_count_for_book = Counter(code_list)

# Append the owning book's segment count to every record:
# [book.title, book.code, book.date, toks, seg_pos, seg_count]
objs_new = [list(obj) + [seg_count_for_book[obj[1]]] for obj in objs]


In [None]:
import pandas as pd

labels = ['uf_id', 'cluster', 'seg_pos', 'rel_pos', 'title', 'date', 'text']

# One record per segment. Records in objs_new are laid out as
# [title, code, date, toks, seg_pos, seg_count]; the idx-th record is paired with the
# idx-th K-Means label.
all_segments = []
for idx, segment in enumerate(objs_new):
    seg_pos = segment[4]
    all_segments.append((
        segment[1],             # uf_id
        y_kmeans[idx],          # cluster
        seg_pos,                # seg_pos
        seg_pos / segment[5],   # rel_pos: position divided by the book's segment count
        segment[0],             # title
        segment[2],             # date
        segment[3],             # text
    ))

df = pd.DataFrame.from_records(all_segments, columns=labels)

# Narrow view used for eyeballing the contents of a single cluster.
small_df = df[['cluster', 'text', 'rel_pos']]
in_cluster_zero = small_df['cluster'] == 0
small_df[in_cluster_zero].sample(30)



    

In [None]:
#df.loc[df['cluster'] == 3]['rel_pos'].mean()
#df.loc[df['cluster'] == 3]['rel_pos'].median()
#df.groupby('cluster').mean()



In [None]:
outpath = 'uf_kmeans_clusters_thru_book/'

# For every cluster, plot the density (KDE) of its segments' relative position in the
# book, aggregated over all books, to see where in a book each cluster peaks or dips.
# The loop bound comes from the fitted model rather than a hard-coded count, so it
# stays correct if n_clusters is changed.
for cluster_id in range(kmeans.n_clusters):
    rel_positions = df.loc[df['cluster'] == cluster_id, 'rel_pos']
    plt.figure()
    plt.xlim(0, 1)  # rel_pos is seg_pos / seg_count; fix the x-range so plots are comparable
    plt.xlabel('relative position in book (aggregate)')
    plt.title(cluster_id)
    #rel_positions.plot.hist(bins = 100)
    rel_positions.plot.kde()
    #plt.savefig(outpath +'cluster_{}_across_book'.format(cluster_id))

    


In [None]:
# add decade column
def getDecade(year):
    """Return the decade of a year as a string, e.g. '1865' -> '1860'.

    Accepts a year given as a string or an integer, as well as date-like objects
    that expose a ``year`` attribute (so the helper still works after the ``date``
    column has been converted to datetimes).
    """
    # Date-like values carry the year as an attribute; plain strings/ints do not.
    year = getattr(year, 'year', year)
    # Replace the last digit with '0'.
    return str(year)[:-1] + '0'
# Add a 'decade' column: the date column is cast to a categorical and each value is
# mapped through getDecade.
df['decade'] = df['date'].astype('category').apply(getDecade)
df


In [None]:
outpath = 'uf_kmeans_clusters_thru_time/'

# Histogram of selected clusters across publication dates.

# Replace the date column with datetime values parsed from each date's string form.
publication_dates = pd.DatetimeIndex([str(value) for value in df.date])
df['date'] = publication_dates

for cluster_id in range(29, 34):
    cluster_dates = df.loc[df['cluster'] == cluster_id, 'date']
    plt.figure()

    plt.title('Frequency of cluster {} across time'.format(cluster_id))
    cluster_dates.hist(bins=20)
    #plt.savefig(outpath +'cluster_{}_across_time'.format(cluster_id))
    



In [None]:
f = 'uf_kmeans_clusters_thru_time_segments/'
def saveSegmentDataToCSV(outfolder):
    """Write one CSV per cluster containing that cluster's rows of ``df``.

    Files are named ``cluster_<id>_dataframe.csv`` and placed in ``outfolder``,
    which is created if it does not exist yet. Every cluster id present in
    ``df['cluster']`` is exported (the previous hard-coded ``range(30)`` silently
    skipped half of the 60 clusters).
    """
    import os

    if outfolder:
        os.makedirs(outfolder, exist_ok=True)
    for i in sorted(df['cluster'].unique()):
        outf = 'cluster_' + str(int(i)) + '_dataframe.csv'
        cluster_frame = df[df['cluster'] == i]
        # os.path.join works whether or not outfolder ends with a path separator.
        cluster_frame.to_csv(os.path.join(outfolder, outf))
#saveSegmentDataToCSV(f)


In [None]:
# Eyeball a random sample of segment texts from one cluster.
# K-Means labels run from 0 to n_clusters - 1, so label 60 never occurs with 60
# clusters and sampling from that empty selection raises; inspect the last cluster.
inspected_cluster = kmeans.n_clusters - 1
inspected_texts = df.loc[df['cluster'] == inspected_cluster, 'text']
# Cap the sample size so a small cluster does not raise either.
inspected_texts.sample(min(40, len(inspected_texts)))

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from operator import itemgetter

# Build one "document" per cluster by concatenating the tokens of all its segments.
# Each segment's tokens are space-joined and followed by a single space; str.join
# keeps this linear instead of growing a string with += in a loop.
cluster_texts = []
for i in range(kmeans.n_clusters):
    col = df.loc[df['cluster'] == i, 'text']
    cluster_texts.append(''.join(' '.join(row) + ' ' for row in col))

tf = TfidfVectorizer(smooth_idf=False, norm=None, analyzer='word', max_df=0.95, min_df=.1)
txt_fitted = tf.fit(cluster_texts)
tfidf_matrix = txt_fitted.transform(cluster_texts)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); fall back to the old name on older installations.
if hasattr(tf, 'get_feature_names_out'):
    feature_names = tf.get_feature_names_out()
else:
    feature_names = tf.get_feature_names()

# Print the terms of one cluster document, highest TF-IDF score first.
doc = 4
feature_index = tfidf_matrix[doc, :].nonzero()[1]  # column indices with a non-zero score
tfidf_scores = zip(feature_index, [tfidf_matrix[doc, x] for x in feature_index])

for w, s in [(feature_names[i], s) for (i, s) in sorted(tfidf_scores, key=itemgetter(1), reverse=True)]:
    print(w, s)
        

In [None]:
# The frame's book identifier column is 'uf_id' (there is no 'hathi_id' column).

# Summed relative position per (book, cluster) ...
book_props = df.groupby(['uf_id', 'cluster']).agg({'rel_pos': 'sum'})
# ... expressed as a percentage of the book's total, so each book adds up to 100.
x = 100 * book_props / book_props.groupby(level='uf_id').transform('sum')

# One pie chart per book showing how its segments are distributed over the clusters.
# NOTE: this opens one figure per book; slice unique_codes for a quick preview.
unique_codes = df['uf_id'].unique()
for i in unique_codes:
    y = df.loc[df['uf_id'] == i, 'cluster']
    plt.figure()
    # Wedges are sized by the number of segments in each cluster (value_counts),
    # not by the raw cluster ids.
    y.value_counts().plot.pie()
    plt.title(i)

#df.groupby(['uf_id','cluster']).agg('count')


In [None]:
# Share of each book's segments that falls into each cluster.
# The book identifier column in df is 'uf_id'.
gr = df.groupby(['uf_id', 'cluster']).size()
# Divide every (book, cluster) count by that book's total segment count; transform
# keeps the (book, cluster) index so the division is element-wise.
gr = gr / gr.groupby(level=0).transform('sum')
gr.head(20)