In [1]:
#all required imports
import numpy as np
import sklearn as skl
from sklearn import datasets
from sklearn import metrics
from sklearn import cluster
from sklearn.cluster import AgglomerativeClustering
import pickle
import pandas as pd

In [2]:
# Load the best version of the sentence embeddings from Caleb's module.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
# execute arbitrary code -- only load files from a trusted source.
embeddings = np.load('sentence_embeddings_revision4_Trigram.numpy', allow_pickle=True)

# Load all the unvectorized sentences (filler words already removed).
# NOTE(review): pickle.load can execute arbitrary code -- trusted files only.
# Presumably sentences[k] is the text behind embeddings[k]; verify against the
# module that produced both files.
with open('./sentences.pickle', 'rb') as f:
    sentences = pickle.load(f)

# Seed the global NumPy RNG so the random subsample is repeatable.
np.random.seed(424242)

# Randomly select 1/8 of the embeddings (without replacement) to cluster.
# The sample size is derived from the array itself instead of a hard-coded row
# count, so it stays correct if the embeddings file changes size.
sample_i = np.random.choice(embeddings.shape[0], embeddings.shape[0] // 8, replace=False)
sample_embeddings = embeddings[sample_i, :]

# Agglomerative clustering with Ward linkage into 8 clusters, fit on the
# random subset of embeddings only.
hac = AgglomerativeClustering(compute_full_tree=True, linkage='ward', n_clusters=8)
hac.fit(sample_embeddings)

# Map each cluster label to the positions (within the sample) that received it.
# Positions index into sample_i / sample_embeddings, not the full embeddings array.
label_to_indices = {}
for position, label in enumerate(hac.labels_):
    label_to_indices.setdefault(label, []).append(position)

# For each label in ascending order, print a subset of that cluster's sentences
# to get an indication of which trivia category it represents.
for label in sorted(label_to_indices):
    indices = label_to_indices[label]
    # sample_i translates a sample position back to a row of the full data set.
    sentences_with_label = [sentences[sample_i[idx]] for idx in indices]
    print(label)
    # Slice rather than index 11..29 so a cluster with fewer than 30 members
    # prints what it has instead of raising IndexError.
    for sentence in sentences_with_label[11:30]:
        print(sentence)

    print()
    print()

0
useful long term avian study placing metal identification tags legs wild birds banding
colorful ferrell anchorman harmonizes afternoon delight ron burgundy
harrelson yummies woody goodies
celtics star returned home state coach indiana pacers larry bird
wait visva bharati future goal congress party india madam indira gandhi
swiss engineer got idea burrs stuck clothes velcro
traditional number wise men east times number ali baba thieves
progressive tax rises rises income
picked player time hockey news wayne gretzky
expenses expense account cover pocket
penn jillette hosts nbc game impressions identity
mag called hunt fish essay contest hunting fishing influenced life field stream
jon clue crew explains types charts sprawling pedigree chart space saving ahnentafel meaning ancestor table identify ancestors number relative number maternal grandmother
throwing gutterball throw number pins row tenpin bowling
albert schweitzer person inspirational effect mother teresa
title homeric film sun 

In [3]:
#Clusters with seed 424242
#0 - Celebrities
#1 - politics
#2 - geography
#3 - science
#4 - animals
#5 - food and drink
#6 - history
#7 - pop culture

In [4]:
# Repeat the subsample-and-cluster step with a different seed.
np.random.seed(292510)
# Draw 1/8 of the embedding rows without replacement. The size is derived from
# the array itself instead of a hard-coded row count.
sample_i = np.random.choice(embeddings.shape[0], embeddings.shape[0] // 8, replace=False)

sample_embeddings = embeddings[sample_i, :]
# Ward-linkage agglomerative clustering into 8 clusters, fit on the sample only.
hac = AgglomerativeClustering(compute_full_tree=True, linkage='ward', n_clusters=8)
hac.fit(sample_embeddings)

# Get a mapping of label to the sample positions where it occurs.
# Positions index into sample_i / sample_embeddings, not the full embeddings array.
label_to_indices = {}
for position, label in enumerate(hac.labels_):
    label_to_indices.setdefault(label, []).append(position)

    

In [5]:
# For each cluster label in ascending order, print a window of its sentences
# so the cluster's topic can be judged by eye.
for label in sorted(label_to_indices):
    indices = label_to_indices[label]
    # sample_i translates a sample position back to an index into sentences.
    sentences_with_label = [sentences[sample_i[idx]] for idx in indices]
    print(label)
    # Slice rather than index 11..39 so a cluster with fewer than 40 members
    # prints what it has instead of raising IndexError.
    for sentence in sentences_with_label[11:40]:
        print(sentence)

    print()
    print()

# Print the number of sampled sentences assigned to each cluster label.
for key, value in label_to_indices.items():
    print(key, len(value))

0
paraclete trinity holy spirit
nikolai gogol diary madman
author continued saga corleones novel sicilian mario puzo
old prayer asks deliverance ghoulies ghosties long leggedy beasties things bump night
title character nbc drama works massachusetts state coroner office crossing jordan
title angry wife plots revenge jason killing new flame poisoned garment medea
british chief justice proposed writer book lacked feature fined lose copyright index
oct home life improvement guru bundle ipo hit wall street martha stewart
daughter newspaper reporters served times florida state attorney big promotion janet reno
house savoy carignano shuttered country humbert left throne italy
john oates blond partner hall
queen hearts stole tarts knave hearts
alec baldwin played conductor title steam engine magic railroad thomas tank engine
jon clue crew reports lbj library museum austin lbj congressman enter active duty wwii library proudly displays silver star given pacific theater commander douglas macarth

In [6]:
#Clusters with seed 292510
#0 - Celebrities - largest category, least certain
#1 - Science
#2 - biology/ecology
#3 - food
#4 - the arts
#5 - geography
#6 - politics
#7 - sports

In [9]:
# Repeat the subsample-and-cluster experiment with a third seed.
np.random.seed(112912)
# Draw 1/8 of the embedding rows without replacement. The size is derived from
# the array itself instead of a hard-coded row count.
sample_i = np.random.choice(embeddings.shape[0], embeddings.shape[0] // 8, replace=False)

sample_embeddings = embeddings[sample_i, :]
# Ward-linkage agglomerative clustering into 8 clusters, fit on the sample only.
hac = AgglomerativeClustering(compute_full_tree=True, linkage='ward', n_clusters=8)
hac.fit(sample_embeddings)

# Get a mapping of label to the sample positions where it occurs.
# Positions index into sample_i / sample_embeddings, not the full embeddings array.
label_to_indices = {}
for position, label in enumerate(hac.labels_):
    label_to_indices.setdefault(label, []).append(position)

# For each cluster label in ascending order, print a window of its sentences
# so the cluster's topic can be judged by eye.
for label in sorted(label_to_indices):
    indices = label_to_indices[label]
    # sample_i translates a sample position back to an index into sentences.
    sentences_with_label = [sentences[sample_i[idx]] for idx in indices]
    print(label)
    # Slice rather than index 11..29 so a cluster with fewer than 30 members
    # prints what it has instead of raising IndexError.
    for sentence in sentences_with_label[11:30]:
        print(sentence)

    print()
    print()

# Print the number of sampled sentences assigned to each cluster label.
for key, value in label_to_indices.items():
    print(key, len(value))

0
unit originally intended earth circumference meter
near ground gas major smog higher altitudes helps block radiation sun ozone
according law electric current equal ratio voltage resistance georg ohm
marks anniversary sir frederick banting charles best discovery pancreatic hormone insulin
symbol colorless odorless gas atomic number krypton
french title jules verne classic terre lune earth moon
word preceding aforethought mean ill desire inflict harm malice
hasn iron contact contraction
fashion awards liar liar star modeled fig leaf said fashion began jim carrey
state tree sitka spruce alaska
japanese electronics company comes latin word sound sony
start counting sides shape seen nonagon
circle trunk reveals tree age ring
hornet nest said stunningly salty language probably delighted miss lillian jimmy carter
metabolize calcium strong bones iguanas need type radiation provided special light ultraviolet
michael jackson paul mccartney duetted waste time doggone doggone girl
association av

In [8]:
#Clusters with seed 112912
#0 - science
#1 - Geography
#2 - Hard to tell unfortunately
#3 - pop culture
#4 - food and drink
#5 - history
#6 - film (the arts)
#7 - Literature