In [None]:
import pandas as pd
from gensim.models import Word2Vec
from termcolor import colored
import warnings
import re
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
plt.style.use("fivethirtyeight")
warnings.filterwarnings("ignore")
%matplotlib inline

## Helper functions & setup

In [None]:
# Load the pre-trained Google News word2vec embeddings.
# The binary is too large to commit; it can be downloaded from:
# https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
# Newer gensim releases expose the word2vec-format loader on KeyedVectors instead of
# Word2Vec; fall back to Word2Vec only when KeyedVectors is not available (old gensim).
try:
    from gensim.models import KeyedVectors as _VectorLoader
except ImportError:
    _VectorLoader = Word2Vec
# `model[word]` returns the vector for a word and raises KeyError for unknown words.
model = _VectorLoader.load_word2vec_format('../GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
#I will be using the first folder as my data
# Read one query per line; the context manager closes the file even if reading fails.
with open("../data/user_studies/all.txt", "r") as queries_file:
    lines = [l.strip() for l in queries_file]
# Drop every line shorter than 3 characters (after stripping whitespace).
queries = [line for line in lines if len(line) >= 3]

In [None]:
#helper functions
def get_sentence_vector(q, embeddings=None, dim=300):
    """Return the sum of the word vectors of every known word in a sentence.

    Parameters
    ----------
    q : str
        Raw sentence or query text.
    embeddings : mapping, optional
        Word -> vector lookup that raises ``KeyError`` for unknown words.
        Defaults to the notebook-level ``model``.
    dim : int, optional
        Length of the returned vector; must match the vectors in ``embeddings``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(dim,)``. It is all zeros when no word of the sentence
        is found in ``embeddings``.
    """
    if embeddings is None:
        embeddings = model
    # Remove punctuation, then split on any run of whitespace so that tabs, newlines
    # and repeated spaces never leave fused or empty tokens behind.
    words = re.sub(r'[^\w\s]', '', q).split()
    query_vec = np.zeros(dim)
    for w in words:
        try:
            query_vec += embeddings[w]
        except KeyError:
            # Out-of-vocabulary words simply contribute nothing to the sum.
            continue
    return query_vec

#Get results based on query number
def get_results(q_num, data_dir="../data/user_studies"):
    """Load the scored results stored for one query.

    Every non-blank line of ``<data_dir>/results_<q_num>.txt`` is parsed as
    ``<score>--<text>``.

    Parameters
    ----------
    q_num : int or str
        Query number used to build the file name.
    data_dir : str, optional
        Directory that holds the ``results_<q_num>.txt`` files.

    Returns
    -------
    tuple of (list of float, list of str)
        The scores and the result texts, in file order. Both lists are empty
        when the file contains no results.

    Raises
    ------
    ValueError
        If a non-blank line has no ``--`` separator or its score is not a number.
    """
    filename = "{}/results_{}.txt".format(data_dir, q_num)
    scores, results = [], []
    # The context manager guarantees the file is closed, even on a parse error.
    with open(filename, "r") as f:
        for line in f:
            if not line.strip():
                continue
            # Split on the first "--" only, so dashes inside the text are preserved.
            score, text = line.split("--", 1)
            scores.append(float(score.strip()))
            results.append(text.strip())
    return scores, results

In [None]:
# Index of the query to analyse: it selects the query text from `queries` and is also
# passed to get_results to load the matching scores and result sentences.
query_number=3
query = queries[query_number]
scores, results = get_results(query_number)
# Show the selected query text as the cell output.
query

In [None]:
# Histogram of the number of space-separated words in each result sentence.
word_counts = [len(sentence.split(" ")) for sentence in results]
sentence_lengths = np.array(word_counts)
plt.title("Sentence Length (Words)")
plt.hist(sentence_lengths, bins=50)
plt.show()

# Share of the results that are longer than 50 words, as a percentage.
long_sentence_count = np.count_nonzero(sentence_lengths > 50)
percent_long = long_sentence_count / len(sentence_lengths) * 100
print("Percent of sentences with word length greater than 50 is", percent_long, "%")

## Clustering results

In [None]:
# Build a distance matrix holding the pairwise distance between all result sentences.
# Each sentence is turned into a vector with get_sentence_vector; the distance is
# 1 - cosine similarity, which is more intuitive to use as a distance (smaller = closer).
# No preallocation is needed: cosine_similarity returns the full matrix itself.
vectors = [get_sentence_vector(x) for x in results]
dist_matrix = 1 - cosine_similarity(vectors, vectors)
dist_matrix

In [None]:
# Collect every pairwise distance exactly once: the entries strictly below the diagonal.
# Entries are selected by position rather than by `value > 0`, so pairs whose distance is
# exactly zero (or marginally negative through floating-point round-off), such as
# duplicate sentences, are kept in the distribution.
il1 = np.tril_indices_from(dist_matrix, k=-1)
lower_triangle = dist_matrix[il1]
lower_triangle

In [None]:
#distribution of distances for the first 1000 query results
# density=True normalises the histogram so that its area is 1; it replaces the `normed`
# keyword, which newer matplotlib releases no longer accept.
plt.hist(lower_triangle, bins=50, density=True)
plt.title("Distances Histogram")
plt.show()

In [None]:
#pick a small epsilon to draw an edge between results
# epsilon is the 3rd percentile of the pairwise distances in lower_triangle, i.e. the
# distance below which the closest 3% of the pairs fall.
epsilon = np.percentile(lower_triangle, 3)
epsilon

In [None]:
#make a list of edges from the adjacency matrix
import networkx as nx

# Two results are adjacent when their distance is below epsilon.
adj_matrix = dist_matrix<epsilon #make adjacency matrix
# Scan the whole matrix instead of a fixed 1000x1000 window so the cell works for any
# number of results. np.argwhere yields the True positions in row-major order; the
# diagonal (i == j) is skipped, and indices are converted to plain ints so that node ids
# remain JSON-serialisable later on.
edges = [(int(i), int(j)) for i, j in np.argwhere(adj_matrix) if i != j]
G = nx.Graph(edges)
#number of connected components in Graph
len(list(nx.connected_components(G)))

In [None]:
#remove all the vertices with a degree of 80 or more
DEGREE_CUTOFF = 80
# dict(...) accepts both return types of G.degree(): a plain dict (networkx 1.x) and a
# view that iterates over (node, degree) pairs and has no .items() (networkx 2+).
degrees = dict(G.degree())
low_degree = [node for node, degree in degrees.items() if degree < DEGREE_CUTOFF]
high_degree = [node for node, degree in degrees.items() if degree >= DEGREE_CUTOFF]
# Sentences of the highly connected nodes, kept for inspection.
most_connected_sentences = np.array(results)[high_degree]
# H is the graph restricted to the low-degree nodes.
H = G.subgraph(low_degree)
len(high_degree)

In [None]:
#makes json which is important for graph visualization
# Now you can go and fire up index.html in the browser
import json

# One entry per node of H (id, a constant group, and the sentence text) and one entry per
# edge of H.
graph = {}
graph['nodes'] = [{'id': node, 'group': 1, 'text':results[node]} for node in H.nodes()]
graph['links'] = [{'source': i, 'target': j, 'value': 1} for (i,j) in H.edges()]

# The context manager closes graph.json even if serialisation raises.
with open('graph.json', 'w') as f:
    json.dump(graph, f)

In [None]:
# Size of the subgraph H as an (edge count, node count) pair.
H.number_of_edges(), H.number_of_nodes()

## Finding meaning within clusters

In [None]:
from community import community_louvain
#this looks more like the partition that the code in js gives as opposed to best_partition
# d is the Louvain dendrogram of H: a list with one partition per level.
d = community_louvain.generate_dendrogram(H)
# Level 2 is the preferred cut, but the dendrogram can have fewer than three levels, in
# which case asking for level 2 fails; fall back to the last level that exists.
level = min(2, len(d) - 1)
partition = community_louvain.partition_at_level(d, level)

In [None]:
# Invert the node -> community mapping: communities[k] lists every node that the
# partition assigns to community k.
community_count = max(partition.values()) + 1
communities = [[] for _ in range(community_count)]
for node, com in partition.items():
    communities[com].append(node)

In [None]:
# How many communities there are of each size.
import pandas as pd
import nltk

community_sizes = pd.Series(list(map(len, communities)))
community_sizes.value_counts()

In [None]:
#try to find some common keywords within communities
# com5 holds the node indices of the community stored at position 4.
# NOTE(review): the size quoted below comes from one particular run — verify it against
# the current partition before relying on it.
com5 = communities[4] #community of size 70

In [None]:
# Look up the result sentences whose indices belong to the selected community.
sentences_in_community = np.array(results)[com5]

In [None]:
from nltk.corpus import stopwords
from string import punctuation
import itertools

def clean_sentence(s):
    """Tokenize a sentence and drop English stop words and punctuation tokens.

    Parameters
    ----------
    s : str
        Sentence to clean; it is lower-cased before tokenization.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    # Build the stop-word lookup once per call instead of once per token; a set also
    # makes each membership test constant time.
    stop_words = set(stopwords.words('english'))
    tokens = (tok.strip() for tok in nltk.word_tokenize(s.lower()))
    # A token counts as punctuation when every character is punctuation. This also
    # removes multi-character tokens such as "..." that a substring test against the
    # punctuation string would let through. Empty tokens are dropped as well.
    return [tok for tok in tokens
            if tok not in stop_words and not all(ch in punctuation for ch in tok)]

#Find most relevant words

def make_dictionary(docs):
    """Map every distinct item found in ``docs`` to its position in sorted order.

    ``docs`` is a list of documents, each an iterable of items (words or n-gram
    tuples). The result is a dict ``{item: index}`` whose indices start at 0 and
    follow the ascending sort order of the items.
    """
    vocabulary = sorted(set(itertools.chain.from_iterable(docs)))
    return {item: position for position, item in enumerate(vocabulary)}

def count_instances(map_, doc):
    """Count how often each dictionary entry occurs in ``doc``, plus one.

    ``map_`` maps items to array positions and ``doc`` is a list of items. The
    returned float array has one slot per entry of ``map_``; the slot of an item
    holds its number of occurrences in ``doc`` increased by 1, so an item that
    never occurs gets 1.
    """
    occurrences = np.zeros(len(map_))
    for token in map_:
        occurrences[map_[token]] = 1 + doc.count(token)
    return occurrences

def find_ngrams(input_list, n):
    """Return the n-grams of a token list as tuples.

    Parameters
    ----------
    input_list : list
        Tokens to slide over.
    n : int, or list/tuple of int
        A single n-gram size, or several sizes whose n-grams are concatenated
        in the order the sizes are given.

    Returns
    -------
    list of tuple
        Every run of ``n`` consecutive tokens, in order of appearance. The list
        is empty when ``input_list`` is shorter than the requested size.
    """
    def _ngrams_of_size(size):
        # Zip `size` progressively shifted views of the list; zip stops at the
        # shortest view, which yields exactly the complete n-grams.
        return list(zip(*[input_list[start:] for start in range(size)]))

    if isinstance(n, (list, tuple)):
        grams = []
        for size in n:
            grams.extend(_ngrams_of_size(size))
        return grams
    return _ngrams_of_size(n)

In [None]:
# Ten highest-scoring 4- and 5-grams among the sentences of community 4.
sentences_in_cluster = np.array(results)[communities[4]]
ngrams = []
for sentence in sentences_in_cluster:
    ngrams.append(find_ngrams(clean_sentence(sentence), [4, 5]))
dictionary = make_dictionary(ngrams)
# Start from ones and add the per-sentence counts (count_instances itself adds 1 to
# every entry for each sentence).
a = np.ones(len(dictionary))
for sentence_ngrams in ngrams:
    a = a + count_instances(dictionary, sentence_ngrams)
# Positions ordered from the highest score to the lowest.
best_scored = np.argsort(a)[::-1]
# Reverse lookup: array position -> n-gram.
rev_dict = {position: gram for gram, position in dictionary.items()}
[rev_dict[position] for position in best_scored[:10]]

In [None]:
# Ten highest-scoring 4- and 5-grams among the sentences of community 5.
sentences_in_cluster = np.array(results)[communities[5]]
ngrams = list(
    find_ngrams(clean_sentence(sentence), [4, 5]) for sentence in sentences_in_cluster
)
dictionary = make_dictionary(ngrams)
# Baseline of ones plus the count vector of every sentence (each count vector already
# carries a +1 per entry from count_instances).
a = np.ones(len(dictionary))
for sentence_ngrams in ngrams:
    a = a + count_instances(dictionary, sentence_ngrams)
# Array positions sorted by decreasing score.
best_scored = np.argsort(a)[::-1]
# Map each array position back to its n-gram.
rev_dict = {position: gram for gram, position in dictionary.items()}
[rev_dict[position] for position in best_scored[:10]]

## sklearn to the rescue + wordcloud implementation

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import Counter
import operator

# CountVectorizer configured to ignore English stop words; only its analyzer is used
# below, the vectorizer is never fitted in this section.
vectorizer = CountVectorizer(min_df=1, stop_words='english')
# analyze(text) is the vectorizer's text-to-tokens callable.
# NOTE(review): no ngram_range is passed, so this presumably yields single words only —
# confirm if real n-grams are wanted by top_ngrams.
analyze = vectorizer.build_analyzer()

def top_ngrams(communities,cluster, n=20):
    """Return the most frequent analyzer tokens of one community.

    Parameters
    ----------
    communities : list of lists
        Each inner list holds indices into the notebook-level ``results``.
    cluster : int
        Index of the community to inspect.
    n : int
        Maximum number of entries to return.

    Returns
    -------
    list of (token, count) tuples
        At most ``n`` pairs, ordered from the highest count to the lowest.
    """
    counts = Counter()
    for sentence in np.array(results)[communities[cluster]]:
        counts.update(analyze(sentence))

    # Sort ascending by count, then flip, so the largest counts come first.
    ranked = sorted(counts.items(), key=lambda pair: pair[1])
    ranked.reverse()
    return ranked[:n]

def get_wordcloud_json(ngram):
    """Convert (word, count) pairs into a list of word-cloud entries.

    Each pair becomes a dict ``{"text": word, "size": count}``; the order of the
    input is preserved.
    """
    return [{"text": word, "size": count} for word, count in ngram]

In [None]:
# Write the 50 most frequent tokens of community 8 to wordcloud.json.
wordcloud = get_wordcloud_json(top_ngrams(communities,8, n=50))
# The context manager closes the file even if serialisation raises.
with open('wordcloud.json', 'w') as f:
    json.dump(wordcloud, f)

## Top verbs with spacy

In [None]:
# Create the spaCy English pipeline used by the verb and adjective helpers below.
# NOTE(review): the `spacy.en` module path presumably only exists in older spaCy
# releases — confirm the installed version before running this cell.
from spacy.en import English
nlp = English()

In [None]:
#clustering using verbs
def get_verb_counts(cluster):
    """Count the lemma of each sentence root across one community.

    ``cluster`` indexes the notebook-level ``communities``. Every result of that
    community is parsed with ``nlp`` and, for each sentence found, the lemma of
    the sentence's root token is tallied.

    Returns a dict mapping lemma -> number of sentences it is the root of.
    """
    verbs = {}
    for post in np.array(results)[communities[cluster]]:
        doc = nlp(str(post))
        for sentence in doc.sents:
            lemma = sentence.root.lemma_
            verbs[lemma] = verbs.get(lemma, 0) + 1
    return verbs

# def find_different_elements(s1, s2):
#     """Given two lists, returns two lists with the sets minus their intersection"""
#     s1 = set(s1)
#     s2 = set(s2)
#     return list(s1.difference(s1.intersection(s2))), list(s2.difference(s1.intersection(s2))) 

def find_top_counts(dict_, n=10):
    """Return the ``n`` keys of a frequency dictionary with the highest counts.

    The keys are ordered from the largest count to the smallest; at most ``n``
    keys are returned.
    """
    # Sort ascending by count, then flip so the most frequent key comes first.
    ascending = sorted(dict_.items(), key=operator.itemgetter(1))
    ascending.reverse()
    return [key for key, _count in ascending[:n]]

## Top adjectives with spacy

In [None]:
def get_adj_counts(cluster):
    """Count the adjectival-modifier lemmas used in one community.

    ``cluster`` indexes the notebook-level ``communities``. Every result of that
    community is parsed with ``nlp``; a token is counted when its part of speech
    is ``ADJ`` and its dependency label is ``amod``.

    Returns a dict mapping lemma -> number of occurrences.
    """
    adjs = {}
    for post in np.array(results)[communities[cluster]]:
        doc = nlp(str(post))
        for sentence in doc.sents:
            for w in sentence:
                # Skip everything that is not an adjective acting as a modifier.
                if w.pos_ != 'ADJ' or w.dep_ != 'amod':
                    continue
                adjs[w.lemma_] = adjs.get(w.lemma_, 0) + 1
    return adjs

In [None]:
#Get tags for all clusters that have a size bigger than 20
# Every community gets an entry, so tags[c] always describes community c; communities
# with 20 members or fewer keep the empty defaults.
tags = []
for c in range(len(communities)):
    # Named `tag` rather than `d` so the loop does not overwrite the dendrogram kept in `d`.
    tag = {'cluster':c, 'top_verbs':[], 'verb_counts':[],
           'top_adjs':[], 'adjs_counts':[], 'top_adjs_counts':[]}
    if len(communities[c])>20:
        verb_counts = get_verb_counts(c)
        top_verbs = find_top_counts(verb_counts, 20)
        top_verb_counts = [verb_counts[x] for x in top_verbs]
        tag['top_verbs'] = top_verbs
        tag['verb_counts'] = top_verb_counts
        adj_counts = get_adj_counts(c)
        top_adj = find_top_counts(adj_counts, 20)
        top_adj_counts = [adj_counts[x] for x in top_adj]
        tag['top_adjs'] = top_adj
        # The adjective counts used to be written only under 'top_adjs_counts', which
        # left the declared 'adjs_counts' key permanently empty. Fill the declared key
        # and keep the other name as an alias, so every entry has the same set of keys.
        tag['adjs_counts'] = top_adj_counts
        tag['top_adjs_counts'] = list(top_adj_counts)
    tags.append(tag)

## New graph

In [None]:
#using clustering from python and using verb as text that appears
import json

# Each node is coloured by its community and labelled with that community's tags.
graph = {}
graph['nodes'] = []
for node in H.nodes():
    tag = tags[partition[node]]
    # find_top_counts orders 'top_verbs' from most to least frequent, so the leading
    # slice gives the five most frequent verbs (a trailing slice would label the least
    # frequent of the list as "Top verbs").
    text = ("Top verbs: " + ", ".join(tag['top_verbs'][:5])
            + "\n" + "Top adjectives: " + ", ".join(tag['top_adjs']))
    graph['nodes'].append({'id': node, 'group': partition[node], 'text': text})
graph['links'] = [{'source': i, 'target': j, 'value': 1} for (i,j) in H.edges()]

# The context manager closes graph.json even if serialisation raises.
with open('graph.json', 'w') as f:
    json.dump(graph, f)

In [None]:
# TODO:
# 1. Look at adjectives.
# 2. Look at modifiers of adjectives, using the dependency tree from spaCy.
#    - unicode
# 3. Lemmatize verbs.