In [1]:
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction
import mpld3
from bs4 import BeautifulSoup as bs

In [3]:
#import three lists: attractions, address and wikipedia synopses
attractions = open('LondonAttrList.csv').read().split('\n')
#ensures that only the first 100 are read in
attractions = attractions[:100]

address = open('LondonAddressList.csv').read().split('\n')
address = address[:100]

description = open('LondonAttrDescription.csv').read().split('\n')
description = description[:100]

synopses_clean_wiki = []
for text in description:
    text = bs(text, 'html.parser').getText()
    #strips html formatting and converts to unicode
    synopses_clean_wiki.append(text)

description = synopses_clean_wiki
    
print(str(len(attractions)) + ' attractions')
print(str(len(address)) + ' address')
print(str(len(description)) + ' synopses')


100 attractions
100 address
100 synopses


In [7]:
ranks = []

for i in range(0,len(attractions)):
    ranks.append(i)

In [9]:
# load nltk's English stopwords as variable called 'stopwords'
stopwords = nltk.corpus.stopwords.words('english')

In [10]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [11]:
# here I define a tokenizer and stemmer which returns the set of stems in the text that it is passed

def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [12]:
totalvocab_stemmed = []
totalvocab_tokenized = []
try:
    for i in attractions:
        allwords_stemmed = tokenize_and_stem(i)
        totalvocab_stemmed.extend(allwords_stemmed)

        allwords_tokenized = tokenize_only(i)
        totalvocab_tokenized.extend(allwords_tokenized)
except:
    pass


In [13]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_df=0.9, max_features=200000,
                                 min_df=0.1, stop_words='english',
                                 use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1,3))

%time tfidf_matrix = tfidf_vectorizer.fit_transform(attractions)

print(tfidf_matrix.shape)



CPU times: user 28.2 ms, sys: 0 ns, total: 28.2 ms
Wall time: 27.8 ms
(100, 2)


In [17]:
terms = tfidf_vectorizer.get_feature_names()

In [18]:
from sklearn.metrics.pairwise import cosine_similarity
dist = 1 - cosine_similarity(tfidf_matrix)

In [19]:
from sklearn.cluster import KMeans
num_clusters = 2
km =KMeans(n_clusters=num_clusters)

%time km.fit(tfidf_matrix)
clusters = km.labels_.tolist()

CPU times: user 15.3 ms, sys: 0 ns, total: 15.3 ms
Wall time: 14.5 ms


In [28]:
films = { 'title': attractions, 'rank': address, 'synopsis': description, 'cluster': clusters}

frame = pd.DataFrame(films, index = [clusters] , columns = ['rank', 'title', 'cluster', 'genre'])

In [29]:
frame['cluster'].value_counts()

grouped = frame['rank'].groupby(frame['cluster'])

grouped.mean()

DataError: No numeric types to aggregate

In [31]:


from __future__ import print_function

print("Top terms per cluster:")
print()
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    for ind in order_centroids[i, :6]:
        print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    print()
    print()
    print("Cluster %d titles:" % i, end='')
    for title in frame.ix[i]['title'].values.tolist():
        print(' %s,' % title, end='')
    print()
    print()



Top terms per cluster:

Cluster 0 words: hotel, st,

Cluster 0 titles: Bedford Square, British Telecom Tower, Bloomsbury Square, Coram's Fields, Russell Square, British Museum, Cartoon Museum, Charles Dickens Museum, Foundling Museum, Fitzroy House, Petrie Museum of Egyptian Archaeology, Pollock's Toy Museum, Wellcome Collection, Goodenough College, University College London, BBC Broadcasting House, Birkbeck Cinema, Bloomsbury Theatre, Dominion Theatre, Renoir, Walks, Bloomsbury Festival, London Mathematical Society, Gay's the Word, Judd Books, Marchmont Books, Persephone Books, Photo Books International, Skoob, The Brunswick Centre, Attendant, Mary Ward Cafe, Ravi Shankar, Salaam Namaste, Valtaro Snack Bar, YouMeSushi, Indian YMCA, Great Court Restaurant, Crazy Bear, Hakkasan, Pied Ã  Terre, Sam Smith's Pubs, Fitzroy Tavern, The Yorkshire Grey, The Cock, The Horse and Groom, The Champion, The Blue Posts, The Bricklayers Arms, The College Arms, Bubbledogs, The Jeremy Bentham, The Lord 