In [1]:
import pandas as pd
import numpy as np
import datetime 

vinb = pd.read_csv("vinb2.txt", sep="\t", header=0, encoding= 'utf-8', parse_dates = ["Date"])

#drop tweets sent after 00:15. Show ended at 00:00, time is off by one hour
vinb = vinb[vinb.Date < datetime.datetime(2015, 06, 24, 23, 15, 00)] 
vinb = vinb.reset_index(drop=True) #drops about 15 tweets, reindex so that first tweet is 0
vinb.head()

Unnamed: 0,User,Text,Date,Retweet Count,Reply To
0,Devine147,1 thing I did notice at the water protest was ...,2015-06-24 23:11:04,0,
1,ger_mccann,#vinb If fg policies don't suit yer man from f...,2015-06-24 23:08:42,3,
2,rosecaroline9,#vinb @BGriffinTD congrats on recent arr of y...,2015-06-24 23:07:13,0,
3,strumpetcity,@R2WKillarney Shows like #vinb can often be ch...,2015-06-24 23:06:06,0,R2WKillarney
4,RayMcGrath,I see @IsFearrAnStar is squeezing yet more 'st...,2015-06-24 23:03:57,0,


In [2]:
import nltk
import re
from sklearn import feature_extraction

stopwords = set(nltk.corpus.stopwords.words('english'))

#import lots of cool things!

In [3]:
#the following block is adapted from a bag of words tutorial at https://www.kaggle.com/c/word2vec-nlp-tutorial

#method to clean each tweet
def tweet_to_words(tweet):
    tweet = tweet.lower()    
    tweet = re.sub("&amp;", " ", tweet) #&amp; managed to escape all cleaning and had to be delt with specially    
    tweet = re.sub(r'https?:\/\/([\da-z\.\/-]+)', ' ', tweet) #remove html links    
    noVin = re.sub(r'#vinb', ' ', tweet) #remove #vinb as this appears in every tweet   
    letters_only = re.sub("[^a-zA-Z]", " ", noVin) #remove anything that isn't a letter   
    words = letters_only.split() #split into individual words   
    meaningful_words = [w for w in words if not w in stopwords] #include only words NOT in stopwords  
    clean_tweet = (" ".join(meaningful_words)) #reasemble tweet
    return clean_tweet #return the cleaned tweet

In [4]:
clean_tweets = []

for tweet in vinb.Text:
    clean = tweet_to_words(tweet)
    clean_tweets.append(clean)

A lot of the following code is taken from or inspired by this excellent <a href = "http://brandonrose.org/clustering", target="_blank">Document Clustering</a> tutorial

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(use_idf=True, ngram_range =(1,3))

train_data_features = vectorizer.fit_transform(clean_tweets)

In [6]:
terms = vectorizer.get_feature_names()

In [21]:
from sklearn.cluster import KMeans

num_clusters = 15

km = KMeans(n_clusters=num_clusters)
km.fit(train_data_features)

clusters = km.labels_.tolist()

In [22]:
clusterframe = pd.DataFrame(clusters, columns = ["cluster"]) #turns the list of clusters into a dataframe
clustered_debate = pd.concat([vinb, clusterframe], axis = 1) #combines the tweets with  the clusters

In [23]:
clustered_debate.head()

Unnamed: 0,User,Text,Date,Retweet Count,Reply To,cluster
0,Devine147,1 thing I did notice at the water protest was ...,2015-06-24 23:11:04,0,,9
1,ger_mccann,#vinb If fg policies don't suit yer man from f...,2015-06-24 23:08:42,3,,13
2,rosecaroline9,#vinb @BGriffinTD congrats on recent arr of y...,2015-06-24 23:07:13,0,,13
3,strumpetcity,@R2WKillarney Shows like #vinb can often be ch...,2015-06-24 23:06:06,0,R2WKillarney,5
4,RayMcGrath,I see @IsFearrAnStar is squeezing yet more 'st...,2015-06-24 23:03:57,0,,14


In [24]:
from __future__ import print_function

print("Top terms per cluster:")
print()
#sort cluster centers by proximity to centroid
order_centroids = km.cluster_centers_.argsort()[:, ::-1] #from start to finish, reverse array

for i in range(num_clusters):
    print("Cluster %d words:" % i, end='') 
    
    for ind in order_centroids[i, :5]: #will print 5 most common words
        print(' %s' % terms[ind], end=',') 
    print() 
    print('Length: %d' % len(clustered_debate.Text[clustered_debate.cluster == i])) #prints cluster length, ie no of tweets in each cluster
    print()

Top terms per cluster:

Cluster 0 words: one, ireland, would, tv, fuck,
Length: 53

Cluster 1 words: thornton, fella, grant, grant thornton, name,
Length: 24

Cluster 2 words: peoplesdebate, people, great, kerry, day,
Length: 97

Cluster 3 words: pipe, peoplesdebate pipe, peoplesdebate, pipe peoplesdebate, irishwater,
Length: 18

Cluster 4 words: rae, healy rae, healy, michael, michael healy rae,
Length: 51

Cluster 5 words: people, labour, kerry, people kerry, kerry people,
Length: 51

Cluster 6 words: ferris, martin, martin ferris, man, well,
Length: 48

Cluster 7 words: post, post offices, offices, good, re,
Length: 60

Cluster 8 words: tralee, rose, rose tralee, way, brogue tralee,
Length: 18

Cluster 9 words: get, jackie, jackie crowe, crowe, right,
Length: 85

Cluster 10 words: healy raes, raes, healy, may, il,
Length: 19

Cluster 11 words: debate, back, tonight, back door, debate tonight,
Length: 22

Cluster 12 words: health, mental, mental health, ff, health service,
Length: 36

With no ngrams_range set (default of 1) there was a tendency for an "uber-cluster" to appear. Even at 30 clusters one of them contained over 33% of the tweets. However, the tweets in the other clusters seemed to be very strongly correlated. In the above, with ngrams_range set to (1, 3) the important words seems a lot better but the clusters themselves often contain very disparate tweets. 

Below are sample tweets from each of the clusters.

In [25]:
for i in range(num_clusters):
    print("Cluster %d words:" % i, end='')
    for ind in order_centroids[i, :5]: 
        print(' %s' % terms[ind], end=',')
    print()
    for text in pd.DataFrame(clustered_debate.Text[clustered_debate.cluster == i]).Text.head(10):
        print(text)
    print()
    print()

Cluster 0 words: one, ireland, would, tv, fuck,
#vinb if fg,lab or ff where building companies, would u trust them to build ur home.
Love how mental Ireland is...TV Political debate getting heated ...RIGHT enough of that, bring out the music &amp; dancers for a session! #vinb
Ireland Debt Clock :: National Debt of Ireland http://t.co/gc2aEwIjk9 #vinb
BREAKING   #TV3 announce a new talent search show, being led by Vincent Brown in the Simon Cowell role, other judges, M'Lod, O'Murchu #vinb
none of them would get a summer job in a hole #vinb ticks not politics
#vinb @TonightWithVinB TV3 could devote a lot more time to historical documentary, enough with the crime specials.
Independence Day my eye! Ireland is not sovereign! Bleating on about the past is not going to help the future #VINB
Wouldn't it be intriguing, if Ireland for one moment, sought to look forward to a history that beckons....?  #vinb
What the fuck had Kerry to do with 1916? Have I missed something, did something actually h

In the following block change the number on the first row to that of the cluster you want and change the number on the second row to the number of tweets from that cluster that you want. The below example prints out 10 tweets from cluster0

In [26]:
display = pd.DataFrame(clustered_debate.Text[clustered_debate.cluster == 0])
for text in display.Text.head(10):
    print(text)

#vinb if fg,lab or ff where building companies, would u trust them to build ur home.
Love how mental Ireland is...TV Political debate getting heated ...RIGHT enough of that, bring out the music &amp; dancers for a session! #vinb
Ireland Debt Clock :: National Debt of Ireland http://t.co/gc2aEwIjk9 #vinb
BREAKING   #TV3 announce a new talent search show, being led by Vincent Brown in the Simon Cowell role, other judges, M'Lod, O'Murchu #vinb
none of them would get a summer job in a hole #vinb ticks not politics
#vinb @TonightWithVinB TV3 could devote a lot more time to historical documentary, enough with the crime specials.
Independence Day my eye! Ireland is not sovereign! Bleating on about the past is not going to help the future #VINB
Wouldn't it be intriguing, if Ireland for one moment, sought to look forward to a history that beckons....?  #vinb
What the fuck had Kerry to do with 1916? Have I missed something, did something actually happen there in 1916  #vinb
You'd swear theres 2 