In [17]:
import glob
import operator
import os
import re
import string
import sys

import numpy as np
import pandas as pd
from dateutil import *
from dateutil.relativedelta import *
from dateutil import parser

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Shared TF-IDF vectorizer (default settings); later cells call fit_transform()/fit() on it.
tfidf = TfidfVectorizer()

In [3]:
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
# Shared tokenizer instance; tokenize() below splits tweet text with it.
tknzr = TweetTokenizer()

In [4]:
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
from sklearn import metrics

In [5]:
def tokenize(text):
    """Turn one tweet's text into a list of filtered tokens.

    Steps, in order:
      1. hyphens between word characters become spaces;
      2. a possessive "'s" is dropped;
      3. "JUST IN", "BBCBreaking", "SkyNewsBreak" and "cnnbrk" are all
         rewritten to the single marker "BREAKING";
      4. every character in string.punctuation is removed;
      5. the text is split with the module-level ``tknzr``;
      6. English stop words and the tags 'rt', 'via', 'ajenews', 'ap' are
         dropped (compared in lower case).

    text -- the tweet text.
    Returns the surviving tokens in their original order and casing.
    """
    text = re.sub(r'\b-\b', " ", text) # removing hyphens
    text = re.sub(r'\'s\b', '', text) # removing "'s"
    text = re.sub(r'((JUST IN)|BBCBreaking|SkyNewsBreak|cnnbrk)', "BREAKING", text)
    text = "".join([ch for ch in text if ch not in string.punctuation])
    # Build the exclusion set once per call instead of asking the corpus
    # reader for the stop-word list again for every single token.
    excluded = set(stopwords.words('english'))
    excluded.update(['rt', 'via', 'ajenews', 'ap'])
    return [word for word in tknzr.tokenize(text) if word.lower() not in excluded]

In [10]:
# Working directory that holds the tweets.*.csv / results*.csv files.
# Set the TWEETS_DIR environment variable to point the notebook at another
# folder without editing this cell; the original location is the fallback.
path = os.environ.get('TWEETS_DIR', '/Users/mishakobiliansky/Documents/datascience/nlp/twitter-news/archive/Oct_12b') # specify your path here
os.chdir(path)

In [11]:
# settings

# --- input window ---
# max number of files to read (6 = 1 hour worth)
num_files = 6
# number of tweets sufficient to begin analysis
num_tweets = 100

# --- clustering ---
# clustering method ('AP' = affinity propagation, 'KM' = KMeans, 'SC' = spectral clustering)
cluster_method = 'AP'
# affinity propagation damping parameter
damping = 0.75
# number of clusters
n_clusters = 25

# --- cluster selection ---
# keyword for breaking news
keyword = 'BREAKING'
# count of unique agencies in a cluster to be considered
threshold = 5
# retweet per second ratio threshold
retweet_threshold = 0.3
# number of results to send per day
max_results = 5

### make dataframe

In [12]:
# get last batch number
# (the batch number is the second dot-separated field of each file name)
file_numbers = [int(fname.split(".")[1]) for fname in glob.glob('tweets.*[0-9].*.csv')]
last_batch = max(file_numbers)

In [398]:
#if needed run on a batch other than latest
#last_batch = 302

In [13]:
# counting result files in folder
sent_result = 0
for name in glob.glob('results[0-9].csv'):
    sent_result +=1
print sent_result

2


In [14]:
# reading files
batch_nums = [int(name.split(".")[1]) for name in glob.glob('tweets.*[0-9].*.csv') if int(name.split(".")[1]) <= last_batch]
last_tweets = []
for batchnum in sorted(batch_nums, reverse=True):
    currentfile = glob.glob('tweets.'+str(batchnum)+'.*.csv')[0]
    last_tweets.append(currentfile)
    if len(last_tweets) == 1:
        df_tweets = pd.read_csv(currentfile)
    else:
        df_tweets = pd.concat([df_tweets, pd.read_csv(currentfile)], ignore_index=True)
    if len(df_tweets) >= num_tweets or len(last_tweets) >= num_files: break    

# making dataframe
df_from_each_file = (pd.read_csv(f) for f in last_tweets)
df_tweets = pd.concat(df_from_each_file, ignore_index=True)
print len(df_tweets), 'tweets in dataframe'

412 tweets in dataframe


In [15]:
# Work on a copy so that df_tweets keeps the rows exactly as they were read from the CSV files.
clean_df = df_tweets.copy()

In [18]:
### time difference between tweet download and tweet created
# Hours subtracted from 'created_at' to put it on the same clock as 'downloaded_at'.
# NOTE(review): presumably 'created_at' is UTC and 'downloaded_at' is local time -- confirm.
utc_offset_hours = 7

# Timezone information in the strings is ignored (ignoretz=True), so all values are naive datetimes.
clean_df['time_created'] = clean_df['created_at'].apply(parser.parse, ignoretz=True)
clean_df['time_created_local'] = clean_df['time_created'] - pd.Timedelta(hours=utc_offset_hours)
# 'downloaded_at' is overwritten in place; astype(str) lets the cell be re-run
# after the column has already been converted to datetimes.
clean_df['downloaded_at'] = clean_df['downloaded_at'].astype(str).apply(parser.parse, ignoretz=True)
# Age of each tweet, in seconds, at the moment it was downloaded.
clean_df['delta'] = (clean_df['downloaded_at'] - clean_df['time_created_local']).dt.total_seconds()
# Retweets per second of tweet age (a delta of 0 gives inf/NaN here).
clean_df['retweet_ratio'] = clean_df['retweet_count'] / clean_df['delta']

In [326]:
result_id = []
# if previous result files exist, make sure sent tweets are in current dataset
for name in glob.glob('results[0-9].csv'):
    result_df = pd.read_csv(name)
    result_id.append(int(result_df['id'][0]))
    if int(result_df['id'][0]) not in clean_df['id'].tolist():
        # add tweet to dataset
        print 'adding tweet'
        print result_id, result_df['text'][0]
        clean_df = pd.concat([result_df,clean_df])

In [19]:
# sanity check: (rows, columns) of the working frame
clean_df.shape

(412, 11)

In [20]:
# removing urls from tweet text
# Only the whitespace in front of a URL is removed together with it; the
# whitespace behind it is kept, so a link in the middle of a tweet no longer
# glues the words on either side together ("news https://x more" -> "news more").
# strip() then drops whitespace left over at the very start or end of the text.
clean_df['text'] = clean_df['text'].apply(lambda x: re.sub(r'\s*https?:\S+', '', x).strip())
# convert to unicode
clean_df['utext'] = clean_df['text'].apply(lambda x: unicode(x, 'utf-8'))
# remove non-ascii characters
clean_df['utext'] = clean_df['utext'].apply(lambda x: re.sub(r'[^\x00-\x7F]+', "", x))
# remove empty rows
# (reset_index() renumbers the rows and keeps the previous index as an 'index' column)
clean_df = clean_df[clean_df['utext'].map(len) > 0].reset_index()

In [21]:
# tokenize
# Run tokenize() over the cleaned ASCII text of every tweet and store its return value in 'tokens'.
clean_df['tokens'] = clean_df['utext'].apply(tokenize)

In [None]:
# tagging
#from nltk import pos_tag
#clean_df['tags'] = clean_df['tokens'].apply(pos_tag)

# leave nouns only
#clean_df['clean_tags'] = clean_df['tags'].apply(lambda x: [tag[0] for tag in x if tag[1] in ['NN','NNS','NNP','NNPS','CC','CD']])

In [23]:
# generate similarity matrix
# Each tweet is turned back into one space-separated string of its tokens,
# vectorised with TF-IDF and compared with every other tweet.
text = clean_df['tokens'].apply(' '.join).tolist() #based on tokens
#text = [' '.join(i) for i in clean_df['clean_tags']] #based on tags (nouns)
vectors = tfidf.fit_transform(text) # based on current dataset
#vectors = all_tfidf.transform(text) # based on all tweet vocabulary
cosine_matrix = cosine_similarity(vectors)
cosine_matrix.shape

(412, 412)

### clustering

In [36]:
# affinity propagation
af = AffinityPropagation(affinity='precomputed', damping=0.75).fit(cosine_matrix)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_
clean_df['cluster'] = labels
n_clusters = len(cluster_centers_indices)
print n_clusters, 'clusters'

83 clusters


In [28]:
# k-means
km = KMeans(n_clusters = n_clusters).fit(cosine_matrix)
cluster_centers = km.cluster_centers_
labels = km.labels_
clean_df['cluster'] = labels
n_clusters = len(cluster_centers)
print n_clusters, 'clusters'

83 clusters


In [32]:
# spectral clustering
sc = SpectralClustering(affinity = 'precomputed', n_clusters = n_clusters).fit(cosine_matrix)
labels = sc.labels_
clean_df['cluster'] = labels



### choosing cluster

In [33]:
# the clusters
# Per cluster: number of tweets ('id'), number of distinct agencies and mean retweet ratio.
cluster_summary_spec = {'id': len, 'agency': 'nunique', 'retweet_ratio': 'mean'}
df_clusters = clean_df.groupby('cluster').agg(cluster_summary_spec)
df_clusters.sort_values(by='retweet_ratio', ascending=False).head()

Unnamed: 0_level_0,agency,id,retweet_ratio
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
52,3,4,0.584346
11,3,5,0.498681
22,2,2,0.286934
55,4,5,0.262771
9,6,6,0.093402


In [34]:
# Mean retweet ratio per cluster, as a two-column frame (cluster, retweet_ratio).
cluster_ratio = clean_df.groupby('cluster')['retweet_ratio'].mean().reset_index()
#cluster_size = clean_df.groupby('cluster').size().reset_index()
# The four clusters with the highest mean ratio.
top4_clusters = cluster_ratio.sort_values(by='retweet_ratio', ascending=False).head(4)

In [35]:
for cluster in top4_clusters['cluster']:
    for entry in clean_df['text'][clean_df['cluster'] == cluster]:
        print cluster, entry
    print '======================================================'

52 RT @BBCBreaking: Thai King Bhumibol Adulyadej "passed away peacefully," palace says

52 JUST IN: Sec. Kerry on passing of King Bhumibol: "The US stands with the people of Thailand at this difficult time.…
52 JUST IN: Pres. Obama on the passing of King Bhumibol: "Today we hold the Thai people in our thoughts and prayers."…
52 World leaders and royalty mourn the death of Thai King Bhumibol Adulyadej
11 RT @BBCBreaking: He ruled for 70 years - the life of Thai King Bhumibol Adulyadej, who has died
11 Thailand's King Bhumibol Adulyadej's life in pictures
11 RT @AJENews: Life in pictures: Thai King Bhumibol Adulyadej
11 When LIFE photographed Thai King Bhumibol Adulyadej in 1960
11 Life in pictures: King Bhumibol Adulyadej of Thailand
22 BREAKING: Thai prime minister says Crown Prince Maha Vajiralongkorn will be the new monarch in accordance with the constitution.
22 RT @BBCBreaking: Crown Prince Maha Vajiralongkorn to succeed Thai king, prime minister says
55 Thailand's Royal Palace say

In [350]:
# Manual inspection of a single cluster (number 8 is hard-coded here), lowest retweet_ratio first.
clean_df[clean_df['cluster'] == 8][['id','tokens','agency','retweet_count','retweet_ratio']].sort_values('retweet_ratio')

Unnamed: 0,id,tokens,agency,retweet_count,retweet_ratio
326,786586055134834688,"[American, Honey, review, highway, sell]",The Guardian,0,0.0
339,786577873922625536,"[American, Honey, review, highway, sell]",The Guardian,6,0.003052


### cluster selecting

In [275]:
# remove previously sent clusters, if any
result_cluster = []
print 'removing previous clusters'
for x in result_id:
    result_cluster.append(int(clean_df[clean_df['id'] == x]['cluster'][:1]))
clean_df = clean_df[~clean_df['cluster'].isin(result_cluster)]

removing previous clusters
790614093602361344
7
7
790636962289946624
3
3


In [277]:
# unique agencies => breaking keyword => retweet ratio
chosen_cluster = None
big_clusters = []
breaking_clusters = []
high_retweet_clusters = []
# 1. Count distinct agencies in a cluster
cluster_users = clean_df.groupby('cluster').agency.nunique().reset_index()
max_users_cluster = max(cluster_users['agency'])
print 'max unique agencies in a cluster: %d' %max_users_cluster
big_clusters = cluster_users['cluster'][cluster_users['agency'] >= threshold].tolist()
df_chosen = clean_df[clean_df['cluster'].isin(big_clusters)]
if big_clusters:
	print 'identified %d clusters with more than %d unique agencies' %(len(big_clusters),threshold)
	# 2. At least 1 instances of "BREAKING"
	for cluster in big_clusters:
		for entry in df_chosen[df_chosen['cluster'] == cluster]['tokens']:
			if keyword in ' '.join(entry):
				if cluster not in breaking_clusters:
					breaking_clusters.append(cluster)
	if breaking_clusters:
		print 'identified %d breaking-news clusters' % len(breaking_clusters)
		df_chosen = df_chosen[df_chosen['cluster'].isin(breaking_clusters)]
		# 3. Retweet_ratio is high
		for cluster in breaking_clusters:
			if np.mean(df_chosen[df_chosen['cluster'] == cluster]
				.sort_values('retweet_ratio', ascending = False)[:3]['retweet_ratio']) >= retweet_threshold:
				high_retweet_clusters.append(cluster)
		if high_retweet_clusters:
			df_chosen = df_chosen[df_chosen['cluster'].isin(high_retweet_clusters)]
			print 'identified %d clusters with high retweet ratio' % len(high_retweet_clusters)
			average_ratios = []
			for cluster in high_retweet_clusters:
				average = np.mean(df_chosen[df_chosen['cluster'] == cluster]
					.sort_values('retweet_ratio', ascending = False)[:3]['retweet_ratio'])
				average_ratios.append(average)
			print 'retweet ratios (top 3 tweets):', average_ratios
			# choose cluster with highest retweet ratio
			chosen_cluster = int(df_chosen.sort_values('retweet_ratio', ascending = False)[:1]['cluster'])
			df_chosen = df_chosen[df_chosen['cluster'] == chosen_cluster]
			# choosing best tweet
			df_chosen = df_chosen.sort_values('retweet_ratio', ascending = False)[:1]
			result = df_chosen[['id','text','url','agency','created_at']][df_chosen['retweet_ratio'] == max(df_chosen['retweet_ratio'])]
			sent_result +=1
			result_name = 'results' +str(sent_result)
			result.to_csv(result_name +'.csv')
			for entry in result['text']:
				print 'selected tweet: %s' % (entry)
		else: print 'no clusters with high enough retweet ratio identified'
	else: print 'no breaking-news clusters identified'
else: print 'no large enough clusters identified'

max unique agencies in a cluster: 20
identified 4 clusters with more than 5 unique agencies
no breaking-news clusters identified


In [None]:
# unique agencies => retweet ratio
chosen_cluster = None
big_clusters = []
#breaking_clusters = []
high_retweet_clusters = []

# 1. At least 5 distinct agencies in a cluster
threshold = 5
cluster_users = clean_df.groupby('cluster').agency.nunique().reset_index()
max_users_cluster = max(cluster_users['agency'])
print 'max unique agencies in a cluster: %d' %max_users_cluster
big_clusters = cluster_users['cluster'][cluster_users['agency'] >= threshold].tolist()
df_chosen = clean_df[clean_df['cluster'].isin(big_clusters)]
if big_clusters:
    print 'identified %d clusters with more than %d unique agencies' %(len(big_clusters),threshold)
    # 3. Retweet_ratio is high
    retweet_threshold = 0.2
    for cluster in big_clusters:
        if np.mean(df_chosen[df_chosen['cluster'] == cluster]
                .sort_values('retweet_ratio', ascending = False)[:3]['retweet_ratio']) >= retweet_threshold:
            high_retweet_clusters.append(cluster)
    if high_retweet_clusters:
        df_chosen = df_chosen[df_chosen['cluster'].isin(high_retweet_clusters)]
        print 'identified %d clusters with high retweet ratio' % len(high_retweet_clusters)
        average_ratios = []
        for cluster in high_retweet_clusters:
            average = np.mean(df_chosen[df_chosen['cluster'] == cluster]
                .sort_values('retweet_ratio', ascending = False)[:3]['retweet_ratio'])
            average_ratios.append(average)
        print 'retweet ratios (top 3 tweets):', average_ratios
        # choose cluster with highest retweet ratio
        chosen_cluster = int(df_chosen.sort_values('retweet_ratio', ascending = False)[:1]['cluster'])
        df_chosen = df_chosen[df_chosen['cluster'] == chosen_cluster]
        # choosing best tweet
        df_chosen = df_chosen.sort_values('retweet_ratio', ascending = False)[:1]
        result = df_chosen[['id','text','url','agency','created_at']][df_chosen['retweet_ratio'] == max(df_chosen['retweet_ratio'])]
        sent_result +=1
        result_name = 'results' +str(sent_result)
        result.to_csv(result_name +'.csv', index = False)
        for entry in result['text']:
            print 'selected tweet: %s' % (entry)
    else: print 'no clusters with high enough retweet ratio identified'
else: print 'no large enough clusters identified'

### testing

In [None]:
### using mitie: name entity extraction
from mitie import *
from collections import defaultdict

mitie_path = '/Applications/anaconda_python_2.7/anaconda/lib/python2.7/site-packages/mitie'
sys.path.append(mitie_path)

print "loading NER model..."
ner = named_entity_extractor(mitie_path +'/MITIE-models/english/ner_model.dat')
print "\nTags output by this NER model:", ner.get_possible_ner_tags()

def entity_extraction(tokens):
    """Return the named entities found in one tokenized tweet.

    tokens -- list of tokens for a single tweet.

    The module-level ``ner`` extractor is run over the tokens; for every
    entity it reports, the tokens at the positions given by the entity's
    first field are joined with single spaces.  Returns the list of those
    strings, in the order the extractor produced them.
    """
    tweet_entities = []
    for found in ner.extract_entities(tokens):
        token_positions = found[0]
        tweet_entities.append(" ".join(tokens[i] for i in token_positions))
    return tweet_entities

# extract entities
clean_df['entities'] = clean_df['tokens'].apply(entity_extraction)
# wrap every entity in underscores, e.g. 'x' -> '_x_'
clean_df['_entities_'] = clean_df['entities'].apply(
    lambda found: [''.join(['_', entry, '_']) for entry in found])
# per tweet: the plain tokens followed by the underscore-wrapped entities
clean_df['token_entities'] = clean_df['tokens'].add(clean_df['_entities_'])

### all tweets

In [7]:
def _read_tweet_files(pattern):
    """Read every CSV matching the glob `pattern` into one DataFrame.

    Returns an empty DataFrame when nothing matches, because pd.concat
    raises ValueError when it is given no frames at all.
    """
    frames = [pd.read_csv(f) for f in glob.glob(pattern)]
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

all_tweets = []
# tweet files in the current directory ...
df_root = _read_tweet_files('tweets.*[0-9].*.csv')
# ... and in every sub-folder of ./archive
df_archive = _read_tweet_files('archive/*/tweets.*[0-9].*.csv')
# One of the two locations may legitimately hold no files; the combined
# frame is then simply the other one.
df_all_tweets = pd.concat([df_root, df_archive], ignore_index=True)

In [8]:
# number of rows in the combined frame (current directory + archive)
len(df_all_tweets)

23699

In [390]:
# removing urls from tweet text
# Only the whitespace in front of a URL is removed together with it; the
# whitespace behind it is kept, so a link in the middle of a tweet no longer
# glues the words on either side together ("news https://x more" -> "news more").
# strip() then drops whitespace left over at the very start or end of the text.
df_all_tweets['text'] = df_all_tweets['text'].apply(lambda x: re.sub(r'\s*https?:\S+', '', x).strip())
# convert to unicode
df_all_tweets['utext'] = df_all_tweets['text'].apply(lambda x: unicode(x, 'utf-8'))
# remove non-ascii characters
df_all_tweets['utext'] = df_all_tweets['utext'].apply(lambda x: re.sub(r'[^\x00-\x7F]+', "", x))
# remove empty rows
# (reset_index() renumbers the rows and keeps the previous index as an 'index' column)
df_all_tweets = df_all_tweets[df_all_tweets['utext'].map(len) > 0].reset_index()

In [391]:
# tokenize
df_all_tweets['tokens'] = df_all_tweets['utext'].apply(tokenize)
all_text = [' '.join(i) for i in df_all_tweets['tokens']] #based on tokens
# Fit a dedicated vectorizer for the full vocabulary.  fit() returns the
# object it was called on, so fitting the shared `tfidf` here would make
# `all_tfidf` the very same object that the similarity cell refits with
# fit_transform() on the current batch.
all_tfidf = TfidfVectorizer().fit(all_text) #TFIDF

In [None]:
with open('keys.txt') as k:
    