# Exploration

Help from : https://marcobonzanini.com/2015/05/17/mining-twitter-data-with-python-part-6-sentiment-analysis-basics/


In [1]:
import tweepy
from tweepy import OAuthHandler
 
consumer_key = 'MY_CONSUMER_KEY'
consumer_secret = 'MY_CONSUMER_SECRET'
access_token = 'MY_ACCESS_TOKEN'
access_secret = 'MY_ACCESS_SECRET'
 
auth = OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)
 
api = tweepy.API(auth)

## Get stream of tweets

In [2]:
import json

# get stream of tweets
from tweepy import Stream
from tweepy.streaming import StreamListener

## Preprocessing text

We can use a custom tokeniser to split the tweets into a list of terms. The following code uses the preprocess() function, in order to capture Twitter-specific aspects of the text, such as #hashtags

In [3]:
import re
 
emoticons_str = r"""
    (?:
        [:=;] # Eyes
        [oO\-]? # Nose (optional)
        [D\)\]\(\]/\\OpP] # Mouth
    )"""
 
regex_str = [
    emoticons_str,
    r'<[^>]+>', # HTML tags
    r'(?:@[\w_]+)', # @-mentions
    r"(?:\#+[\w_]+[\w\'_\-]*[\w_]+)", # hash-tags
    r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&amp;+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', # URLs
 
    r'(?:(?:\d+,?)+(?:\.?\d+)?)', # numbers
    r"(?:[a-z][a-z'\-_]+[a-z])", # words with - and '
    r'(?:[\w_]+)', # other words
    r'(?:\S)' # anything else
]

def tokenize(s):
    return tokens_re.findall(s)
 
def preprocess(s, lowercase=False):
    tokens = tokenize(s)
    if lowercase:
        tokens = [token if emoticon_re.search(token) else token.lower() for token in tokens]
    return tokens
 
    
tokens_re = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)
emoticon_re = re.compile(r'^'+emoticons_str+'$', re.VERBOSE | re.IGNORECASE)


In [4]:
## Removing stop words
#import nltk
#nltk.download('stopwords')

import operator 
import json
from collections import Counter
from nltk.corpus import stopwords
import string
 
punctuation = list(string.punctuation)
stop = stopwords.words('english') + punctuation + ['rt', 'via', 'RT', '…', '\U0001f923']

In [5]:
#https://dev.twitter.com/streaming/overview/connecting
import os
my_doc_path = os.path.expanduser('~/Documents')
git_loc = os.path.expanduser('~/Github')

tweets_data_path = my_doc_path + '/Refugee_Project/Tweets/tweets_20170709.txt'

tweets_data = []
tweets_file = open(tweets_data_path, "r")

In [6]:
# view a few tweets (text)
for line in tweets_file:
    try:
        tweet = json.loads(line)
        tweets_data.append(tweet)
    except:
        continue
        
        

print('Nb of tweets', len(tweets_data),  '\n') #34,470

for i in range(5):
    print(i, ':', tweets_data[i]['text'], '\n')

Nb of tweets 34470 

0 : RT @MSF_Sea: Doesnt matter if he's an asylum seeker, a refugee, an economic migrant or a jedi knight. He was vulnerable at sea &amp;…  

1 : RT @ubiqsmart: FemRight is a non-profit for health, education, empowerment and inclusion of refugee girls in the Middle East and A…  

2 : RT @mortensode: Godt indspark: Re-definition af #flygtninge begrebet r nødvendigt https://t.co/RyPJUfFdlf 

3 : RT @robabdul: Some GOOD NEWS:
Jim Estill, an amazing man paid for 58 refugee families to resettle in Canada!

#Humanity #hope…  

4 : Europe's Liberal Elite Outraged After Renzi Says Italy Has "No Moral Duty To Take In Migrants"  https://t.co/N1mukccEKE 



In [7]:
# count more recurring terms

count_all = Counter()
count_hash = Counter()
count_at = Counter()
count_only = Counter()

for tweet_id in range(len(tweets_data)):
    if 'text' in tweets_data[tweet_id].keys():
        terms_all = [term for term in preprocess(tweets_data[tweet_id]['text'], lowercase = True) if term not in stop]
        terms_hash = [term for term in preprocess(tweets_data[tweet_id]['text']) if term.startswith('#')]
        terms_at = [term for term in preprocess(tweets_data[tweet_id]['text']) if term.startswith('@')]
        terms_only = [term for term in preprocess(tweets_data[tweet_id]['text']) if term not in stop and not term.startswith(('@', '#'))]
        
        count_all.update(terms_all)
        count_hash.update(terms_hash)
        count_at.update(terms_at)
        count_only.update(terms_only)

#print(count_all.most_common(20), '\n')
print('Hashtags: ', count_hash.most_common(20), '\n')
print('Tags : ', count_at.most_common(20), '\n')
print('Terms : ', count_only.most_common(20), '\n')

Hashtags:  [('#Balochistan', 658), ('#Clooney', 640), ('#UK', 629), ('#Europe', 415), ('#Invasion', 371), ('#ChildrenUprooted', 202), ('#refugee', 199), ('#Refugee', 173), ('#G20', 150), ('#AChildIsAChild', 141), ('#Syria', 129), ('#GeorgeClooney', 112), ('#frees', 110), ('#MAGA', 93), ('#', 86), ('#ma4t', 81), ('#climate', 81), ('#DefendEuropa', 80), ('#Orban', 80), ('#migrant', 68)] 

Tags :  [('@TRobinsonNewEra', 2926), ('@PrisonPlanet', 2219), ('@AmyMek', 1676), ('@DineshDSouza', 1356), ('@V_of_Europe', 1091), ('@joshdcaplan', 1045), ('@ChooseToBFree', 987), ('@StefanMolyneux', 971), ('@PinkBelgium', 741), ('@TheMarkRomano', 729), ('@DavidJo52951945', 709), ('@Timcast', 509), ('@Voices4Humanity', 491), ('@horowitz39', 477), ('@UNICEF', 336), ('@RavenNightMyst', 330), ('@defencepk', 298), ('@JulianBurnside', 272), ('@PeterDutton_MP', 262), ('@bbusa617', 237)] 

Terms :  [('refugee', 9270), ('Clooney', 7396), ('George', 4959), ('Migrant', 4720), ('migrant', 4076), ('amp', 3798), ('Th

In [8]:
# adjacent tokens

from nltk import bigrams 
from collections import Counter

 
    
count_adj = Counter()
for tweet_id in range(len(tweets_data)):
    if 'text' in tweets_data[tweet_id].keys():
        terms_adj = [term for term in preprocess(tweets_data[tweet_id]['text']) if term not in stop and not term.startswith(('@', '#'))]
        terms_bigram = bigrams(terms_adj)
        count_adj.update(terms_bigram)

print(count_adj.most_common(20))

[(('Bin', 'Laden'), 2234), (('fucking', 'idiots'), 2229), (('produce', 'terrorists'), 2229), (('harboured', 'Bin'), 2228), (('idiots', 'You'), 2228), (('terrorists', 'world'), 2228), (('world', 'fucking'), 2228), (('Laden', 'still'), 2228), (('You', 'harboured'), 2228), (('harbour', 'Dawood'), 2228), (('still', 'harbour'), 2228), (('George', 'Clooney'), 1774), (('George', 'refugee'), 1745), (('hypocrite', 'George'), 1744), (('ever', 'bigger'), 1743), (('bigger', 'hypocrite'), 1743), (('Has', 'ever'), 1742), (('refugee', 'Clooney'), 1741), (('Clooney', 'SHARE'), 1737), (('SHARE', 'https://t.co/s9wwox6J8w'), 1721)]


In [9]:
# co-occurences: adjacent tokens
from collections import Counter
from collections import defaultdict
# remember to include the other import from the previous post
 
com = defaultdict(lambda : defaultdict(int))
 
for tweet_id in range(len(tweets_data)):
    if 'text' in tweets_data[tweet_id].keys():
        terms_co = [term for term in preprocess(tweets_data[tweet_id]['text']) if term not in stop and not term.startswith(('@', '#'))]
        
        # Build co-occurrence matrix
        for i in range(len(terms_co)-1):            
            for j in range(i+1, len(terms_co)):
                w1, w2 = sorted([terms_co[i], terms_co[j]])                
                if w1 != w2:
                    com[w1][w2] += 1
                    
com_max = []
# For each term, look for the most common co-occurrent terms
for t1 in com:
    t1_max_terms = sorted(com[t1].items(), key=operator.itemgetter(1), reverse=True)[:5]
    for t2, t2_count in t1_max_terms:
        com_max.append(((t1, t2), t2_count))
# Get the most frequent co-occurrences
terms_max = sorted(com_max, key=operator.itemgetter(1), reverse=True)

In [10]:
print(terms_max[:20])

[(('Clooney', 'George'), 4879), (('Clooney', 'refugee'), 3686), (('George', 'refugee'), 3120), (('➖', '️'), 2962), (('➖', '⭕'), 2960), (('produce', 'world'), 2506), (('Clooney', 'hypocrite'), 2267), (('hypocrite', 'refugee'), 2259), (('You', 'terrorists'), 2235), (('Bin', 'Laden'), 2234), (('You', 'produce'), 2230), (('produce', 'terrorists'), 2229), (('fucking', 'idiots'), 2229), (('still', 'world'), 2229), (('You', 'world'), 2229), (('terrorists', 'world'), 2228), (('produce', 'still'), 2228), (('idiots', 'terrorists'), 2228), (('idiots', 'produce'), 2228), (('idiots', 'world'), 2228)]


### Vincent
Vincent bridges the gap between a Python back-end and a front-end that supports D3.js visualisation, allowing us to benefit from both sides. The tagline of Vincent is in fact “The data capabilities of Python.

In [11]:
import vincent
vincent.core.initialize_notebook() # to display chart / no need to save it as HTML

word_freq = count_only.most_common(20)
labels, freq = zip(*word_freq)
data = {'data': freq, 'x': labels}
bar = vincent.Bar(data, iter_idx='x')
bar.to_json(git_loc + '/Refugee_Project/Code/Output/term_freq.json')

#you could save the HTML template directly from Python with:
bar.to_json(git_loc + '/Refugee_Project/Code/Output/term_freq.json', 
            html_out=True, 
            html_path= git_loc + '/Refugee_Project/Code/Output/chart.html')

In [12]:
bar.axis_titles(x='Index', y='Value')

## Time series

In [13]:
import pandas
import json
 
dates_hashtag = []
# f is the file pointer to the JSON data set
for tweet_index in range(len(tweets_data)):
    tweet = tweets_data[tweet_index]
    # let's focus on hashtags only at the moment
    if 'text' in tweet.keys():
        terms_hash = [term for term in preprocess(tweet['text']) if term.startswith('#')]
        # track when the hashtag is mentioned
        if '#refugee' in terms_hash:
            dates_hashtag.append(tweet['created_at'])

# a list of "1" to count the hashtags
ones = [1]*len(dates_hashtag)
# the index of the series
idx = pandas.DatetimeIndex(dates_hashtag)
# the actual series (at series of 1s for the moment)
hashtag = pandas.Series(ones, index=idx)
 
# Resampling / bucketing
per_minute = hashtag.resample('30Min', how='sum').fillna(0)
#per_hour = hashtag.resample('60Min', how='sum').fillna(0)

the new syntax is .resample(...).sum()


In [14]:
time_chart = vincent.Line(hashtag)
time_chart.axis_titles(x='Time', y='Freq')
time_chart.to_json(git_loc + '/Refugee_Project/Code/Output/time_chart.json')

In [15]:
time_chart

### Sentiment Analysis with PMI

I will use other methods such as LSA (Latent Semantic Analysis), or classification ones

In [16]:
# Count terms only once, equivalent to Document Frequency
#terms_single = set(terms_all)

#count_stop_single = Counter()

n_docs = len(tweets_data)
# n_docs is the total n. of tweets
p_t = {}
p_t_com = defaultdict(lambda : defaultdict(int))
 
for term, n in count_only.items():#count_stop_single.items():
    p_t[term] = n / n_docs
    for t2 in com[term]:
        p_t_com[term][t2] = com[term][t2] / n_docs

In [17]:
positive_vocab = [
    'good', 'nice', 'great', 'awesome', 'outstanding',
    'fantastic', 'terrific', ':)', ':-)', 'like', 'love',
    # shall we also include game-specific terms?
    # 'triumph', 'triumphal', 'triumphant', 'victory', etc.
]
negative_vocab = [
    'bad', 'terrible', 'crap', 'useless', 'hate', ':(', ':-(',
    # 'defeat', etc.
]

In [18]:
import math

pmi = defaultdict(lambda : defaultdict(int))
for t1 in p_t:
    for t2 in com[t1]:
        denom = p_t[t1] * p_t[t2]
        pmi[t1][t2] = math.log2(p_t_com[t1][t2] / denom)

semantic_orientation = {}
for term, n in p_t.items():
    positive_assoc = sum(pmi[term][tx] for tx in positive_vocab)
    negative_assoc = sum(pmi[term][tx] for tx in negative_vocab)
    semantic_orientation[term] = positive_assoc - negative_assoc


In [19]:
semantic_sorted = sorted(semantic_orientation.items(), 
                         key=operator.itemgetter(1), 
                         reverse=True)

# get the top/flop 10
top_pos = semantic_sorted[:10]
top_neg = semantic_sorted[-10:]

In [20]:
top_pos

[('cool', 20.996254133081543),
 ('clapping', 20.988692254828496),
 ('Twit', 17.302578821898173),
 ('dang', 17.302578821898173),
 ('deed', 17.302578821898173),
 ('Austrians', 17.302578821898173),
 ('Bristol', 17.195583839360587),
 ('hometown', 15.073053678254631),
 ('revitalized', 15.073053678254631),
 ('Utica', 15.073053678254631)]

In [21]:
top_neg

[('Britain', -11.88994129218423),
 ('https://t.co/c129ftAfQC', -12.073053678254631),
 ('https://t.co/2V1GdBh0zS', -12.073053678254631),
 ('settle', -12.073053678254631),
 ('unarmed', -12.073053678254631),
 ('https://t.co/9UXKEIvLa7', -12.073053678254631),
 ('marines', -12.073053678254631),
 ('carriers', -12.073053678254631),
 ('corvettes', -12.073053678254631),
 ('When', -12.99086090374935)]

## Geolocation

In [22]:
geo_data = {
        "type": "FeatureCollection",
        "features": []
}

for tweet_id in range(len(tweets_data)):
    tweet = tweets_data[tweet_id]
    if 'coordinates' in tweet.keys():
        geo_json_feature = {
            "type": "Feature",
            "geometry": tweet['coordinates'],
            "properties": {
                "text": tweet['text'],
                "created_at": tweet['created_at']
            }
        }
    geo_data['features'].append(geo_json_feature) 

# Save geo data

with open(git_loc + '/Refugee_Project/Code/Output/geo_data.json', 'r') as f_output:
    f_output.write(json.dumps(geo_data, indent=4))

In [27]:
from IPython.display import IFrame
IFrame('map.html', width=700, height=350)