In [None]:
#-----------------------------------------------------------
# Use Twitter Data to Forecast Unemployment Rates
#-----------------------------------------------------------
__author__ = 'Luzius von Gunten'
#-----
#Description:

#-----

#1. Import Packages

In [1]:
#++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
#non-standard packages originating from other developers:
#---
#:::TwitterScraper:::
#by Tom Dickinson, some changes by Luzius von Gunten
#https://github.com/tomkdickinson/Twitter-Search-API-Python/blob/master/TwitterScraper.py
import searchTwi as st 
#---
#standard packages:
#---
from calendar import monthrange
from datetime import date
from monthdelta import monthdelta
from monthdelta import monthmod
import numpy as np
import csv
import pandas as pd
import datetime
import re
import codecs
import json
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

#2. Get Data

##2.1. Functions to scrape twitter data by twitter advanced search

In [2]:
def QueryListBuilder(keywords, location_ref):
    '''
    Combine every search term with every location reference into one flat list of query strings.

    Each query is the plain concatenation "keyword + location", so a separating
    blank has to be part of the location string (e.g. ' schweiz').
    The result is grouped by location: all keywords for the first location
    reference come first, then all keywords for the second one, and so on.

    params:
    - keywords: list of search terms
    - location_ref: list of terms which refer to locations, such as ' schweiz'

    returns: list of str with len(keywords) * len(location_ref) entries
    '''
    queries = []
    for location in location_ref:
        for keyword in keywords:
            queries.append(keyword + location)
    return queries

def RunQueries(querylist,max_tweet,from_y,from_m,to_y,to_m):
    '''
    Run every query once per calendar month and collect all tweets that come back.

    - executes the twitter message search on the basis of the twitter expert search:
        https://github.com/tomkdickinson/Twitter-Search-API-Python/blob/master/TwitterScraper.py
    - for each month from the start month up to AND INCLUDING the end month, each query
      in the list of queries is executed separately, restricted to that month with
      ' since:<first day> until:<last day>' appended to the query text
    - prints a progress line per month and a short summary at the end
    - params:
        - querylist: list of search terms --> use 'QueryListBuilder'
        - max_tweet: maximum of tweets to be extracted per query and month
        - from_y/from_m: start year/month
        - to_y/to_m: end year/month
    - returns: list with the tweets of all months and queries, in retrieval order
    - raises: ValueError if the start month lies after the end month
      (the previous '<>' loop condition never terminated in that case, and skipped
      the only month entirely when start and end month were identical)
    '''
    startMonth = date(from_y,from_m,1)
    endMonth = date(to_y,to_m,1)
    if startMonth > endMonth:
        raise ValueError("start month %s lies after end month %s" % (startMonth, endMonth))

    tweets = []
    nTweetsPerMonth = []
    currentMonth = startMonth
    while currentMonth <= endMonth:
        firstDayInMonth = str(currentMonth)
        # monthrange(...)[1] is the number of days, i.e. the last day, of the month
        lastDayInMonth = str(currentMonth.replace(day=monthrange(currentMonth.year,currentMonth.month)[1]))
        print("--- %s ---" % firstDayInMonth)
        monthlyTweets = []
        for q in querylist:
            # NOTE(review): if the search treats 'until:' as exclusive, tweets from the
            # last day of each month are not retrieved -- confirm against searchTwi
            query = q + ' since:' + firstDayInMonth + ' until:' + lastDayInMonth
            # the first two arguments are passed on as in the original notebook;
            # their meaning is defined by searchTwi.TwitterSearchImpl -- TODO confirm
            search1 = st.TwitterSearchImpl(0, 5, max_tweet)
            monthlyTweets.extend(search1.search(query))
        nTweetsPerMonth.append(len(monthlyTweets))
        print('number of tweets this month: %d' % len(monthlyTweets))
        tweets.extend(monthlyTweets)
        # advance to the first day of the following month
        if currentMonth.month == 12:
            currentMonth = date(currentMonth.year + 1, 1, 1)
        else:
            currentMonth = currentMonth.replace(month=currentMonth.month + 1)
    print("\n--- \ntotal number of tweets retrieved: %d" % len(tweets))
    print("mean number of tweets per month: %d" % np.mean(nTweetsPerMonth))
    return tweets

##2.2. Functions to do basic data management

In [6]:
def cleaner(data, textField = 'text', deepCleaning = True):
    '''
    Adds a cleaned copy of the tweet text to every tweet under the key 'cleanedText'.

    New line characters (\\n, \\r) are always replaced by a blank. With deepCleaning
    switched on, URLs, Twitter usernames and the characters / _ - \\ are replaced by a
    blank as well, the remaining special characters are removed and the text is lower-cased.
    The dictionaries are modified in place; the original text field is left untouched.

    params:
    - data: list of dictionaries, each list element is a dictionary with relevant fields (key) of information (value) per tweet
    - textField: nominates the key where the corresponding value contains the tweet text
    - deepCleaning: boolean, whether the tweet text should be cleaned for the sentiment analysis
      (removes usernames, weblinks and special characters in addition to the new line characters)

    returns: the same list object that was passed in as data
    '''
    newLine = re.compile(r"\n|\r")
    # raw strings keep the regex escapes (\( \) \? \w) away from Python's own string escaping
    urls = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+=]|[!*\(\),\?]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    usernames = re.compile(r"@[\w_]*")
    specialChars1 = re.compile(r"[/_\-\\]")
    # unicode pattern with explicit escapes for the guillemets, curly quotes and ellipsis:
    # as a byte string under Python 2 the UTF-8 bytes of these characters ended up as
    # single characters in the class, so curly quotes and the ellipsis were not removed
    # from unicode text while unrelated letters (e.g. a-circumflex) were
    specialChars2 = re.compile(u"[!<>\u00ab\u00bb\u201c\u201d#@:;.,.%()/'\u2026\\[\\]\\?\"\\*]")
    for t in data:
        text = newLine.sub(' ', t[textField])
        if deepCleaning:
            text = urls.sub(' ', text)
            text = usernames.sub(' ', text)
            text = specialChars1.sub(' ', text)
            text = specialChars2.sub('', text).lower()
        t['cleanedText'] = text
    return data

def keepFieldsInListOfDicts(keepFields,ListOfDicts):
    '''
    Reduce every dictionary in a list to the given keys.

    params:
    - keepFields: keys to keep; each of them has to exist in every dictionary,
      otherwise a KeyError is raised
    - ListOfDicts: list of dictionaries (e.g. tweets)

    returns: new list of new dictionaries, the input is not modified
    '''
    trimmed = []
    for record in ListOfDicts:
        trimmed.append(dict((field, record[field]) for field in keepFields))
    return trimmed

def exporter(outfile,tweets, keepFields):
    '''
    Write the tweets to disk twice: as a json dump and, for human readability, as a csv-file.

    Both files are written to the folder 'outdata/' (which is not created here) as
    <outfile>.json and <outfile>.csv. The csv is ';'-separated, utf8-encoded and
    quotes every non-numeric value.

    params:
    - outfile: file name without folder and without extension
    - tweets: list of dictionaries with tweets
    - keepFields: keys of the tweet dictionaries that are written; all other fields are dropped
    '''
    # reduce the tweets to the requested data fields
    data = keepFieldsInListOfDicts(keepFields = keepFields, ListOfDicts = tweets)

    target = 'outdata/' + outfile

    # json dump
    with open(target + '.json', 'w') as fileout:
        json.dump(data, fileout)

    # csv, written by way of a pandas data frame
    pd.DataFrame(data).to_csv(target + ".csv", sep=';', quoting=csv.QUOTE_NONNUMERIC, encoding='utf8')

    print('\n--- \nDone. %d saved to disk.' % len(tweets))

def getFromJSONDumpFile(jfile):
    '''
    Read a json file and hand back its parsed content.

    params:
    - jfile: path of the json file

    returns: whatever json.load produces for the file (e.g. the list of tweet
    dictionaries written by exporter)
    '''
    infile = open(jfile)
    try:
        return json.load(infile)
    finally:
        # always release the file handle, also if the content is not valid json
        infile.close()


##2.3. Functions to do lexicon based sentiment analysis

In [4]:
def readSentiWS(negFile = 'sentiment dict/SentiWS_v1.8c_Negative.txt',
                posFile = 'sentiment dict/SentiWS_v1.8c_Positive.txt'):
    '''
    reads SentiWS files (german dictionary for sentiment analysis) and outputs a dictionary
    with words enlisted in SentiWS (key, lower-cased) and sentiment values (value, float)
    http://asv.informatik.uni-leipzig.de/download/sentiws.html

    Each line is parsed as <word>|<tag> TAB <value> TAB <comma separated further word forms>;
    the '|<tag>' part is dropped and every word form on the line gets the value of the line.
    Blank lines are skipped. The negative file is read first, so a word that is listed in
    both files ends up with the value from the positive file.

    params:
    - negFile: path of the SentiWS file with the negative words (utf-8)
    - posFile: path of the SentiWS file with the positive words (utf-8)
    '''
    posTag = re.compile(r'\|\w+')
    sentis = {}
    for path in (negFile, posFile):
        # 'with' closes the file also if a line cannot be parsed
        with codecs.open(path, encoding='utf-8') as f:
            for line in f:
                # strip the line ending (\n as well as \r\n) and the '|<tag>' marker
                line = posTag.sub('', line.rstrip('\r\n'))
                if not line.strip():
                    continue
                split = re.split(',|\t', line)
                val = float(split[1])
                del split[1]
                for key in split:
                    if key: # a trailing tab or comma yields an empty word form
                        sentis[key.lower()] = val
    return sentis

def readGerPolClues(negFile = 'sentiment dict/GermanPolarityClues-Negative-21042012.tsv',
                    posFile = 'sentiment dict/GermanPolarityClues-Positive-21042012.tsv',
                    neutFile = 'sentiment dict/GermanPolarityClues-Neutral-21042012.tsv'):
    '''
    reads German Polarity Clues files (german dictionary for sentiment analysis) and outputs a dictionary
    with words enlisted in German Polarity Clues (key, lower-cased) and sentiment values (value)
    http://www.ulliwaltinger.de/sentiment/

    Of every tab separated line the first column is used as the word and the fourth column
    as its sentiment value (kept as text, exactly as it stands in the file). Blank lines are
    skipped. The files are read in the order negative, positive, neutral, so for a word that
    is listed more than once the entry read last wins.

    params:
    - negFile / posFile / neutFile: paths of the three German Polarity Clues files (utf-8)
    '''
    sentis = {}
    for path in (negFile, posFile, neutFile):
        # 'with' closes the file also if a line cannot be parsed
        with codecs.open(path, encoding='utf-8') as f:
            for line in f:
                line = line.rstrip('\r\n')
                if not line.strip():
                    continue
                split = line.split('\t')
                sentis[split[0].lower()] = split[3]
    return sentis

def getSenti_sentiWS(lex, keys, tweet):
    '''
    Derive the SentiWS based sentiment features of one tweet.

    The tweet is split at blanks; every distinct token that is also contained in keys
    contributes its polarity once, no matter how often it occurs in the tweet.

    params:
    - lex: SentiWS lex as dictionary (key = words in lex, value = polarity of word), output from "readSentiWS()"
    - keys: the words of the SentiWS lex (lex.keys())
    - tweet: tweet text, str

    returns: dictionary with
    - sentiWS_nbrNeg / sentiWS_nbrPos: number of matched words with polarity below / above 0
    - sentiWS_mean: mean polarity of the matched words (0 without any match)
    - sentiWS_polarities: list with the polarities of the matched words
    - sentiWS_sentiVal: 'negative', 'positive' or 'neutral', depending on the sign of the mean
    '''
    words = [w for w in tweet.split(' ') if w] # tokens of the tweet without empty strings (blanks)
    hits = list(set(keys) & set(words)) # tokens which are listed in the lexicon
    if not hits:
        return {"sentiWS_nbrNeg":0, "sentiWS_nbrPos":0, "sentiWS_mean":0, "sentiWS_polarities":[],"sentiWS_sentiVal":'neutral'}

    scores = [lex[word] for word in hits] # polarity of every matched token
    average = np.mean(scores)
    if average < 0:
        label = 'negative'
    elif average > 0:
        label = 'positive'
    else:
        label = 'neutral'

    tweetFeatures = {}
    tweetFeatures["sentiWS_nbrNeg"] = len([s for s in scores if s < 0])
    tweetFeatures["sentiWS_nbrPos"] = len([s for s in scores if s > 0])
    tweetFeatures["sentiWS_mean"] = average
    tweetFeatures["sentiWS_polarities"] = scores
    tweetFeatures["sentiWS_sentiVal"] = label
    return tweetFeatures

def getSenti_GPC(lex, keys, tweet):
    '''
    Derive the German Polarity Clues (GPC) based sentiment features of one tweet.

    The tweet is split at blanks; every distinct token that is also contained in keys
    contributes its polarity once, no matter how often it occurs in the tweet.

    params:
    - lex: GPC lex as dictionary (key = words in lex, value = polarity of word), output from "readGerPolClues()"
    - keys: the words of the GPC lex (lex.keys())
    - tweet: tweet text, str

    returns: dictionary with
    - GPC_nbrNeg / GPC_nbrPos / GPC_nbrNeut: number of matched words whose polarity is
      'negative' / 'positive' / 'neutral'
    - GPC_polarities: list with the polarities of the matched words
    - GPC_sentiVal: 'negative' if there are more negative than positive words,
      'positive' if there are more positive than negative words, else 'neutral'
    '''
    words = [w for w in tweet.split(' ') if w] # tokens of the tweet without empty strings (blanks)
    hits = list(set(keys) & set(words)) # tokens which are listed in the GPC lexicon
    if not hits:
        return {"GPC_nbrNeg":0, "GPC_nbrPos":0, "GPC_nbrNeut":0, "GPC_polarities":[],"GPC_sentiVal":'neutral'}

    labels = [lex[word] for word in hits] # GPC polarity of every matched token
    nNeg = labels.count('negative')
    nPos = labels.count('positive')
    if nNeg > nPos:
        verdict = 'negative'
    elif nNeg < nPos:
        verdict = 'positive'
    else:
        verdict = 'neutral'

    tweetFeatures = {}
    tweetFeatures["GPC_nbrNeg"] = nNeg
    tweetFeatures["GPC_nbrPos"] = nPos
    tweetFeatures["GPC_nbrNeut"] = labels.count('neutral')
    tweetFeatures["GPC_polarities"] = labels
    tweetFeatures["GPC_sentiVal"] = verdict
    return tweetFeatures




#test run
#RawTweets = RunQueries(querylist = ['arbeitsmarkt schweiz'], max_tweet=10, from_y=2012, from_m=1, to_y=2012, to_m=4)

##2.4. Function calls to collect and process raw tweets

In [6]:
# NOTE: the complete collection pipeline of this cell is wrapped in a triple-quoted
# string. Nothing in it is executed; the notebook merely echoes the string as output.
# The steps written down in the string: build the queries, scrape the tweets and export
# them ("outputRaw_v4"), reload and clean them, attach the SentiWS and GPC sentiment
# features and export the result ("Tweets_v4"). The later chapters reload
# 'outdata/Tweets_v4.json' from disk instead of re-running this cell.
# Remove the surrounding quotes to run the pipeline again.
'''
### build queries
keywords = ['arbeitslosigkeit','wachstum','arbeitsmarkt','standort','stellenmarkt','werkplatz','stellenabbau',
            'stellenausbau', 'firmenschliessung', 'standortverlagerung','arbeitsplatzabbau', 'personalabbau',
            'rationalisierungsmassnahme', 'restrukturierung','erwerbslosigkeit', 'betriebsschliessung',
            'beschäftigung', 'beschäftigte','beschäftigten',
            'arbeitsplätze','arbeitsplätze',
            'arbeitslose','arbeitslosen',
            'stellensuchende','stellensuchenden',
            'angestellten','angestellte',
            'baut stellen ab', 'schafft neue stellen']
location_ref = [' schweiz -#job -#jobs']

#keywords=['arbeit']
#location_ref = [' schweiz']

Queries = QueryListBuilder(keywords = keywords, location_ref = location_ref )

### get & clean tweets
RawTweets = RunQueries(querylist = Queries, max_tweet=200, from_y=2012, from_m=1, to_y=2016, to_m=10)
keep = ['text','created_at','tweet_id','retweets','favorites','user_name']
exporter(outfile = "outputRaw_v4", tweets = RawTweets, keepFields = keep)

RawTweets = getFromJSONDumpFile('outdata/outputRaw_v4.json') #reload saved tweets in order to keep only wanted fields
Tweets = cleaner(RawTweets)

### get sentiment dictionaries & determine tweet sentiment
sentiWS = readSentiWS()
sentiWSkeys = sentiWS.keys()
gpc = readGerPolClues()
gpcKeys = gpc.keys()
for t in Tweets:
    feat1 = getSenti_sentiWS(lex = sentiWS, keys = sentiWSkeys, tweet = t['cleanedText'])
    feat2 = getSenti_GPC(lex = gpc, keys = gpcKeys, tweet = t['cleanedText'])
    t.update(feat1); t.update( feat2)

### export tweets to file
keep = Tweets[0].keys()
exporter(outfile = "Tweets_v4", tweets = RawTweets, keepFields = keep)
'''


'\n### build queries\nkeywords = [\'arbeitslosigkeit\',\'wachstum\',\'arbeitsmarkt\',\'standort\',\'stellenmarkt\',\'werkplatz\',\'stellenabbau\',\n            \'stellenausbau\', \'firmenschliessung\', \'standortverlagerung\',\'arbeitsplatzabbau\', \'personalabbau\',\n            \'rationalisierungsmassnahme\', \'restrukturierung\',\'erwerbslosigkeit\', \'betriebsschliessung\',\n            \'besch\xc3\xa4ftigung\', \'besch\xc3\xa4ftigte\',\'besch\xc3\xa4ftigten\',\n            \'arbeitspl\xc3\xa4tze\',\'arbeitspl\xc3\xa4tze\',\n            \'arbeitslose\',\'arbeitslosen\',\n            \'stellensuchende\',\'stellensuchenden\',\n            \'angestellten\',\'angestellte\',\n            \'baut stellen ab\', \'schafft neue stellen\']\nlocation_ref = [\' schweiz -#job -#jobs\']\n\n#keywords=[\'arbeit\']\n#location_ref = [\' schweiz\']\n\nQueries = QueryListBuilder(keywords = keywords, location_ref = location_ref )\n\n### get & clean tweets\nRawTweets = RunQueries(querylist = Queries, max

#3. Create time series 

##3.1 Aggregate functions

In [7]:
def countTweetSentiments(data,countVar, byVar, prefixOut):
    '''
    Count the rows per category and group, and return the counts in wide format.

    Long-format categorial data is turned into one row per value of byVar and one
    column per category of countVar. Combinations which do not occur in the data
    come back as missing values (NaN), not as 0.

    params:
    - data: pandas data frame
    - countVar: column with the categorial data (text labels) whose categories are counted
    - byVar: column to aggregate on, comes back as index
    - prefixOut: the counted category labels come back as column labels "<prefixOut>_<category>"

    returns: pandas data frame with the counts
    '''
    # one row per (byVar, countVar) combination with the number of its occurrences
    counts = data.groupby([byVar, countVar]).size().reset_index(name='count')
    # categories become columns
    wide = counts.pivot(index=byVar, columns=countVar, values='count')
    labels = []
    for category in wide.columns.values:
        labels.append(prefixOut + '_' + category)
    wide.columns = labels
    return wide

def proportions(data,shares,outLabels):
    '''
    Add, for each of the stated columns, its share of the row-wise sum of these columns.

    The new columns are written into the data frame that was passed in (the frame is
    modified in place) and the same frame is returned.

    params:
    - data: pandas data frame
    - shares: list of the columns to compute shares for
    - outLabels: labels of the new columns which hold the shares, in the same order as shares
    '''
    # the row totals are computed once, before any share column is added
    rowTotal = data[shares].sum(axis=1)
    for position, column in enumerate(shares):
        data[outLabels[position]] = data[column] / rowTotal
    return data

##3.2 Aggregate

In [48]:
# Load the scored tweets from disk, so that this chapter does not depend on the cells above
TweetsDF = pd.DataFrame(getFromJSONDumpFile('outdata/Tweets_v4.json'))

# Features for the aggregation:
# - 'monat': first 7 characters of 'created_at' (presumably 'YYYY-MM' -- confirm the timestamp format)
# - 'match': 'True'/'False' as text, whether both sentiment lexicons agree on the tweet
TweetsDF['monat'] = TweetsDF['created_at'].str[:7]
TweetsDF['match'] = (TweetsDF['sentiWS_sentiVal'] == TweetsDF['GPC_sentiVal']).astype(str)

# SentiWS: number and share of positive / negative / neutral tweets per month
agg1 = proportions(
    data=countTweetSentiments(data=TweetsDF, countVar='sentiWS_sentiVal', byVar='monat', prefixOut='NbrSentiWS'),
    shares=['NbrSentiWS_positive','NbrSentiWS_negative','NbrSentiWS_neutral'],
    outLabels=['PropSentiWS_positive','PropSentiWS_negative','PropSentiWS_neutral'])

# GPC: number and share of positive / negative / neutral tweets per month
agg2 = proportions(
    data=countTweetSentiments(data=TweetsDF, countVar='GPC_sentiVal', byVar='monat', prefixOut='NbrGPC'),
    shares=['NbrGPC_positive','NbrGPC_negative','NbrGPC_neutral'],
    outLabels=['PropGPC_positive','PropGPC_negative','PropGPC_neutral'])

# SentiWS: mean and sum of the tweet-level mean polarity per month
_sentiWSByMonth = TweetsDF[['monat','sentiWS_mean']].groupby('monat')
agg3 = _sentiWSByMonth.mean().rename(columns={'sentiWS_mean':'sentiWS_meanTot'})
agg4 = _sentiWSByMonth.sum().rename(columns={'sentiWS_mean':'sentiWS_sumTot'})

# number and share of tweets per month on which both lexicons agree / disagree
agg5 = proportions(
    data=countTweetSentiments(data=TweetsDF, countVar='match', byVar='monat', prefixOut='Match'),
    shares=['Match_True','Match_False'],
    outLabels=['PropMatch_True','PropMatch_False'])

# one row per month with all aggregates side by side
TweetsDFAggMonth = pd.concat([agg1,agg3,agg4,agg2,agg5], axis=1)

##3.3 Combine with unemployment time series and save

In [49]:
# Unemployment time series from the ';'-separated csv, indexed by its 'monat' column
unemployment = pd.read_csv('amstat_daten AL/data_arbeitslosigkeit.csv', sep=';')
unemployment = unemployment.set_index('monat')

# Put the monthly twitter aggregates and the unemployment data side by side (aligned on the index)
TweetsAgg_TimeSeries_v4 = pd.concat([TweetsDFAggMonth, unemployment], axis=1)

# Keep the combined time series on disk for the analysis chapter
TweetsAgg_TimeSeries_v4.to_pickle('outdata/TweetsAgg_TimeSeries_v4.pkl')

#XX Analysis

##XX.1 Helper functions

In [2]:
def completeSeriesDay(incompleteSerie,valueLabel, start='2012-01-01', end='2016-10-31'):
    '''
    Complete a time series so that it holds exactly one value for every day of a period.

    When collecting Twitter data, you may not get data for every day. The series is
    re-indexed to all days from start to end (both included): days without data are
    filled with 0, entries outside of the period are dropped. The input is not modified.

    params:
    - incompleteSerie: Series, or data frame with exactly one column, with a datetime
      index in which not every day is included
    - valueLabel: name given to the returned series, any string
    - start / end: first and last day of the completed series

    returns: pandas Series named valueLabel with one entry per day
    raises: ValueError if a data frame with more or less than one column is passed
    '''
    index = pd.date_range(start, end)
    if isinstance(incompleteSerie, pd.DataFrame):
        if incompleteSerie.shape[1] != 1:
            raise ValueError("expected exactly one data column, got %d" % incompleteSerie.shape[1])
        values = incompleteSerie.iloc[:, 0]
    else:
        values = incompleteSerie
    # reindex aligns on the full range of days (NaN where no data exists), fillna turns the gaps into 0
    completed = values.reindex(index).fillna(0)
    completed.name = valueLabel
    return completed

##XX.2 Data quality

In [7]:
# load data
TweetsDF = pd.DataFrame(getFromJSONDumpFile('outdata/Tweets_v4.json')) #reload to make this chapter independent
TweetsAgg_TimeSeries_v4 = pd.read_pickle('outdata/TweetsAgg_TimeSeries_v4.pkl') #reload to make the chapter independent


#------------
#plot number of tweets per day
# NOTE(review): grouping on 'date' only yields daily counts if 'created_at' carries no time of day -- confirm
TweetsDF['date'] = pd.to_datetime(TweetsDF['created_at'])
TweetsPerDay = TweetsDF.groupby('date').size()
combo = completeSeriesDay(incompleteSerie=TweetsPerDay,
                          valueLabel='NumberOfTweets')

figone= plt.figure(1)
plt.yticks(size=7)
plt.xticks(rotation=90, size=7)
plt.title('Number of Tweets collected per Day', size=20)
plt.plot(combo.index, combo,'-')
#plt.show()
figone.savefig('outfigs/Anzahl_Tweets_Pro_Tag.png')


#------------
#plot mean and max daily number of tweets per day by month and totals sum of tweets per month
# The results are deliberately NOT named 'max' / 'sum': as notebook globals those names
# would replace the builtins and break every function that is called afterwards and relies
# on them (e.g. the sum(...) calls in getSenti_sentiWS / getSenti_GPC).
# resample(...).mean() replaces resample(..., how='mean'); the 'how' keyword no longer
# exists in current pandas versions.
monthlyBins = combo.resample('M')
meanPerMonth = monthlyBins.mean()
maxPerMonth = monthlyBins.max()
sumPerMonth = monthlyBins.sum()
dat = maxPerMonth.index

fig = plt.figure(2)
ax = fig.add_subplot(1,1,1)
plt.yticks(size=7)
plt.xticks(rotation=90, size=7)
plt.title('Number of Tweets collected per Month', size=20)
plt.plot(dat,meanPerMonth,'r-', label='Mean Daily Nbrs of Tweets per Month')
plt.plot(dat,sumPerMonth,'g-', label='Sum of Tweets per Month')
# blue instead of a second green line, so that sum and max can be told apart in the legend
plt.plot(dat,maxPerMonth,'b-', label='Max Daily Nbrs of Tweets per Month')
plt.legend(loc='upper right',prop={'size':10})
ax.xaxis.set_major_locator(mdates.MonthLocator(bymonth=[1,4,7,10]))
#ax.xaxis.set_major_locator(mdates.MonthLocator(interval=12))
ax.set_yticks(np.arange(0,300,10))
ax.grid(which='major') # the bare 'plt.grid' that used to stand above the legend was never called
ax.xaxis.set_major_formatter(mdates.DateFormatter('%Y-%m'))
#plt.show()
fig.savefig('outfigs/Anzahl_Tweets_Pro_Monat_SumMeanMax.png')


#TO DO

In [None]:

# aggregate by month:
    # per month: total number of sentiment values, compute the share of negatives, for SentiWS and GPC each / mean sentiment for SentiWS

# split compound words before the sentiment analysis: Stellenabbau --> stellen abbau
# ensure a random selection of the tweets? so far the newest tweets are retrieved first
#build new twitter features: number of hashtags, word n-grams, negations, emoticons, elongated words, all caps

#TRY & ERROR

In [30]:

# scratch check of proportions(): 'a_ant' and 'b_ant' have to add up to 1 in every row
d = {
    'a': [1, 2, 3, 4],
    'b': [10, 20, 30, 40],
    'c': ['x', 'x', 'y', 'y'],
}
test = pd.DataFrame(data=d)

test = proportions(data=test, shares=['a', 'b'], outLabels=['a_ant', 'b_ant'])

print(test)

   a   b  c     a_ant     b_ant
0  1  10  x  0.090909  0.909091
1  2  20  x  0.090909  0.909091
2  3  30  y  0.090909  0.909091
3  4  40  y  0.090909  0.909091
