### Process Tweets Worksheet

In [None]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import os
import numpy as np
from tqdm import tqdm_notebook
import multiprocessing as mp
import re
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [None]:
# CPU count reported by multiprocessing; later used as the worker-pool size
cores=mp.cpu_count()
print("Number of processors: ", cores)

In [None]:
# load raw data: one pickled DataFrame per (year, month) under raw_data/
# NOTE(review): pickle.load can execute code embedded in the file; these
# pickles are presumably produced by this project's own scraper - confirm
# before pointing this cell at files from anywhere else.
root = 'raw_data'

# Collect the monthly slices and concatenate once at the end. Growing
# df_raw with pd.concat inside the loop re-copies every previously loaded
# row on each of the 144 iterations.
raw_slices = []
for y in tqdm_notebook(range(2008,2020)):
    for m in range(1,13):

        filename = 'raw_tweets_{}_{}.pkl'.format(y,m)
        filestring = os.path.join(root, filename)

        with open(filestring, 'rb') as filehandle:
            # read the data as binary data stream
            df_slice = pickle.load(filehandle)

        raw_slices.append(df_slice)

# ignore_index=True renumbers the rows 0..n-1 across all slices
df_raw = pd.concat(raw_slices, ignore_index=True)

df_raw.info()

#### user_id feature
Each Twitter screen_name has an associated Twitter account number.  
The account number is a numerical representation of the author.

In [None]:
# Load the seed accounts and build a screen_name -> row mapping so the
# per-tweet lookup below is a dictionary access.
df_seeds = pd.read_csv('seeds_061419.csv')

# normalise the names: drop surrounding whitespace
screens = [name.strip() for name in df_seeds['screen_name']]
df_seeds['screen_name'] = screens

# index by screen_name, then transpose so each name becomes a column;
# to_dict('series') then maps every name to a Series of its remaining fields
seeds_by_name = df_seeds.set_index('screen_name')
lookup = seeds_by_name.T.to_dict('series')

In [None]:
# find user_id's
# Build a tweet_id -> user_id table by resolving each tweet's screen_name
# through the seed lookup dictionary.
df_prc = pd.DataFrame()
uids = []
tids = []
for _,nrows in tqdm_notebook(df_raw.iterrows()):
    tids.append(nrows['tweet_id'])
    # lookup values are Series; .item() extracts the single element as a
    # Python scalar. It replaces np.asscalar (removed from NumPy), which
    # was itself just a wrapper around .item().
    user = lookup[nrows['screen_name'].strip()].item()
    uids.append(user)

df_prc['tweet_id'] = tids
df_prc['user_id'] = uids

df_prc.info()

In [None]:
# save to pickle as a precaution
with open('tweet_user_ids.pkl', 'wb') as checkpoint:
    # persist the tweet_id / user_id table built above
    pickle.dump(df_prc, checkpoint)

#### Detect Language
The sentiment converter likely works best in English.  
Other languages may inject noise into the sentiment conversion.

In [None]:
# import exported function for multiprocessing
import langdet

In [None]:
# split giant dataframe into small pieces
# work on a copy of df_raw and cut it row-wise (axis=0) into 64 chunks
df_tw = df_raw.copy()
df_split = np.array_split(df_tw, 64, axis=0)
# notebook display: number of chunks produced
len(df_split)

In [None]:
# detect languages and store
# For each of the 64 chunks: run langdet.langfind over the tweet text in a
# worker pool, attach the result as a 'lang' column and pickle the chunk.
root = 'proc_data'
for i in tqdm_notebook(range(64)):

    df_proc = df_split[i]

    # One pool per chunk, managed by the with-block, which shuts the pool
    # down on exit. The extra Pool() created before the with-block was
    # never closed (its name was rebound), leaking worker processes on
    # every iteration, and the trailing close() was redundant.
    with mp.Pool(cores) as pool:
        # Series.items() yields the same (index, text) pairs as the
        # iteritems() alias, which newer pandas no longer provides
        result = pool.map(langdet.langfind, df_proc['text'].items())

    # write result to column
    df_proc['lang'] = result

    filename = 'proc_lang_{}.pkl'.format(i)
    filestring = os.path.join(root, filename)

    # output to pickle
    with open(filestring, 'wb') as filehandle:
        # store the data
        pickle.dump(df_proc, filehandle)

In [None]:
# notebook display: last 10 rows of the most recently processed chunk
df_proc.tail(10)

### Clean the Tweets
The tweet text is cleaned before sentiment conversion.
This cleaning is intended to reduce noise in the sentiment scores.

In [None]:
def cleaner(text):
    """Strip Twitter markup from a tweet before sentiment scoring.

    Closes the gap after a detached '@', '#' or '$' symbol, removes
    @usernames, http(s) URLs and pic.twitter.com links, and collapses
    runs of spaces.

    Args:
        text: raw tweet text (str).

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    # take out space after @ symbol
    clear = re.sub(r'@\s+',r'@', text)
    # take out usernames
    clear = re.sub(r'@([A-Za-z0-9_]+)','', clear)
    # take out space after # symbol
    clear = re.sub(r'#\s+',r'#', clear)
    # take out space after $ symbol; the '$' must be escaped, since a bare
    # '$' is the end-of-string anchor (it never matched "$ " and instead
    # rewrote a trailing newline to a literal '$')
    clear = re.sub(r'\$\s+', '$', clear)
    # take out url
    # NOTE(review): the character class below contains a space, so the
    # match continues past the URL through the following words until it
    # hits a character outside the class (e.g. '#') - confirm this is
    # intended before tightening it.
    clear = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+','', clear)
    # take out pics; picture links look like pic.twitter.com/<id> and carry
    # no "://", so the pattern matches that form directly
    clear = re.sub(r'pic\.twitter\.com/\S+', '', clear)
    # take out the extra white spaces
    clear = re.sub(r' +', ' ', clear.strip())
    return clear

In [None]:
# test the cleaner
# sample input exercises the detached '@' and '#', a URL and a picture link
cleaner('@ threadreaderapp unraveled  message https://www.google.co.uk/amp/s/ ...pic.twitter.com/Hoqk6XMkgn # CourageAward')

In [None]:
# Clean every language-tagged chunk and flag the English tweets.
for i in tqdm_notebook(range(64)):

    # read the language-tagged chunk
    path_in = os.path.join(root, 'proc_lang_{}.pkl'.format(i))
    with open(path_in, 'rb') as src:
        df_eng = pickle.load(src)

    # one pass over the rows: cleaned text, its length, and an English flag
    is_english = []
    clean_len = []
    cleaned = []
    for _, row in tqdm_notebook(df_eng.iterrows()):
        text = cleaner(row['text'])
        cleaned.append(text)
        clean_len.append(len(text))
        # 1 marks a tweet detected as English, 0 anything else
        is_english.append(1 if row['lang'] == 'en' else 0)

    df_eng['english'] = is_english
    df_eng['length'] = clean_len
    df_eng['cleantext'] = cleaned

    # write the cleaned chunk
    path_out = os.path.join(root, 'proc_clean_{}.pkl'.format(i))
    with open(path_out, 'wb') as dst:
        pickle.dump(df_eng, dst)

In [None]:
# notebook display: 20 random cleaned tweets from the last chunk loaded
df_eng['cleantext'].sample(20)

#### Tokenize the Clean Text

In [None]:
# Tokenize the cleaned text of every chunk with NLTK's TweetTokenizer and
# record the token count per tweet.

tknzr = TweetTokenizer()
for i in tqdm_notebook(range(64)):

    # read the cleaned chunk
    path_in = os.path.join(root, 'proc_clean_{}.pkl'.format(i))
    with open(path_in, 'rb') as src:
        df_clean = pickle.load(src)

    # token list for each row, in row order
    token_rows = [
        tknzr.tokenize(row['cleantext'])
        for _, row in tqdm_notebook(df_clean.iterrows())
    ]

    df_clean['tokens'] = token_rows
    df_clean['tok_count'] = [len(tokens) for tokens in token_rows]

    # write the tokenized chunk
    path_out = os.path.join(root, 'proc_token_{}.pkl'.format(i))
    with open(path_out, 'wb') as dst:
        pickle.dump(df_clean, dst)

In [None]:
# notebook display: 10 random rows of the last tokenized chunk
df_clean.sample(10)

#### lowercase tokens and remove stop words for Keywords

In [None]:
# tokens to drop when extracting keywords: the NLTK English stopwords
# (de-duplicated) plus punctuation and symbol tokens
stoplist = list(set(stopwords.words('english'))) + \
            ['.',',','!','"','…','?','’',':','-','/',')','(','$','...','&','*',':)','%','”','“',"'",'+',';']

In [None]:
# remove stopwords from tokens and count keywords
# Keywords are the distinct lower-cased tokens of a tweet that are not in
# the stop list; key_count is how many there are.

# set membership is a hash lookup; testing against the stoplist list scans
# it for every single token
stopset = set(stoplist)

for i in tqdm_notebook(range(64)):

    # input from pickle
    filename_in = 'proc_token_{}.pkl'.format(i)
    filestring_in = os.path.join(root, filename_in)

    with open(filestring_in, 'rb') as filehandle:
        # read the data as binary data stream
        df_token = pickle.load(filehandle)

    # clear the stopwords
    bare_tok = []
    bare_cnt = []
    for _,nrows in tqdm_notebook(df_token.iterrows()):
        lowered = (tok.lower() for tok in nrows['tokens'])
        # dict.fromkeys drops duplicates while keeping first-seen order, so
        # the keyword list is reproducible between runs (list(set(...))
        # produced the same elements in arbitrary order)
        keywords = list(dict.fromkeys(tok for tok in lowered if tok not in stopset))
        bare_tok.append(keywords)
        bare_cnt.append(len(keywords))

    # add bare features to dataframe
    df_token['keywords'] = bare_tok
    df_token['key_count'] = bare_cnt

    # output to pickle
    filename_out = 'proc_keyword_{}.pkl'.format(i)
    filestring_out = os.path.join(root, filename_out)

    with open(filestring_out, 'wb') as filehandle:
        # store the data
        pickle.dump(df_token, filehandle)

In [None]:
# notebook display: 10 random rows of the last keyword-tagged chunk
df_token.sample(10)

#### Sentiment Transform

In [None]:
# convert tweet to sentiment
# Score the cleaned text of every tweet with VADER and store the four
# polarity scores as vad_* columns.

# the analyzer does not depend on the chunk, so build it once instead of
# once per chunk
analyzer = SentimentIntensityAnalyzer()

for i in tqdm_notebook(range(64)):

    # input from pickle
    filename_in = 'proc_keyword_{}.pkl'.format(i)
    filestring_in = os.path.join(root, filename_in)

    with open(filestring_in, 'rb') as filehandle:
        # read the data as binary data stream
        df_key = pickle.load(filehandle)

    # transform tweets to sentiment
    pos_list = []
    neu_list = []
    neg_list = []
    cmp_list = []

    for _,nrows in tqdm_notebook(df_key.iterrows()):
        # polarity_scores returns a dict with 'pos', 'neu', 'neg', 'compound'
        vs = analyzer.polarity_scores(nrows['cleantext'])
        pos_list.append(vs['pos'])
        neu_list.append(vs['neu'])
        neg_list.append(vs['neg'])
        cmp_list.append(vs['compound'])

    df_key['vad_positive'] = pos_list
    df_key['vad_neutral'] = neu_list
    df_key['vad_negative'] = neg_list
    df_key['vad_compound'] = cmp_list

    # output to pickle
    filename_out = 'proc_sent_{}.pkl'.format(i)
    filestring_out = os.path.join(root, filename_out)

    with open(filestring_out, 'wb') as filehandle:
        # store the data
        pickle.dump(df_key, filehandle)

In [None]:
# notebook display: 10 random rows of the last sentiment-scored chunk
df_key.sample(10)

#### Tag Outliers
The English entries were tagged with 1 to mark good behavior.   
By contrast, outliers are tagged with 0.  
When filtering, keep the rows whose tags are 1.

In [None]:
# load selected poets - found in analytics worksheet
# NOTE(review): the date part of this filename ('06119') differs from the
# seed CSV's ('061419') - confirm the filename is correct
with open('screened_seeds_06119.pkl', 'rb') as filehandle:  
    # read the data as binary data stream
    selected = pickle.load(filehandle)

In [None]:
# keyword context checked against lower case keywords
# a tweet passes the keyword filter when at least one of its keywords is listed here
context = ['bitcoin', '#bitcoin', 'btc', 'bitcoins']

In [None]:
# Add the Outlier Tags
# For every chunk, add four 0/1 filter columns (1 = passes the check):
#   poet_ok  - author is in the screened seed collection
#   tweet_ok - tweet has more than 3 keywords
#   sent_ok  - sentiment is neither zero-compound nor fully neutral
#   key_ok   - at least one keyword is in the context list
for i in tqdm_notebook(range(64)):

    # input from pickle
    filename_in = 'proc_sent_{}.pkl'.format(i)
    filestring_in = os.path.join(root, filename_in)

    with open(filestring_in, 'rb') as filehandle:
        # read the data as binary data stream
        df_sent = pickle.load(filehandle)

    # per-row flags, one list per filter
    poet_ok = []
    tweet_ok = []
    sent_ok = []
    keyword_ok = []

    for _,nrows in tqdm_notebook(df_sent.iterrows()):

        # check poet
        # NOTE(review): screen_name is compared unstripped here, while the
        # user_id lookup strips it first - confirm `selected` holds names
        # in the same form as this column
        poet_ok.append(1 if nrows['screen_name'] in selected else 0)

        # check tweet isn't short
        tweet_ok.append(1 if nrows['key_count'] > 3 else 0)

        # check sentiment
        has_sentiment = nrows['vad_compound'] != 0 and nrows['vad_neutral'] != 1
        sent_ok.append(1 if has_sentiment else 0)

        # check keywords
        on_topic = any(k in context for k in nrows['keywords'])
        keyword_ok.append(1 if on_topic else 0)

    df_sent['poet_ok'] = poet_ok
    df_sent['tweet_ok'] = tweet_ok
    df_sent['sent_ok'] = sent_ok
    df_sent['key_ok'] = keyword_ok

    # output to pickle
    filename_out = 'proc_out_{}.pkl'.format(i)
    filestring_out = os.path.join(root, filename_out)

    with open(filestring_out, 'wb') as filehandle:
        # store the data
        pickle.dump(df_sent, filehandle)

In [None]:
# notebook display: 25 random rows of the last outlier-tagged chunk
df_sent.sample(25)

#### Build TSI 

In [None]:
# import processed dataframe
# Reassemble the 64 outlier-tagged chunks into one DataFrame. The chunks
# are gathered in a list and concatenated once; concatenating inside the
# loop re-copies all previously loaded rows on every iteration.
processed_slices = []
for i in tqdm_notebook(range(64)):

    # input from pickle
    filename_in = 'proc_out_{}.pkl'.format(i)
    filestring_in = os.path.join(root, filename_in)

    with open(filestring_in, 'rb') as filehandle:
        # read the data as binary data stream
        df_slice = pickle.load(filehandle)

    processed_slices.append(df_slice)

# ignore_index=True renumbers the rows 0..n-1 across all chunks
df_processed = pd.concat(processed_slices, ignore_index=True)

df_processed.info()

In [None]:
# slice using the filters: keep only rows whose four flags are all 1
filter_cols = ['poet_ok', 'tweet_ok', 'sent_ok', 'key_ok']
passes_all = df_processed[filter_cols].eq(1).all(axis=1)
df_ind = df_processed[passes_all].copy()
df_ind.info()

In [None]:
# index subset: only the date and the compound score feed the indicator
index_cols = ['date', 'vad_compound']
df_index = df_ind[index_cols].copy()
df_index.head(3)

In [None]:
#sort dates ascending
# sort_values returns a new frame; rebind df_index to it
df_index = df_index.sort_values('date')
df_index.head()

In [None]:
#aggregate by days: mean compound score per calendar day
by_day = df_index.set_index('date').groupby(pd.Grouper(freq='D'))
df_index = by_day.mean()
df_index.head()

In [None]:
#clip out the early NaN's
# inspect the slice from 2014-01-01 onward; df_index itself is not modified here
df_index['2014-01-01':].info()

In [None]:
# indicator series: daily mean compound score from 2014-01-01 onward
df_tsi = df_index['2014-01-01':].copy()
plt.plot(df_tsi.index, df_tsi['vad_compound'])
# plt.show must be called; the bare name only evaluates to the function object
plt.show()

In [None]:
# larger plot of the indicator from 2019-01-01 onward
plt.figure(figsize=(20,10))
plt.plot(df_tsi['2019-01-01':].index, df_tsi['vad_compound']['2019-01-01':])
# plt.show must be called; the bare name only evaluates to the function object
plt.show()

In [None]:
# notebook display: first rows of the indicator
df_tsi.head()

In [None]:
# export the indicator
# index=True writes the date index as the first CSV column
df_tsi.to_csv('tsi002.csv', index=True)

In [None]:
# summary of the exported indicator frame
df_tsi.info()

#### This concludes the manual ETL process