### Loading the Twitter Data
The Twitter data is collected via a Java/Spring Boot application and stored in ElasticSearch. Python has a nice package to load the data from ElasticSearch.

#### First import the elasticsearch module and make a connection to localhost:9200

In [17]:
from elasticsearch import Elasticsearch

# Client object used by every query below; it targets the local node on port 9200.
es = Elasticsearch(['http://localhost:9200/'])

In [18]:
es

<Elasticsearch([{'port': 9200, 'host': 'localhost'}])>

#### elasticsearch.get()
With the `Elasticsearch.get()` method you can retrieve a single document by specifying the index, the doc_type and the id. Let's look at an example ID.

In [19]:
# Fetch one stored document by its ID to see what a single record looks like.
tweet = es.get(index="twitter", doc_type='tweet', id='rCXMCWEBibQav3Bh316K')

In [20]:
tweet

{'_id': 'rCXMCWEBibQav3Bh316K',
 '_index': 'twitter',
 '_source': {'postDate': '2018-01-18T15:04:37.002Z',
  'tweet': '{"created_at":"Thu Jan 18 15:04:36 +0000 2018","id":954006615258152960,"id_str":"954006615258152960","text":"RT @LucydLtd: Tomorrow, on January 17th Lucyd will be presenting at TNABC. We look forward to connecting with cryptocurrency leaders, and d\\u2026","source":"\\u003ca href=\\"http:\\/\\/twitter.com\\" rel=\\"nofollow\\"\\u003eTwitter Web Client\\u003c\\/a\\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":140654787,"id_str":"140654787","name":"Daniel S Wijaya","screen_name":"DSWijaya","location":"Bogor, Indonesia","url":null,"description":"#Bitcoin #CryptoCurrency #BitcoinTalk #Bounty #Followback #Blockchain #ManchesterUnited","translator_type":"none","protected":false,"verified":false,"followers_count":4292,"friends_count":4

##### That's just the information of 1 collected tweet...

#### Load all twitter data

In [21]:
# First look at the index: request the first 1000 documents and read the reported total.
match_all_body = {
    "query": {"match_all": {}},
    "from": 0,
    "size": 1000,
}
tweets_all = es.search(index="twitter", body=match_all_body)

total_hits = tweets_all['hits']['total']
print("The total number of collected tweets is:", total_hits)

The total number of collected tweets is: 1006009


Create a list with dictionaries for all tweets

In [22]:
# List of hit dictionaries (one per returned tweet) taken from the search response
tweet_list = tweets_all['hits']['hits']

In [23]:
tweet_list[0:10]

[{'_id': 'YDY1HWEB9Wu5eUiggX2t',
  '_index': 'twitter',
  '_score': 1.0,
  '_source': {'postDate': '2018-01-22T09:31:41.357Z',
   'tweet': '{"created_at":"Mon Jan 22 09:31:40 +0000 2018","id":955372382843228162,"id_str":"955372382843228162","text":"Cryptocurrency Exchange OKCoin To Launch In South Korea #ethereum #crypto #btc https:\\/\\/t.co\\/Fk9vMoMp1f","source":"\\u003ca href=\\"https:\\/\\/coinspectator.com\\/\\" rel=\\"nofollow\\"\\u003eCoin Spectator\\u003c\\/a\\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":2276587183,"id_str":"2276587183","name":"Coin Spectator","screen_name":"coinspectator","location":"United Kingdom","url":"https:\\/\\/coinspectator.com","description":"Real-time tool for monitoring the latest cryptocurrency news and analysis from a wide range of trusted sources.","translator_type":"none","protected":false,"verified":fa

#### Lots of information but we only need the postdate and the tweet
For the tweet text we use regular expressions to find the start and end position of the text inside the raw tweet string; the post date can be read directly from the `postDate` field.

In [24]:
# Keep only the "real" tweet text from each hit's raw JSON string.
import re

tweets = []
for hit in tweet_list:
    raw_tweet = hit['_source']['tweet']
    # offsets of the '"text"' key and of the '"source"' key that comes after it
    text_key_pos = re.search('"text"', raw_tweet).start()
    source_key_pos = re.search('"source"', raw_tweet).start()
    # +7 skips '"text":' itself; the slice still carries the surrounding
    # quotes and the trailing comma of the JSON fragment
    tweets.append(raw_tweet[text_key_pos + 7:source_key_pos])

header = 'The first 10 actual tweets stored in this new list are:'
print(header + '\n' + '\n' + str(tweets[0:10]))

The first 10 actual tweets stored in this new list are:

['"Cryptocurrency Exchange OKCoin To Launch In South Korea #ethereum #crypto #btc https:\\/\\/t.co\\/Fk9vMoMp1f",', '"RT @timothychou: Can an Algorithm Tell When Kids Are in Danger? https:\\/\\/t.co\\/ugTVCVfTF1 #DataScience #DataScientist #BigData #IoT #Internet\\u2026",', '"RT @simdaq: Let\'s chat. Join us on Telegram and get instant updates on the project: https:\\/\\/t.co\\/N00L0FQfQU \\n#ico #trading #waves #bitcoin\\u2026",', '"RT @mas_oyama_coin: Masutatsu Oyama crypto currency, Mas Oyama coin (MAS) Start Pre-sale on 15th Feb. Please check official website! https:\\u2026",', '"U.S. Rating Agency to Issue Bitcoin and Cryptocurrency Grades Wednesday https:\\/\\/t.co\\/5N9nBLXs7r via @BTCTN",', '"RT @justinsuntron: Good summary. https:\\/\\/t.co\\/itB8SuDbr3",', '"RT @CNN: The 1% grabbed 82% of all wealth created in 2017 https:\\/\\/t.co\\/fEHnwRDWdX https:\\/\\/t.co\\/qfbPXaiiFA",', '"RT @calvinonline1: NO CRYPTO BANNING..Th

In [25]:
len(tweets)

1000

#### So we've only loaded 1.000 tweets
The issue is that the batch size must be less than or equal to 10.000. Therefore we need a trick: load 10k tweets every iteration and start each new iteration where the previous one stopped.

In [43]:
# A single request may return at most 10.000 hits, so open a scroll and ask
# for pages of exactly that size; this call already returns the first page.
doc = dict(size=10000, query=dict(match_all={}))
tweet_data = es.search(index="twitter", doc_type='tweet', body=doc, scroll='1m')

#### Every iteration we store 10.000 tweets until a total number of 1 million.

In [44]:
# Upper bound on the number of scroll pages: 100 pages x 10.000 hits = 1 million tweets.
MAX_BATCHES = 100

tweets = []
times = []
dates = []

# The counter and the timestamp are deliberately NOT called `iter` / `time`:
# rebinding those names would shadow the builtin iter() and the stdlib module
# name for every later cell that runs in this kernel.
batches_done = 0
while batches_done < MAX_BATCHES:

    tweet_list = tweet_data['hits']['hits']
    if not tweet_list:
        # an empty page means the scroll is exhausted; further calls add nothing
        break

    # store the scroll_id to start the next iteration
    scroll = tweet_data['_scroll_id']

    for hit in tweet_list:

        # the tweet text...
        # NOTE(review): this slices the raw JSON fragment, so each entry keeps its
        # surrounding quotes, trailing comma and JSON escapes (visible in the
        # printed samples); parsing the field as JSON would avoid that but would
        # change the stored data, so it is left as is here.
        tweet_all_info = hit['_source']['tweet']
        tweet_start = re.search('"text"', tweet_all_info).start()
        tweet_end = re.search('"source"', tweet_all_info).start()
        tweets.append(tweet_all_info[tweet_start + 7:tweet_end])

        # the dates... (the first 10 characters of the timestamp are YYYY-MM-DD)
        post_time = hit['_source']['postDate']
        times.append(post_time)
        dates.append(post_time[0:10])

    # update tweet_data, start at the stored scroll_id for the next 10k tweets.
    tweet_data = es.scroll(scroll_id=scroll, scroll='1m')
    batches_done = batches_done + 1

#### Now we have the first 1 million tweets (of the 1.006.009 in the index)

In [45]:
len(tweets)

1000000

In [47]:
# Peek at the first five entries of each collected list.
for collected in (tweets, times, dates):
    print(collected[0:5])

['"Cryptocurrency Exchange OKCoin To Launch In South Korea #ethereum #crypto #btc https:\\/\\/t.co\\/Fk9vMoMp1f",', '"RT @timothychou: Can an Algorithm Tell When Kids Are in Danger? https:\\/\\/t.co\\/ugTVCVfTF1 #DataScience #DataScientist #BigData #IoT #Internet\\u2026",', '"RT @simdaq: Let\'s chat. Join us on Telegram and get instant updates on the project: https:\\/\\/t.co\\/N00L0FQfQU \\n#ico #trading #waves #bitcoin\\u2026",', '"RT @mas_oyama_coin: Masutatsu Oyama crypto currency, Mas Oyama coin (MAS) Start Pre-sale on 15th Feb. Please check official website! https:\\u2026",', '"U.S. Rating Agency to Issue Bitcoin and Cryptocurrency Grades Wednesday https:\\/\\/t.co\\/5N9nBLXs7r via @BTCTN",']
['2018-01-22T09:31:41.357Z', '2018-01-22T09:31:41.363Z', '2018-01-22T09:31:41.855Z', '2018-01-22T09:31:42.362Z', '2018-01-22T09:30:59.854Z']
['2018-01-22', '2018-01-22', '2018-01-22', '2018-01-22', '2018-01-22']


#### Store in a dataframe and save it

In [49]:
import pandas as pd
# Combine the three parallel lists into one frame and pickle it to disk.
frame_columns = {
    'timestamp': times,
    'dates': dates,
    'tweet': tweets,
}
twitter_data = pd.DataFrame(frame_columns)
twitter_data.to_pickle("twitter_data")

#### Load the earlier saved dataframe

In [50]:
# Read the pickled frame back from the "twitter_data" file written above.
df = pd.read_pickle("twitter_data")

In [51]:
df.head()

Unnamed: 0,dates,timestamp,tweet
0,2018-01-22,2018-01-22T09:31:41.357Z,"""Cryptocurrency Exchange OKCoin To Launch In S..."
1,2018-01-22,2018-01-22T09:31:41.363Z,"""RT @timothychou: Can an Algorithm Tell When K..."
2,2018-01-22,2018-01-22T09:31:41.855Z,"""RT @simdaq: Let's chat. Join us on Telegram a..."
3,2018-01-22,2018-01-22T09:31:42.362Z,"""RT @mas_oyama_coin: Masutatsu Oyama crypto cu..."
4,2018-01-22,2018-01-22T09:30:59.854Z,"""U.S. Rating Agency to Issue Bitcoin and Crypt..."


#### Cleaning the data

In [53]:
# Text normalisation: lower-case a tweet and keep only letters, digits and spaces.
import re

remove_chrs = re.compile("[^A-Za-z0-9 ]+")


def clean_up_text(tweet):
    """Return `tweet` lower-cased with every character outside [A-Za-z0-9 ] removed.

    Any "<br />" tag is turned into a single space before the filtering step.
    """
    lowered = tweet.lower()
    without_breaks = lowered.replace("<br />", " ")
    return remove_chrs.sub("", without_breaks)

In [54]:
# Run every stored tweet through clean_up_text, keeping the original order.
tweets_cleaned = [clean_up_text(raw_tweet) for raw_tweet in df['tweet']]

#### Get the indices of the tweets from the Word List

In [55]:
# import libraries
import pandas as pd
import numpy as np

In [56]:
# load the word list and convert it to a plain Python list
words = np.load('words.npy').tolist()

#### Split them to lists
This must be done to find the word indices and vectors

In [65]:
# Tokenise on whitespace: one list of words per cleaned tweet.
tweets_split_cleaned = [cleaned.split() for cleaned in tweets_cleaned]

In [66]:
# Word count of every tokenised tweet.
lengths = [len(tokens) for tokens in tweets_split_cleaned]

In [68]:
max(lengths)

42

In [69]:
# Single-slot cache: the word -> index table of the most recently used vocabulary.
_LOOKUP_CACHE = {"vocab": None, "size": -1, "table": {}}


def _first_index_table(vocab):
    """Return a dict mapping each word to its first position in `vocab`.

    The table is rebuilt only when a different vocabulary object (or one of a
    different length) is passed, so repeated calls cost a dict lookup instead
    of a full scan of the word list.
    """
    if _LOOKUP_CACHE["vocab"] is not vocab or _LOOKUP_CACHE["size"] != len(vocab):
        table = {}
        for position, word in enumerate(vocab):
            # setdefault keeps the first occurrence, matching list.index()
            table.setdefault(word, position)
        _LOOKUP_CACHE.update(vocab=vocab, size=len(vocab), table=table)
    return _LOOKUP_CACHE["table"]


def turn_sentence_to_indices(sentence, max_len=75, vocab=None):
    """Encode a tokenised sentence as a fixed-length vector of word indices.

    Parameters
    ----------
    sentence : list of str
        The words of one tweet, in order.
    max_len : int, optional
        Length of the returned vector (default 75, the value used so far).
        Words beyond this position are dropped instead of raising IndexError.
    vocab : sequence of str, optional
        Word list to look the words up in. Defaults to the module-level
        `words` list.

    Returns
    -------
    numpy.ndarray
        int32 array of shape (max_len,). Position i holds the index of
        sentence[i] in the vocabulary; unknown words and unused trailing
        positions are 0.
    """
    if vocab is None:
        vocab = words
    table = _first_index_table(vocab)

    indices = np.zeros(max_len, dtype='int32')
    for i, word in enumerate(sentence[:max_len]):
        # unknown words fall back to 0, as before
        indices[i] = table.get(word, 0)
    return indices

In [70]:
# Encode every tokenised tweet as a fixed-length vector of word-list indices.
tweet_indices = [turn_sentence_to_indices(tokens) for tokens in tweets_split_cleaned]

In [71]:
# Sanity check: how many tweets were encoded, and what the first five vectors look like.
print(len(tweet_indices))
print(tweet_indices[0:5])

1000000
[array([    0,   644,     0,     5,  1784,     7,   140,   575,     0,
       72245, 60145,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0], dtype=int32), array([ 33223,      0,     87,     30,  12598,   1362,     62,   1814,
           33,      7,   3053,      0,      0,      0,      0, 202255,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,      0,      0,      0,      0,      0,
            0,      0,      0,   

In [72]:
# Convert the list of index vectors to a numpy array (the name is rebound from list to array).
tweet_indices = np.asarray(tweet_indices)

In [73]:
# save the indices: write the array built in the previous cell to disk via np.save
np.save("tweet_indices", tweet_indices)