# 1. Importing the Dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
dataset_path = '../../dataset/'

In [3]:
df_news = pd.read_csv(dataset_path + '/news/news [only_date].csv')
df_price = pd.read_csv(dataset_path + '/price/btc_usd_daily.csv')

# 2. Preprocessing

In [4]:
# Check whether each element in the df_news matches an element in the df_price
df_news[df_news.date.isin(df_price.date)].shape

(17047, 3)

In [5]:
# Remove the unncessary features
df_new_price = df_price.drop(['close', 'high', 'low', 'change', 'volume', 'market_cap'], axis=1)

In [6]:
# Calculate the difference in opening prices between the following and current day.
df_diff_price = df_new_price.set_index('date').diff(periods=1)
df_diff_price['date'] = df_diff_price.index
df_diff_price = df_diff_price.reset_index(drop=True)

In [7]:
df_diff_price = df_diff_price.rename(columns={'open': 'change_in_open'})

In [8]:
df_diff_price.head()

Unnamed: 0,change_in_open,date
0,,2020-03-24
1,-605.27,2020-03-23
2,354.19,2020-03-22
3,20.96,2020-03-21
4,-14.87,2020-03-20


In [9]:
# Remove top row since it has a null value
df_diff_price = df_diff_price[df_diff_price['change_in_open'].notnull()]
df_diff_price.head()

Unnamed: 0,change_in_open,date
1,-605.27,2020-03-23
2,354.19,2020-03-22
3,20.96,2020-03-21
4,-14.87,2020-03-20
5,-946.23,2020-03-19


In [10]:
# Create a list of the opening prices and their corresponding daily headlines from the news
changes = []
headlines = []

for row in df_diff_price.iterrows():

    daily_headlines = []
    
    date = row[1]['date']
    change_in_open = row[1]['change_in_open']
    
    changes.append(change_in_open)

    for news_row in df_news[df_news['date'] == date].iterrows():
        headline = news_row[1]['headline']
        daily_headlines.append(headline)
        
    # Track progress
    headlines.append(daily_headlines)

# 3. Data Cleaning 

In [11]:
import re

from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [12]:
# A list of contractions from http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}

In [13]:
def stem_tokens(tokens, stemmer):
    stemmed = []
    for item in tokens:
        stemmed.append(stemmer.stem(item))
    return stemmed

In [14]:
def clean_text(text, remove_stopwords = True):
    '''Remove unwanted characters and format the text to create fewer nulls word embeddings'''
    
    # Convert words to lower case
    text = text.lower()
    
    # Replace contractions with their longer forms 
    if True:
        text = text.split()
        new_text = []
        for word in text:
            if word in contractions:
                new_text.append(contractions[word])
            else:
                new_text.append(word)
        text = " ".join(new_text)
    
    # Format words and remove unwanted characters
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'0,0', '00', text) 
    text = re.sub(r'[_"\-;%()|.,+&=*%.,!?:#@\[\]]', ' ', text)
    text = re.sub(r'\'', ' ', text)
    text = re.sub(r'\$', ' $ ', text)
    text = re.sub(r'j k ', ' jk ', text)
    text = re.sub(r' s ', ' ', text)
    text = re.sub(r' yr ', ' year ', text)
    text = re.sub(r' l g b t ', ' lgbt ', text)
    
    # Optionally, remove stop words
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)

    return text

In [15]:
stemmer = PorterStemmer()

In [16]:
# Clean the headlines
cleaned_headlines = []

for daily_headlines in headlines:
    cleaned_daily_headlines = []
    for headline in daily_headlines:
        cleaned_daily_headlines.append(clean_text(headline))
    cleaned_headlines.append(cleaned_daily_headlines)

In [17]:
cleaned_headlines[10]

['rare bitcoin price chart pattern may cryptocurrency’s last hope',
 'want sell bitcoin btc $ 6k charts show could ultimate bottom',
 'bitcoin price drops $ 3 637 rebounds $ 5 200 within minutes',
 'don’t panic previously impervious support could save bitcoin',
 'bitcoin crashes $ 7 000 peter schiff calls btc sinking ship',
 'nouriel roubini says bitcoin btc zero hedge value price crashes $ 5k',
 'analyst “least riskiest” time buy bitcoin',
 'bitcoin btc trader mati greenspan sells atcoins',
 'top analyst claims bitcoin bottom close here’s trend watch',
 'crypto king barry silbert says buys bitcoin btc google trends show alone',
 'trading legend peter brandt called worst bitcoin btc price crash since 2013',
 'bitcoin decent full hedge fundstrat’s lee',
 'bitcoin $ 1k possible warns veteran trader peter brandt',
 'mike novogratz says confidence bitcoin btc evaporated',
 'bitcoin carving short term bottom near $ 4k massive 50 decline',
 'first time i’ve wanted buy bitcoin btc edward snow

# 4. Word Vectors

In [18]:
def print_dictionary(dictionary, n=5):
    i = 0
    keys = dictionary.keys()
    keys = list(keys)
    while (i < len(keys)) and (i < n):
        key = keys[i]
        value = dictionary[key]
        print("'{}': {}".format(key, value))
        i+=1

**- Vocabulary**

In [19]:
# Find the number of times each word was used and the size of the vocabulary
word_counts = {}

for daily_headlines in cleaned_headlines:
    for headline in daily_headlines:
        for word in headline.split():
            if word not in word_counts:
                word_counts[word] = 1
            else:
                word_counts[word] += 1

print("the size of the vocabulary is {}.".format(len(word_counts)))

the size of the vocabulary is 16089.


In [20]:
print_dictionary(word_counts)

'bitcoin': 9020
'risks': 54
'falling': 31
'$': 3361
'2': 194


**- Pre-trained Word Embeddings**

In [21]:
# Load Glove's embeddings
glove_word_vectors = {}

with open('./glove.840B.300d.txt', encoding='utf-8') as file:
    for line in file:
        values = line.split(' ')
        
        word = values[0]
        word_vector = np.asarray(values[1:], dtype='float32')
        
        glove_word_vectors[word] = word_vector

In [22]:
word_vector = glove_word_vectors['hello']
word_vector.size

300

In [23]:
# Find the number of words that are missing from GloVe, 
# and are used more than our threshold.

missing_words = 0
threshold = 100

for word, count in word_counts.items():
    if count > threshold:
        if word not in glove_word_vectors:
            missing_words += 1
            
missing_ratio = round(missing_words/len(word_counts),4)*100
            
print("Number of words missing from GloVe:", missing_words)
print("Percent of words that are missing from vocabulary: {}%".format(missing_ratio))

Number of words missing from GloVe: 11
Percent of words that are missing from vocabulary: 0.06999999999999999%


In [24]:
# Limit the vocab that we will use to words that appear ≥ threshold 
# or are in GloVe

#dictionary to convert words to integers
vocab_to_int = {} 

value = 0
for word, count in word_counts.items():
    if count >= threshold or word in glove_word_vectors:
        vocab_to_int[word] = value
        value += 1

In [25]:
# Special tokens that will be added to our vocab
codes = ["<UNK>","<PAD>"]   

# Add codes to vocab
for code in codes:
    vocab_to_int[code] = len(vocab_to_int)

In [26]:
# Dictionary to convert integers to words
int_to_vocab = {}
for word, value in vocab_to_int.items():
    int_to_vocab[value] = word

In [27]:
print_dictionary(vocab_to_int)

'bitcoin': 0
'risks': 1
'falling': 2
'$': 3
'2': 4


In [28]:
usage_ratio = round(len(vocab_to_int) / len(word_counts),4)*100

print("Total Number of Unique Words:", len(word_counts))
print("Number of Words we will use:", len(vocab_to_int))
print("Percent of Words we will use: {}%".format(usage_ratio))

Total Number of Unique Words: 16089
Number of Words we will use: 12531
Percent of Words we will use: 77.89%


In [29]:
# Need to use 300 features for embedding dimensions to match Glove's vectors
embedding_dimension = 300

vocabulary_size = len(vocab_to_int)

# Create a matrix with default values of zero
word_vectors = {}

for word, i in int_to_vocab.items():
    if word in glove_word_vectors:
        word_vectors[i] = glove_word_vectors[word]
    else:
        # If word not in GloVe, create a random embedding for it
        random_word_vector = np.array(np.random.uniform(-1.0, 1.0, embedding_dimension))
        word_vectors[i] = random_word_vector

In [30]:
# Check if value matches len(vocab_to_int)
print(len(word_vectors))

12531


In [31]:
del glove_word_vectors

# 5. Print word_vectors to a file

In [32]:
import csv

In [33]:
word_vectors['bitcoin']

array([ 0.56166086, -0.42181205,  0.4805779 , -0.42246179, -0.26790248,
       -0.32090736, -0.91195876, -0.90963409, -0.56749001,  0.94145851,
       -0.26286334, -0.34662148,  0.43378484, -0.28257271,  0.26442837,
       -0.43119897, -0.55740414, -0.99996626, -0.98631349,  0.63943101,
        0.92791511,  0.76446702, -0.91665483,  0.52439546,  0.78965406,
        0.81536737,  0.97799683,  0.09204672, -0.55212957,  0.0411007 ,
       -0.70615013,  0.21369788,  0.86113296,  0.63482808, -0.30654069,
        0.44628239,  0.8688643 ,  0.67241658, -0.58257331,  0.23834374,
       -0.7588579 ,  0.16176557, -0.27261337,  0.90107218,  0.26564669,
        0.22922821, -0.54029966, -0.76856403, -0.45154211, -0.82929492,
       -0.72300173, -0.7331348 ,  0.2440029 ,  0.47966638,  0.80892526,
       -0.34214966,  0.31892246,  0.96450976,  0.67479979,  0.46164475,
       -0.57025066,  0.03859755, -0.46263229,  0.79078072, -0.28424882,
       -0.41528115, -0.11224276,  0.5306735 , -0.87279022, -0.08

In [34]:
def get_features(word_vector, key):
    features = []
    for feature in word_vector:
        features.append(feature)
        
    return features   

In [35]:
csv_file = "word_vectors.csv"
csv_columns = ['key', 'vector']
with open(csv_file, 'w', newline='', encoding="utf-8") as file:
        writer = csv.writer(file, quoting=csv.QUOTE_NONNUMERIC)
        writer.writerow(csv_columns)
        
        for key in word_vectors.keys():
            word_vector = word_vectors[key]    
            features = get_features(word_vector, key)    
            writer.writerow([key, features])