In [1]:
import pandas as pd
import re
import datetime
import numpy as np

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from textblob import TextBlob

# Define Functions

In [2]:
def clean_tweet(tweet):
    '''
    Utility function to clean tweet text by removing links, special characters
    using simple regex statements.
    '''
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

def get_tweet_sentiment(tweet):
        
    '''
    Utility function to classify sentiment of passed tweet
    using textblob's sentiment method
    '''
    # create TextBlob object of passed tweet text
    analysis = TextBlob(tweet)

    return analysis.sentiment.polarity

def truncate_time(time):
    return time.replace(hour=0 ,minute=0, second=0, microsecond=0)

sid = SentimentIntensityAnalyzer()  
def vader(tweet):  
    ss = sid.polarity_scores(tweet)   
    return (ss['compound'])

def pos_or_not(tweet_sentiment):
    if(tweet_sentiment>0):
        return 1
    else :
        return 0

# Load tweets

In [3]:
tweets = pd.read_csv('Data/tweets.csv')
print (len(tweets))

  interactivity=interactivity, compiler=compiler, result=result)


16865848


# Hashtags + Text = Lowercase

In [4]:
tweets['hashtags'] = tweets['hashtags'].str.lower()
tweets['tweet'] = tweets['tweet'].str.lower()

# Remove Duplicates 

In [5]:
tweets = tweets.drop_duplicates(subset = 'tweet')
print(len(tweets))

11385573


# Map the functions

In [6]:
#Clean the text
tweets['text'] = tweets['tweet'].map(clean_tweet)

#Polarity by vader
tweets['polarity_vader'] = tweets['text'].map(vader)

#Polarity by textblob
tweets['polarity_textblob'] = tweets['text'].map(get_tweet_sentiment)

# Create Timestamp
tweets['timestamp'] = pd.to_datetime(tweets['date'] + ' ' + tweets['time'])
tweets['timestamp'] = tweets['timestamp'].map(truncate_time)

In [7]:
tweets = tweets[['text', 'timestamp', 'polarity_textblob', 'polarity_vader', 'hashtags']]
tweets = tweets.rename(columns={'text': 'Text', 'polarity_textblob': 'Polarity_Textblob', 'timestamp' : 'Timestamp', 'polarity_vader' : 'Polarity_Vader', 'hashtags' : 'Hashtags'})

# Remove tweets with noise

# 1. Hashtags

In [8]:
to_drop = ['#lottery', '#makemoney', '#free', '#bet', '#freebitcoin', '#webbot', '#freeminingsoftware',
          '#yabtcl', '#bitcoinbet','#tradingtool', '#trading', '#residualbitcoin', '#faucet', '#venezuela', '#casino'
          '#sportsbook','#soccer', '#game', '#simplefx', '#nitrogensportsbook', '#makemoney', '#makeyourownlane', 
          '#livescoregoal', '#livescore', '#bitcoinprice', '#price', '#mpgvip', '#footballcoin', '#earnbitcoin']

for i in to_drop:
    tweets = tweets[tweets.Hashtags.str.contains(i) == False]
print (len(tweets))

10830629


# 2. Spesific Words

In [9]:
to_drop = ['free', 'trading', 'price', 'win', 'game', 'performing currency', 'altcoin', 'fintech', 'pic']

for i in to_drop:
    tweets = tweets[tweets.Text.str.contains(i) == False]
print (len(tweets))

5867274


# Merge tweets By Timestamp

In [10]:
tweets_by_timestamp = tweets.groupby(['Timestamp'], as_index=False).mean()

# Count Tweets per Day

In [11]:
count = pd.DataFrame(data = tweets.Timestamp.value_counts())
count.reset_index(inplace=True)
count.columns = ['Timestamp', 'Count_of_Tweets']
tweets_by_timestamp = pd.merge(tweets_by_timestamp, count)

# Count Possitive tweets per Day

In [12]:
tweets['Positive_Or_Not'] = tweets['Polarity_Vader'].map(pos_or_not)
tweets_count = tweets.groupby(['Timestamp'], as_index=False).sum()
tweets_by_timestamp['Count_Of_Possitive_Tweets'] = tweets_count['Positive_Or_Not']

# Load Bitcoin and Google Trends Datasets 

In [13]:
btc = pd.read_csv('Data/Coinbase_BTCUSD_d.csv', skiprows=1)
btc = btc.rename(columns = {"Date" : "Timestamp"})
btc['Timestamp'] = pd.to_datetime(btc['Timestamp'])

google_trends = pd.read_csv('Data/google_trends.csv')
google_trends['Timestamp'] = pd.to_datetime(google_trends['Timestamp'])

print(len(tweets))
print(len(btc))
print(len(google_trends))

5867274
1551
365


In [14]:
data1 = pd.merge(tweets_by_timestamp, btc , on='Timestamp', how='inner')
data = pd.merge(data1, google_trends , on='Timestamp', how='inner')

#Last modifications
data.drop('Unnamed: 0', axis=1,inplace = True)

In [15]:
data.to_csv('data.csv')
len(data)

364