In [1]:
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import json
import re
from textblob import TextBlob
import datetime
import numpy as np



# Define Functions

In [2]:
def clean_tweet(tweet):
    '''
    Utility function to clean tweet text by removing links, special characters
    using simple regex statements.
    '''
    return ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", tweet).split())

def get_tweet_sentiment(tweet):
        
    '''
    Utility function to classify sentiment of passed tweet
    using textblob's sentiment method
    '''
    # create TextBlob object of passed tweet text
    analysis = TextBlob(tweet)

    return analysis.sentiment.polarity

def truncate_time(time):
    return time.replace(hour=0 ,minute=0, second=0, microsecond=0)

def emoticons(tweet):
    if ':)' in tweet:
        return 1
    elif ':(' in tweet:
        return -1
    else:
        return 0
    
def alot_likes(tweet):
    if tweet>10:
        return 1
    else:
        return 0

sid = SentimentIntensityAnalyzer()  
def vader(tweet):  
    ss = sid.polarity_scores(tweet)   
    return (ss['compound'])

# First import everything I have from the days (Fill the gaps of the weeks)

In [3]:
list_ = []
for i in range(1,13):
    try:
        df = pd.read_json('Data/By Day/2017'+str(i)+'.json', lines = True, convert_dates = False)
        print (len(df))
        list_.append(df)
    except:
        print ('Problem with :',i)

12457
15693
17942
19270
17310
35487
28723
1552
18410
12667
40330
33427


# Import everithing I have from Weeks

In [4]:
for i in range(1,17):
    try:
        df = pd.read_json('Data/By Week/week'+str(i)+'.json', lines = True, convert_dates = False)
        print (len(df))
        list_.append(df)
    except:
        print ('Problem with :',i)

58337
104261
137591
Problem with : 4
117920
124157
78228
86117
78722
158893
34029
107063
107669
129875
102954
118863


# Import months

In [5]:
for i in range(5,14):
    try:
        df = pd.read_json('Data/By month/month'+str(i)+'.json', lines = True, convert_dates = False)
        print (len(df))
        list_.append(df)
    except:
        print ('Problem with :',i)

621642
610130
270958
136468
776692
802065
1167790
1406214
1089825


# Import fill Datasets

In [6]:
df = pd.read_json("Data/tweets3.json", encoding ="utf-8", lines=True, convert_dates = False)
print (len(df))
list_.append(df)
df = pd.read_json("Data/fill0.json", encoding ="utf-8", lines=True, convert_dates = False)
print (len(df))
list_.append(df)
df = pd.read_json("Data/fill.json", encoding ="utf-8", lines=True, convert_dates = False)
print (len(df))
list_.append(df)

828852
37332
194235


# Concat Them all 

In [7]:
tweets = pd.concat(list_)
print (len(tweets))

9740150


# Remove Duplicates

In [8]:
tweets = tweets.drop_duplicates()
print(len(tweets))

8808386


# See how much of each date 

In [15]:
df = tweets 
for i in range (1,33):
    if i in [1,2,3,4,5,6,7,8,9] :
        df = tweets 
        condition = df['date'] == '2017-01-0'+str(i)
        df = df[condition]
        print (len(df))
        
        if (len(df))< 10000 :
            print (df['date'] )
    else :
        df = tweets 
        condition = df['date'] == '2017-01-'+str(i)
        df = df[condition]
        print (len(df))
        
        if (len(df))< 10000 :
            print (df['date'] )

12457
15693
17942
19217
23085
22120
18487
10353
5745
0        2017-01-09
1        2017-01-09
2        2017-01-09
3        2017-01-09
4        2017-01-09
5        2017-01-09
6        2017-01-09
7        2017-01-09
8        2017-01-09
9        2017-01-09
10       2017-01-09
11       2017-01-09
12       2017-01-09
13       2017-01-09
14       2017-01-09
15       2017-01-09
16       2017-01-09
17       2017-01-09
18       2017-01-09
19       2017-01-09
20       2017-01-09
21       2017-01-09
22       2017-01-09
23       2017-01-09
24       2017-01-09
25       2017-01-09
26       2017-01-09
27       2017-01-09
28       2017-01-09
29       2017-01-09
            ...    
17607    2017-01-09
17608    2017-01-09
17609    2017-01-09
17610    2017-01-09
17611    2017-01-09
17612    2017-01-09
17613    2017-01-09
17614    2017-01-09
17615    2017-01-09
17616    2017-01-09
17617    2017-01-09
17618    2017-01-09
17619    2017-01-09
17620    2017-01-09
17621    2017-01-09
17622    2017-01-09
17623  

# Preprocess the Dataset

In [None]:
#Clean the text
tweets['text'] = tweets['tweet'].map(clean_tweet)

# 1 if more than 10 likes
tweets['A_Lot_Likes'] = tweets['likes'].map(alot_likes)

# 1 If have emoticon :) -1 otherwise and 0 at neutral
tweets['Emoticons'] = tweets['text'].map(emoticons)

#Polarity by vader
tweets['polarity_vader'] = tweets['text'].map(vader)

#Polarity by textblob
tweets['polarity_textblob'] = tweets['text'].map(get_tweet_sentiment)

# Create Timestamp
tweets['timestamp'] = pd.to_datetime(tweets['date'] + ' ' + tweets['time'])
tweets['timestamp'] = tweets['timestamp'].map(truncate_time)

In [None]:
tweets = tweets[['text', 'timestamp', 'polarity_textblob', 'polarity_vader', 'A_Lot_Likes','Emoticons', 'hashtags']]
tweets = tweets.rename(columns={'text': 'Text', 'polarity_textblob': 'Polarity_Textblob', 'timestamp' : 'Timestamp', 'Emoticons' : 'Emoticons',
                               'A_Lot_Likes' : 'A_Lot_Likes','polarity_vader' : 'Polarity_Vader', 'hashtags' : 'Hashtags'})

In [None]:
tweets

In [None]:
tweets.to_csv('tweets_raw_day.csv')
len(tweets)