In [None]:
from google.colab import drive
import os

# Mount Google Drive in the Colab runtime, then make the project folder the
# working directory so the relative "models/..." and "data/..." paths used
# later in this notebook resolve against it.
drive.mount('/content/drive')
os.chdir('/content/drive/My Drive/Colab Notebooks/tcc')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import tensorflow as tf
import pickle
import pandas as pd

from modules.preprocessing import *

In [None]:
class EmotionDetect:
    """Scores a text against six emotions using one Keras model per emotion.

    Every model is built from the architecture stored in
    ``models/bi-gru300.json`` and then loaded with the weights found in
    ``models/<emotion>_weights.h5``. Tokens are converted to integer indices
    through the vocabulary of the pickled word-vector object.
    """

    def __init__(self, vectors_name, w_folder):
        """Load the word vectors and build all per-emotion models.

        Args:
            vectors_name: Path of the pickled word-vector object. The object
                must expose ``wv.vocab`` (token -> entry with ``.index``) and
                ``wv.vectors``.
            w_folder: Stored as ``self.w_folder``.
                NOTE(review): the model paths used by ``model_gru`` and
                ``loadModels`` are hard-coded to ``models/`` and do not read
                this value -- confirm whether it should drive them.
        """
        self.w_folder = w_folder
        self.size_word2vec = 300
        self.max_sequences = 35
        # SECURITY: pickle.load can execute arbitrary code; only point this at
        # vector files you created yourself or otherwise trust.
        # The context manager closes the file handle, which the previous
        # ``pickle.load(open(...))`` form left open.
        with open(vectors_name, "rb") as vectors_file:
            self.vectors = pickle.load(vectors_file)
        self.emotions = ["anger", "disgust", "fear", "joy", "sadness", "surprise"]
        self.tokenizer = Tokenizer()
        self.loadModels()

    def model_gru(self, embedding_matrix, embed_size):
        """Return a new Keras model built from ``models/bi-gru300.json``.

        ``embedding_matrix`` and ``embed_size`` are accepted for interface
        compatibility but are not used: the architecture comes entirely from
        the JSON file.
        """
        # ``with`` guarantees the file is closed even if reading fails.
        with open("models/bi-gru300.json", "r") as json_file:
            loaded_model_json = json_file.read()
        return tf.keras.models.model_from_json(loaded_model_json)

    def loadModels(self):
        """Populate ``self.models`` with one weight-loaded model per emotion."""
        self.models = dict()
        for s in self.emotions:
            self.models[s] = self.model_gru(self.vectors.wv.vectors, self.size_word2vec)
            self.models[s].load_weights("models/" + s + "_weights.h5")

    def getIndex(self, t):
        """Return the vocabulary index of token ``t``, or 0 when it is unknown.

        Only a missing vocabulary key is treated as "unknown token". Any other
        failure (for example a vector object without ``wv.vocab``) now
        propagates instead of silently turning every token into index 0.
        """
        try:
            return self.vectors.wv.vocab[t].index
        except KeyError:
            return 0

    def predict(self, text):
        """Return a dict mapping each emotion name to its model's score.

        The text is tokenized with ``self.tokenizer.cleanText``, mapped to
        vocabulary indices (unknown tokens become 0) and passed through
        ``pad_sequences`` with ``maxlen=self.max_sequences``. The score kept
        for each emotion is element ``[0][0]`` of that model's prediction.
        """
        x = self.tokenizer.cleanText(text)
        x = [self.getIndex(t) for t in x]
        x = tf.keras.preprocessing.sequence.pad_sequences(
            [x], maxlen=self.max_sequences
        )
        return dict(
            zip(self.emotions, [self.models[s].predict(x)[0][0] for s in self.emotions])
        )

In [None]:
# classifier = EmotionDetect('models/ft_300.pkl', 'models')

In [None]:
from datetime import timedelta, date
from dateutil.relativedelta import relativedelta
from random import randint
from time import sleep

!pip install GetOldTweets3 ratelimit
import GetOldTweets3 as got
from ratelimit import limits, sleep_and_retry

def daterange(start_date, end_date):
    """Yield each day from ``start_date`` up to, but not including, ``end_date``.

    Nothing is yielded when ``end_date`` is not after ``start_date``.
    """
    span_days = int((end_date - start_date).days)
    offset = 0
    while offset < span_days:
        yield start_date + timedelta(offset)
        offset += 1

@sleep_and_retry
@limits(calls=12, period=900)  # at most 12 calls per 900-second (15-minute) window
def get_tweets_request(got, tweetCriteria):
    """Run one rate-limited tweet search.

    Args:
        got: The GetOldTweets3 module (or an object exposing
            ``manager.TweetManager``).
        tweetCriteria: Search criteria passed straight to ``getTweets``.

    Returns:
        Whatever ``got.manager.TweetManager.getTweets`` returns.
    """
    tweet_manager = got.manager.TweetManager
    return tweet_manager.getTweets(tweetCriteria)


def get_data(word, year=2020, month=1):
    """Collect tweets matching ``word`` for every day of one calendar month.

    One search is issued per day, built with language 'en', top tweets
    enabled and ``setMaxTweets(1000)``. After each day the tweets gathered so
    far are written to ``data/data_autosave.csv`` and the function sleeps for
    a random 1-10 seconds before the next request.

    Args:
        word: Query string to search for.
        year: Year of the month to scrape.
        month: Month (1-12) to scrape.

    Returns:
        A list of dicts, one per tweet, with the keys ``date``, ``id``,
        ``username``, ``text``, ``favorites``, ``retweets``, ``replies``
        and ``geo``.
    """
    # Emotion columns are currently disabled. When re-enabled, each tweet was
    # meant to be scored with classifier.predict(t.text), storing both the raw
    # '<emotion>_pred' value and a 0/1 flag using the thresholds
    # anger > 0.22, surprise > 0.13 and > 0.1 for disgust, fear, joy, sadness.
    kept_fields = (
        'date', 'id', 'username', 'text',
        'favorites', 'retweets', 'replies', 'geo',
    )

    month_start = date(year, month, 1)
    month_end = date(year, month, 1) + relativedelta(months=1)
    day_count = (month_end - month_start).days

    tweets = []
    for current_day in tqdm(daterange(month_start, month_end), total=day_count):
        since = current_day.strftime("%Y-%m-%d")
        until = (current_day + timedelta(days=1)).strftime("%Y-%m-%d")

        criteria = (
            got.manager.TweetCriteria()
            .setQuerySearch(word)
            .setSince(since)
            .setUntil(until)
            .setLang('en')
            .setTopTweets(True)
            .setMaxTweets(1000)
        )
        fetched = get_tweets_request(got, criteria)

        for item in fetched:
            tweets.append({field: getattr(item, field) for field in kept_fields})

        # Checkpoint everything collected so far, then pause before the next day.
        checkpoint = pd.DataFrame(tweets)
        checkpoint.to_csv('data/data_autosave.csv')
        sleep(randint(1, 10))
    return tweets



In [None]:
def get_plot(search, start_m=1, start_y=2019, end_m=1, end_y=2020):
    """Scrape tweets for ``search`` month by month, saving one CSV per month.

    For every ``(year, month)`` pair produced by ``month_year_iter`` the
    tweets returned by ``get_data`` are written to
    ``data/tweets_<search>_<year><month>.csv``. A month that fails is reported
    on stdout and skipped so the remaining months still run.

    Despite the name, no plot is produced at present: the code that
    aggregated daily emotion counts and saved a figure under ``figures/`` is
    disabled together with the emotion columns in ``get_data``.

    Args:
        search: Query string, also used in the output file name.
        start_m, start_y: First month and year, passed to ``month_year_iter``.
        end_m, end_y: End month and year, passed to ``month_year_iter``.
    """
    for y, m in month_year_iter(start_m, start_y, end_m, end_y):
        # Built outside the try block so the error message can always use it.
        name = str(y) + str(m)
        target = "data/tweets_" + search + "_" + name
        try:
            tweets = get_data(search, year=y, month=m)
            df = pd.DataFrame(tweets)
            df.to_csv(target + ".csv")
        except Exception as exc:
            # ``Exception`` rather than a bare ``except`` so Ctrl-C
            # (KeyboardInterrupt) and SystemExit still abort a long scrape.
            # The cause is reported instead of being discarded.
            print("[ERROR]" + target + ": " + repr(exc))

In [None]:
# get_plot('floyd', 6, 2020, 7, 2020) 

In [None]:
# get_plot('police', 5, 2020, 7, 2020) 

In [None]:
# Scrape tweets matching 'racism'. The recorded output below shows two progress
# bars (31 and 30 days), which suggests May and June 2020 were fetched and the
# end month (7) itself was not.
get_plot('racism', 5, 2020, 7, 2020) 

100%|██████████| 31/31 [35:59<00:00, 69.65s/it]
100%|██████████| 30/30 [39:55<00:00, 79.87s/it] 


In [None]:
# Scrape '#america' tweets for the same month in two consecutive years. Each
# call shows a single 31-day progress bar in the recorded output below, which
# suggests only July was fetched for 2018 and for 2019.
get_plot('#america', 7, 2018, 8, 2018)
get_plot('#america', 7, 2019, 8, 2019)

100%|██████████| 31/31 [30:24<00:00, 58.84s/it]
100%|██████████| 31/31 [44:10<00:00, 85.48s/it] 
