In [1]:
import os
import json
import pandas as pd
import numpy as np
import requests
import time

import tweepy

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 
import re

from __private import CONSUMER_API_KEY
from __private import CONSUMER_API_KEY_SECRET
from __private import ACCESS_TOKEN
from __private import ACCESS_TOKEN_SECRET

In [2]:
# tweepy API
# Authenticate with the credentials imported from __private and build one
# shared client that every lookup helper in this notebook reuses.
auth = tweepy.OAuthHandler(CONSUMER_API_KEY, CONSUMER_API_KEY_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
# wait_on_rate_limit makes the client sleep until the rate-limit window resets
# instead of raising. Without it, rate-limit errors are swallowed by
# get_tweet_retweet_favorite_by_id and recorded as 0 retweets / 0 favorites.
api = tweepy.API(auth, wait_on_rate_limit=True)
    
def get_tweet_retweet_favorite_by_id(tweet_id):
    """Look up the current retweet and favorite counts of one tweet.

    Args:
        tweet_id: id of the tweet, passed straight to ``api.get_status``.

    Returns:
        A ``(retweet_count, favorite_count)`` tuple. If the lookup fails for
        any reason (the request raises, or the payload lacks one of the two
        count fields) the result is ``(0, 0)``, so callers cannot tell a
        failed lookup apart from a tweet with no engagement.
    """
    try:
        status = api.get_status(tweet_id)
        tweet = status._json
        retweet = tweet['retweet_count']
        favorite = tweet['favorite_count']
    except Exception:
        # Best-effort fallback. Catch Exception rather than using a bare
        # except so that KeyboardInterrupt / SystemExit still stop a
        # long-running collection loop.
        retweet = 0
        favorite = 0
    return retweet, favorite

# get_tweet_retweet_favorite_by_id(726061616869961728)

In [3]:
# Sentiment analysis: one module-level VADER analyzer instance, created once
# and reused by analyzeSentimentByVader_one_sentence for every sentence.
analyzer = SentimentIntensityAnalyzer()

def analyzeSentimentByVader_one_sentence(sentence):
    """Classify one sentence from its VADER compound score.

    Args:
        sentence: text handed to ``analyzer.polarity_scores``.

    Returns:
        1 when the compound score is >= 0.05 (positive), -1 when it is
        <= -0.05 (negative), 0 when it lies strictly between the two
        (neutral). -2 is a sentinel for a score that satisfies none of the
        comparisons (e.g. NaN).
    """
    compound = analyzer.polarity_scores(sentence)['compound']

    if compound >= 0.05:
        return 1  # positive
    if compound <= -0.05:
        return -1  # negative
    if -0.05 < compound <= 0.05:
        return 0  # neutral

    # Only reachable when every comparison above is false.
    return -2

In [9]:
# Months to process, as 'YYYY_MM' keys. Each key is spliced into the input
# file name and is also the key of every per-month result dict built below.
# The full 2016_04 .. 2019_04 range is kept here for reference; only a single
# month is enabled (presumably to stay under the API rate limit — TODO confirm).
# year_month_key_list = ['2016_04','2016_05','2016_06','2016_07','2016_08','2016_09','2016_10','2016_11','2016_12',
#                        '2017_01','2017_02', '2017_03','2017_04','2017_05','2017_06','2017_07','2017_08','2017_09',
#                        '2017_10','2017_11','2017_12', '2018_01', '2018_02', '2018_03', '2018_04', '2018_05', 
#                        '2018_06', '2018_07', '2018_08', '2018_09', '2018_10', '2018_11', '2018_12', '2019_01', 
#                        '2019_02', '2019_03', '2019_04' ]
year_month_key_list = ['2017_12']
# Directory holding the per-month tweet files. It is joined to the file name by
# plain string concatenation, so the trailing slash is required.
file_path = "/mnt/volume-5T/result/"

In [10]:
# Per-month accumulators keyed by the 'YYYY_MM' string. Within one month the
# lists are parallel: index i of each list describes the same tweet. The
# ranking cell further down indexes all of them by position, so every tweet
# must be appended to all five lists or to none of them.
text_total_dict = {}          # tweet text with t.co links removed
text_total_dict_no_RE = {}    # tweet text exactly as stored in the file
sentiment_total_dict = {}     # sentiment label: 1 / 0 / -1
retweet_total_dict = {}       # retweet count from the API lookup
favorite_total_dict = {}      # favorite count from the API lookup

for year_month in year_month_key_list:
    # Optional pause between months (20 minutes).
    # time.sleep(1200)

    # Choose exactly one input family by (un)commenting a file_name line.
    # e-cigarette
    # file_name = 'juliana_SF_allECigarette_'+year_month+'_categoryKeyWord.json'

    # tobacco
    file_name = 'juliana_allSF_tobacco_'+year_month+'.json'

    # flavored tobacco
    # file_name = 'juliana_allSF_flavored_tobacco_'+year_month+'.json'

    file = file_path + file_name

    # initialization
    text_total_dict[year_month] = []
    text_total_dict_no_RE[year_month] = []
    sentiment_total_dict[year_month] = []
    retweet_total_dict[year_month] = []
    favorite_total_dict[year_month] = []

    # The file holds one JSON document per line.
    with open(file, "r") as f:
        everyLines = f.readlines()
    num_of_tweets = len(everyLines)
    print('length of tweets in month %s: %d' % (year_month, num_of_tweets))

    for line in everyLines:
        try:
            lineInJson = json.loads(line)
        except json.JSONDecodeError:
            print('Cannot load the line: ', line)
            # Skip the record. Falling through would silently reuse the
            # previous iteration's lineInJson and record a duplicate tweet.
            continue

        text_inLine = lineInJson["text"]
        # Drop t.co short links (the prefix plus the following 10 characters)
        # before scoring. Raw string: same pattern, no invalid-escape warnings.
        text_afterRE = re.sub(r'https?:\/\/t\.co\/[\s\S]{10}', '', text_inLine)

        # get sentiment
        sentiment_sentence = analyzeSentimentByVader_one_sentence(text_afterRE)
        if sentiment_sentence == -2:
            print('Something wrong. The sentiment cannot be -2')
            # Skip the whole record so the parallel lists stay aligned, and
            # avoid spending an API call on a tweet that will not be used.
            continue

        # get retweet and favorite
        id_inLine = lineInJson['id']
        retweet_inLine, favorite_inLine = get_tweet_retweet_favorite_by_id(id_inLine)

        # Append to every list together: same tweet, same index.
        text_total_dict_no_RE[year_month].append(text_inLine)
        text_total_dict[year_month].append(text_afterRE)
        sentiment_total_dict[year_month].append(sentiment_sentence)
        retweet_total_dict[year_month].append(retweet_inLine)
        favorite_total_dict[year_month].append(favorite_inLine)

print('length of text_total_dict: ', len(text_total_dict))
print('length of sentiment_total_dict: ', len(sentiment_total_dict))
print('sentiment_total_dict: ', sentiment_total_dict)
print('retweet_total_dict: ', retweet_total_dict)
print('favorite_total_dict: ', favorite_total_dict)

length of tweets in month 2017_12: 485
length of text_total_dict:  1
length of sentiment_total_dict:  1
sentiment_total_dict:  {'2017_12': [1, 0, -1, -1, 1, 0, 0, 0, 0, 1, 0, 0, 0, -1, 1, 1, 0, 0, 0, 0, 0, 1, -1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, -1, 0, 0, -1, 0, 1, 0, -1, 0, 0, 0, 0, 0, 0, 0, -1, 1, -1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, -1, 1, 0, -1, 1, -1, 0, 0, 0, 0, -1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, -1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, -1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, -1, 1, 0, 0, 1, -1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, -1, 0, 1, 0, 0, 0, 1, -1, -1, 0, 0, 0, 0, 0, 1, 0, 0, -1, 0, -1, 0, 1, 0, -1, 0, -1, 1, 0, 0, 0, 1, -1, 1, 0, 0, -1, -1, -1, 0, 0, 0, 1, 1, 0, 0, -1, 0, 0, 1, 1, 0, 1, 1, -1, 0, -1, -1, -1, 0, -1, 0, 0, 1, 0, -1, -1, 0, 1, 0, 0, -1, 0, 0, 0, -1, 0, 0, -1, 1, 1, 0, -1, -1, 0, 0, 0, 0, 0, -1, 0, 0, 0, 1, 0, 1, -1, 0, 1, 1, 1, 1, 0, 1, -1, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, -1, 0, 0, -1, 0, 0, 0, 1,

### Get the tweets with the most favorites and retweets

In [11]:
# Group every tweet of every month by sentiment class and by "weight", where
# weight = retweet count + favorite count. Resulting layout of each dict:
#   {year_month: {weight: [(favorite, retweet, tweet_text), ...]}}
positive_weight_num_dict = {month_key: {} for month_key in year_month_key_list}
neutral_weight_num_dict = {month_key: {} for month_key in year_month_key_list}
negative_weight_num_dict = {month_key: {} for month_key in year_month_key_list}

# Sentiment label -> the dict that collects tweets carrying that label.
buckets_by_label = {
    1: positive_weight_num_dict,
    0: neutral_weight_num_dict,
    -1: negative_weight_num_dict,
}

for year_month in year_month_key_list:
    # The sentiment list drives the iteration; the other per-month lists are
    # read at the same position.
    for position, label in enumerate(sentiment_total_dict[year_month]):
        retweets = retweet_total_dict[year_month][position]
        favorites = favorite_total_dict[year_month][position]
        # Link-stripped text; text_total_dict_no_RE holds the unmodified text.
        body = text_total_dict[year_month][position]

        engagement = retweets + favorites

        target = buckets_by_label.get(label)
        if target is None:
            print('sentiment cannot be other nunmber besides 1, 0, -1')
            continue

        # Create the list for this weight on first use, then record the tweet.
        target[year_month].setdefault(engagement, []).append((favorites, retweets, body))

print('positive_weight_num_dict: ', positive_weight_num_dict)
print('neutral_weight_num_dict: ', neutral_weight_num_dict)
print('negative_weight_num_dict: ', negative_weight_num_dict)

positive_weight_num_dict:  {'2017_12': {0: [(0, 0, 'Cool. '), (0, 0, 'Cool! '), (0, 0, 'N e w S t y l e  F e a t u r e ! . .\nGo check on my website like in bio ..\nFeaturing:… '), (0, 0, 'Wow I forgot to buy Chile Serrano'), (0, 0, 'We’re in a rich hipster bar'), (0, 0, '@_mrcarmack on @Slicr_App Now 50% off. Get the Carmack pack in app for free. \n•\n•\n•\n\n#samples… '), (0, 0, '@VoicesofBay @grlreporter Cool relics of Americana.'), (0, 0, 'At urban pharm smoking good'), (0, 0, '@evanderrick @VanRyderGames @Cephalofair @Avett_Fan Did he smile a cigarette after?'), (0, 0, '@katetscott @pac12 And @Pac12Network commentators 😂'), (0, 0, '@LeslieFulbright Cool! 😎😎😎'), (0, 0, 'Is it safe to Vape? #like '), (0, 0, 'Just like orange juice. Yum! - Drinking a Major Hazer by @cellarmakerbeer at @cellarmakerbeer  —  #photo'), (0, 0, '@diegoescosteguy @AnaPaulaVolei Ela está muito bem e saudável - congratulations!'), (0, 0, 'I️ want to d i e e e e e'), (0, 0, 'Baaaaa ha ha ha ha ha ha ha ha ha h

In [12]:
def get_most_weight_tweets(input_dict):
    """Print every tweet in ``input_dict``, highest weight first.

    Args:
        input_dict: mapping of weight -> list of tweet entries. Entries that
            share a weight are printed in their stored order.

    Each entry is printed on its own line, prefixed with ``-``. Nothing is
    returned.
    """
    ranked = sorted(input_dict.items(), key=lambda pair: pair[0], reverse=True)
    for _weight, entries in ranked:
        for entry in entries:
            print('-', entry)
        
# Print the weight-ranked tweets of one sentiment class for every month.
# Swap in positive_weight_num_dict or neutral_weight_num_dict to inspect the
# other classes.
for year_month in year_month_key_list:
    get_most_weight_tweets(negative_weight_num_dict[year_month])

- (309, 13, 'Getting my 4th Keytruda infusion right now- gonna beat this cancer!')
- (47, 4, '@TopherSpiro they’re like arsonists bitching about smoke damage')
- (17, 2, '2018: more veggies, more books, more water, no cigarettes, no checking on exes. No excuses. Time to grow and move forward y’all.')
- (13, 0, 'i’d rather kill myself drinking hard alcohol than sip beer')
- (7, 6, 'fuck cancer')
- (7, 1, 'People been smoking weed for thousands of years and it it killed nobody yet.\nMark my words; that flavored Vape shit… ')
- (4, 3, 'Y’all be too fucking depressed on the TL!\nGo smoke and listen to some music!')
- (6, 0, 'Been back in SF 2 hours and already watched a woman slam a glass of wine and knock a dude out in a dive bar followe… ')
- (4, 0, '♬ DEATH GUILD at DNA Lounge starting now!  ')
- (4, 0, 'Cigarettes r disgusting ew')
- (4, 0, 'I’m a drunk ass mess outside this bar tbh')
- (2, 1, 'if I can’t get cigarettes after sex tickets next week ima kill myself')
- (3, 0, 'LIT 🔥 16 #

### Debug - date after March, 2017 

In [13]:
def get_tweet_retweet_favorite_temp(tweet_id):
    """Debug variant of the retweet/favorite lookup with no error handling.

    Unlike get_tweet_retweet_favorite_by_id, any exception raised by the API
    call or by a missing count field propagates to the caller, so the real
    failure is visible.

    Args:
        tweet_id: id of the tweet, passed straight to ``api.get_status``.

    Returns:
        A ``(retweet_count, favorite_count)`` tuple.
    """
    payload = api.get_status(tweet_id)._json
    retweet_count = payload['retweet_count']
    favorite_count = payload['favorite_count']
    return retweet_count, favorite_count

# Debug probe: look up the first parseable tweet of one month with the
# non-catching helper, so that any API error surfaces instead of being
# reported as 0 favorites.
for year_month in ['2017_04']:
    # e-cigarette
    file_name = 'juliana_SF_allECigarette_'+year_month+'_categoryKeyWord.json'

    # tobacco
    # file_name = 'juliana_allSF_tobacco_'+year_month+'.json'

    # flavored tobacco
    # file_name = 'juliana_allSF_flavored_tobacco_'+year_month+'.json'

    file = file_path + file_name
    with open(file, "r") as f:
        everyLines = f.readlines()
        num_of_tweets = len(everyLines)
        print('length of tweets in month %s: %d' % (year_month, num_of_tweets))
        for line in everyLines:
            try:
                lineInJson = json.loads(line)
            except json.JSONDecodeError:
                print('Cannot load the line: ', line)
                # Move on to the next line. Falling through would query
                # whatever lineInJson an earlier cell or iteration left
                # behind, which defeats the purpose of the probe.
                continue

            # get retweet and favorite
            id_inLine = lineInJson['id']
            retweet_inLine, favorite_inLine = get_tweet_retweet_favorite_temp(id_inLine)
            print('favorite_inLine: ', favorite_inLine)
            # One successful lookup is enough for this check.
            break

length of tweets in month 2017_04: 59
favorite_inLine:  0


The Twitter API accessed through Tweepy is rate-limited, and we may exceed the limit within each 15-minute window.