In [1]:
import os
import json
import pandas as pd
import numpy as np
import requests
import time

import tweepy

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 
import re

from __private import CONSUMER_API_KEY
from __private import CONSUMER_API_KEY_SECRET
from __private import ACCESS_TOKEN
from __private import ACCESS_TOKEN_SECRET

In [2]:
# tweepy API
# Build one authenticated client from the four credentials imported from the
# `__private` module; the tweet lookup helpers in this notebook read it
# through the module-level name `api`.
auth = tweepy.OAuthHandler(CONSUMER_API_KEY, CONSUMER_API_KEY_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
api = tweepy.API(auth)
    
def get_tweet_retweet_favorite_by_id(tweet_id):
    """Look up the retweet and favorite counts of a single tweet.

    This is a best-effort lookup: when the status cannot be fetched or its
    payload lacks the expected keys, both counts are reported as 0 instead of
    raising.

    Args:
        tweet_id: id of the tweet, passed unchanged to ``api.get_status``.

    Returns:
        A ``(retweet_count, favorite_count)`` tuple, or ``(0, 0)`` on failure.
    """
    try:
        tweet = api.get_status(tweet_id)._json
        return tweet['retweet_count'], tweet['favorite_count']
    except Exception:
        # Deliberately broad so one failing tweet never aborts a long batch,
        # but no longer a bare `except:` -- KeyboardInterrupt and SystemExit
        # must still be able to stop the loop that calls this helper.
        return 0, 0

# get_tweet_retweet_favorite_by_id(726061616869961728)

In [3]:
# sentiment analysis
# One shared VADER analyzer instance; analyzeSentimentByVader_one_sentence
# reads it through the module-level name `analyzer`.
analyzer = SentimentIntensityAnalyzer()

def analyzeSentimentByVader_one_sentence(sentence):
    """Map the VADER compound score of `sentence` to a sentiment label.

    Returns:
         1  when compound >= 0.05            (positive)
         0  when -0.05 < compound < 0.05     (neutral)
        -1  when compound <= -0.05           (negative)
        -2  when none of the comparisons hold (should not happen)
    """
    compound = analyzer.polarity_scores(sentence)['compound']

    if compound >= 0.05:
        return 1  # positive
    if -0.05 < compound <= 0.05:
        return 0  # neutral
    if compound <= -0.05:
        return -1  # negative

    # Only reachable when the score is not comparable (e.g. NaN).
    return -2

In [9]:
# Run configuration.
# Full list of months kept for reference; it is commented out and the run is
# restricted to a single month below.
# year_month_key_list = ['2016_04','2016_05','2016_06','2016_07','2016_08','2016_09','2016_10','2016_11','2016_12',
#                        '2017_01','2017_02', '2017_03','2017_04','2017_05','2017_06','2017_07','2017_08','2017_09',
#                        '2017_10','2017_11','2017_12', '2018_01', '2018_02', '2018_03', '2018_04', '2018_05', 
#                        '2018_06', '2018_07', '2018_08', '2018_09', '2018_10', '2018_11', '2018_12', '2019_01', 
#                        '2019_02', '2019_03', '2019_04' ]
# Months to process, as 'YYYY_MM' keys; each key is spliced into the input
# file name by the loading cells.
year_month_key_list = ['2017_10']
# Directory prefix of the input files; file names are appended by plain string
# concatenation, so the trailing slash is required.
file_path = "/mnt/volume-5T/result/"

In [10]:
# Per-month accumulators keyed by 'YYYY_MM'. Within one month the five lists
# are parallel: position i in every list describes the same tweet. The next
# cell relies on that alignment, so a tweet is either appended to all of them
# or to none.
text_total_dict = {}         # tweet text with t.co links removed
text_total_dict_no_RE = {}   # tweet text exactly as stored in the file
sentiment_total_dict = {}    # 1 / 0 / -1 from analyzeSentimentByVader_one_sentence
retweet_total_dict = {}      # retweet counts from the API lookup
favorite_total_dict = {}     # favorite counts from the API lookup

for year_month in year_month_key_list:
    # Wait for 20 mins
    # time.sleep(1200)

    # Exactly one of the following input files is active per run.
    # e-cigarette
    # file_name = 'juliana_SF_allECigarette_'+year_month+'_categoryKeyWord.json'

    # tobacco
    # file_name = 'juliana_allSF_tobacco_'+year_month+'.json'

    # flavored tobacco
    file_name = 'juliana_allSF_flavored_tobacco_'+year_month+'.json'

    file = file_path + file_name

    # initialization
    text_total_dict[year_month] = []
    text_total_dict_no_RE[year_month] = []
    sentiment_total_dict[year_month] = []
    retweet_total_dict[year_month] = []
    favorite_total_dict[year_month] = []

    # Read everything up front so the file handle is not held open while the
    # (slow) per-tweet API lookups run.
    with open(file, "r") as f:
        everyLines = f.readlines()
    num_of_tweets = len(everyLines)
    print('length of tweets in month %s: %d' % (year_month, num_of_tweets))

    for line in everyLines:
        try:
            lineInJson = json.loads(line)
        except ValueError:
            # json.JSONDecodeError is a ValueError. Skip the line: falling
            # through would reuse the previous tweet's data, or raise a
            # NameError if this is the first line.
            print('Cannot load the line: ', line)
            continue

        text_inLine = lineInJson["text"]
        # Strip t.co short links (the prefix plus the 10 characters after it).
        # Raw string: same pattern as before, without invalid escape sequences.
        text_afterRE = re.sub(r'https?:\/\/t\.co\/[\s\S]{10}', '', text_inLine)

        # get sentiment
        sentiment_sentence = analyzeSentimentByVader_one_sentence(text_afterRE)
        if sentiment_sentence == -2:
            # Skip the whole tweet; dropping only the sentiment entry would
            # shift every later index of the parallel lists.
            print('Something wrong. The sentiment cannot be -2')
            continue

        # get retweet and favorite
        id_inLine = lineInJson['id']
        retweet_inLine, favorite_inLine = get_tweet_retweet_favorite_by_id(id_inLine)

        # Append to all five lists together to keep them aligned.
        text_total_dict_no_RE[year_month].append(text_inLine)
        text_total_dict[year_month].append(text_afterRE)
        sentiment_total_dict[year_month].append(sentiment_sentence)
        retweet_total_dict[year_month].append(retweet_inLine)
        favorite_total_dict[year_month].append(favorite_inLine)

print('length of text_total_dict: ', len(text_total_dict))
print('length of sentiment_total_dict: ', len(sentiment_total_dict))
print('sentiment_total_dict: ', sentiment_total_dict)
print('retweet_total_dict: ', retweet_total_dict)
print('favorite_total_dict: ', favorite_total_dict)

length of tweets in month 2017_10: 27
length of text_total_dict:  1
length of sentiment_total_dict:  1
sentiment_total_dict:  {'2017_10': [1, -1, -1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, -1, 1, 0]}
retweet_total_dict:  {'2017_10': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]}
favorite_total_dict:  {'2017_10': [0, 0, 5, 0, 0, 10, 1, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, 0, 3, 0, 0, 0, 1, 11, 0, 3, 0]}


### Get the tweets with the most favorites and retweets

In [11]:
# Bucket every tweet by sentiment and by engagement weight.
# Resulting layout of each dict:
#     {year_month: {weight: [(favorite, retweet, tweet_text), ...]}}
# where weight = retweet + favorite.
positive_weight_num_dict = {year_month: {} for year_month in year_month_key_list}
neutral_weight_num_dict = {year_month: {} for year_month in year_month_key_list}
negative_weight_num_dict = {year_month: {} for year_month in year_month_key_list}

# Sentiment label -> dict that collects tweets carrying that label.
weight_dict_by_sentiment = {
    1: positive_weight_num_dict,
    0: neutral_weight_num_dict,
    -1: negative_weight_num_dict,
}

for year_month in year_month_key_list:
    # The per-month lists are parallel (same index == same tweet), so walk
    # them together. Use text_total_dict_no_RE instead of text_total_dict to
    # report the text with its original links.
    tweets_in_month = zip(
        sentiment_total_dict[year_month],
        retweet_total_dict[year_month],
        favorite_total_dict[year_month],
        text_total_dict[year_month],
    )
    for sentiment, retweet, favorite, tweet_text in tweets_in_month:
        target_dict = weight_dict_by_sentiment.get(sentiment)
        if target_dict is None:
            print('sentiment cannot be other number besides 1, 0, -1')
            continue

        weight = retweet + favorite
        target_dict[year_month].setdefault(weight, []).append(
            (favorite, retweet, tweet_text)
        )

print('positive_weight_num_dict: ', positive_weight_num_dict)
print('neutral_weight_num_dict: ', neutral_weight_num_dict)
print('negative_weight_num_dict: ', negative_weight_num_dict)

positive_weight_num_dict:  {'2017_10': {0: [(0, 0, 'Very cherry with good tart - Drinking a Vape Tricks by @Prairieales @ Archive Bar and Kitchen  — '), (0, 0, 'It was cool seeing the sun in a orange hue because of the smoke.'), (0, 0, '@garrytan It almost sounds like cigarettes or tinted moisturizer now (CC, BB cream)'), (0, 0, "Starting my Sunday morning with coffee and @TheDefeatedPod, 24 and sunny out, squad reunited, it's all good 🌈 ")], 4: [(3, 1, 'I’d love to buy all my favorite cops a coffee tomorrow #SFPD ')], 1: [(1, 0, 'Morning workout, free coffee &amp; bfast, and a nice walk downtown...I’m loving today')], 11: [(11, 0, 'Drinking a large coffee at 10pm Pacific wasn’t the brightest idea.')], 3: [(3, 0, '@DianePodcast Lynch directing on set with a cigarette, megaphone, and fire extinguisher is a wonderfully evocative image. #TwinPeaks')]}}
neutral_weight_num_dict:  {'2017_10': {0: [(0, 0, 'calamansi juice &gt;&gt;&gt;&gt;&gt; lemonade 🍋'), (0, 0, 'Earlier chilling in hotel ro

In [12]:
def get_most_weight_tweets(input_dict):
    """Print every tweet entry of `input_dict`, highest weight first.

    Args:
        input_dict: mapping of weight -> list of tweet entries. Entries that
            share a weight are printed in their stored order.

    Returns:
        None. Each entry is printed on its own line, prefixed with '-'.
    """
    weights_high_to_low = sorted(input_dict.keys(), reverse=True)
    for weight in weights_high_to_low:
        entries = input_dict[weight]
        for entry in entries:
            print('-', entry)
        
# List the positive tweets of every configured month, highest weight first.
# Swap in one of the commented calls to inspect the other sentiment classes.
for year_month_index, year_month in enumerate(year_month_key_list):
    get_most_weight_tweets(positive_weight_num_dict[year_month])
    # get_most_weight_tweets(neutral_weight_num_dict[year_month])
    # get_most_weight_tweets(negative_weight_num_dict[year_month])

- (11, 0, 'Drinking a large coffee at 10pm Pacific wasn’t the brightest idea.')
- (3, 1, 'I’d love to buy all my favorite cops a coffee tomorrow #SFPD ')
- (3, 0, '@DianePodcast Lynch directing on set with a cigarette, megaphone, and fire extinguisher is a wonderfully evocative image. #TwinPeaks')
- (1, 0, 'Morning workout, free coffee &amp; bfast, and a nice walk downtown...I’m loving today')
- (0, 0, 'Very cherry with good tart - Drinking a Vape Tricks by @Prairieales @ Archive Bar and Kitchen  — ')
- (0, 0, 'It was cool seeing the sun in a orange hue because of the smoke.')
- (0, 0, '@garrytan It almost sounds like cigarettes or tinted moisturizer now (CC, BB cream)')
- (0, 0, "Starting my Sunday morning with coffee and @TheDefeatedPod, 24 and sunny out, squad reunited, it's all good 🌈 ")


### Debug - dates after March 2017

In [13]:
def get_tweet_retweet_favorite_temp(tweet_id):
    """Debug variant of the tweet lookup that lets every error propagate.

    There is no try/except here, so a failing ``api.get_status`` call or a
    missing key raises instead of being turned into zero counts.

    Args:
        tweet_id: id of the tweet, passed unchanged to ``api.get_status``.

    Returns:
        A ``(retweet_count, favorite_count)`` tuple.
    """
    payload = api.get_status(tweet_id)._json
    retweet_count = payload['retweet_count']
    favorite_count = payload['favorite_count']
    return retweet_count, favorite_count

# Debug run: look up only the first parsable tweet of one month, using the
# lookup variant that does not hide API errors.
for year_month in ['2017_04']:
    # e-cigarette
    file_name = 'juliana_SF_allECigarette_'+year_month+'_categoryKeyWord.json'

    # tobacco
    # file_name = 'juliana_allSF_tobacco_'+year_month+'.json'

    # flavored tobacco
    # file_name = 'juliana_allSF_flavored_tobacco_'+year_month+'.json'

    file = file_path + file_name
    # Read first and close the file before calling the API.
    with open(file, "r") as f:
        everyLines = f.readlines()
    num_of_tweets = len(everyLines)
    print('length of tweets in month %s: %d' % (year_month, num_of_tweets))

    for line in everyLines:
        try:
            lineInJson = json.loads(line)
        except ValueError:
            # json.JSONDecodeError is a ValueError. Move on to the next line;
            # falling through would hit an undefined or stale `lineInJson`.
            print('Cannot load the line: ', line)
            continue

        # get retweet and favorite
        id_inLine = lineInJson['id']
        retweet_inLine, favorite_inLine = get_tweet_retweet_favorite_temp(id_inLine)
        print('favorite_inLine: ', favorite_inLine)
        # One successful lookup is enough for this check.
        break

length of tweets in month 2017_04: 59
favorite_inLine:  0


The Twitter API used through Tweepy is rate-limited. The limit applies per 15-minute window, so looking up many tweets may exceed it.