In [11]:
import os
import json
import pandas as pd
import numpy as np
import requests
import time

import tweepy

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 
import re

from __private import CONSUMER_API_KEY
from __private import CONSUMER_API_KEY_SECRET
from __private import ACCESS_TOKEN
from __private import ACCESS_TOKEN_SECRET

In [3]:
# tweepy API
# Authenticate with the credentials imported from the private module and build the
# shared client that the tweet lookup helpers read as a global.
auth = tweepy.OAuthHandler(CONSUMER_API_KEY, CONSUMER_API_KEY_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
# wait_on_rate_limit makes the client wait for the rate-limit window to reset
# instead of raising, so bulk lookups are not lost to rate-limit errors.
api = tweepy.API(auth, wait_on_rate_limit=True)
    
def get_tweet_retweet_favorite_by_id(tweet_id):
    """Look up the retweet and favorite counts of a single tweet.

    Args:
        tweet_id: ID of the tweet, passed to ``api.get_status``.

    Returns:
        A ``(retweet_count, favorite_count)`` tuple. If the lookup fails,
        ``(0, 0)`` is returned and the failure is printed so that it is not
        mistaken for a tweet that genuinely has no engagement.
    """
    try:
        tweet = api.get_status(tweet_id)._json
        return tweet['retweet_count'], tweet['favorite_count']
    except Exception as err:
        # Catch Exception instead of using a bare except so that
        # KeyboardInterrupt / SystemExit can still stop a long-running loop.
        print('Cannot fetch tweet %s: %r' % (tweet_id, err))
        return 0, 0

# get_tweet_retweet_favorite_by_id(726061616869961728)

In [4]:
# sentiment analysis
# Shared analyzer instance; analyzeSentimentByVader_one_sentence reads it as a global.
analyzer = SentimentIntensityAnalyzer()

def analyzeSentimentByVader_one_sentence(sentence):
    """Map one sentence to a sentiment class using the analyzer's compound score.

    Returns:
        1 when the compound score is >= 0.05 (positive), -1 when it is
        <= -0.05 (negative), 0 for anything strictly in between (neutral),
        and -2 as a fallback when none of the comparisons holds.
    """
    compound = analyzer.polarity_scores(sentence)['compound']

    if compound >= 0.05:
        return 1 # positive
    if -0.05 < compound <= 0.05:
        return 0 # neutral
    if compound <= -0.05:
        return -1 # negative

    return -2 # should not be in here

In [9]:
# Month keys in 'YYYY_MM' form, covering 2016_04 through 2019_04 inclusive.
year_month_key_list = [
    '%d_%02d' % (year, month)
    for year in range(2016, 2020)
    for month in range(1, 13)
    if (2016, 4) <= (year, month) <= (2019, 4)
]
# For a quick single-month run, use instead:
# year_month_key_list = ['2016_04']

# Directory prefix that the per-month JSON file names are appended to.
file_path = "/mnt/volume-5T/result/"

In [12]:
# Per-month results keyed by 'YYYY_MM'. Within one month the four lists are kept
# index-aligned: entry i of each list describes the same tweet, which is what
# the aggregation cells rely on when they index all lists with the same i.
text_total_dict = {}
sentiment_total_dict = {}
retweet_total_dict = {}
favorite_total_dict = {}

for year_month in year_month_key_list:
    # Wait for 20 mins
    time.sleep(1200)

    # e-cigarette
    file_name = 'juliana_SF_allECigarette_'+year_month+'_categoryKeyWord.json'

    # tobacco
    # file_name = 'juliana_allSF_tobacco_'+year_month+'.json'

    # flavored tobacco
    # file_name = 'juliana_allSF_flavored_tobacco_'+year_month+'.json'

    file = file_path + file_name

    # initialization
    text_total_dict[year_month] = []
    sentiment_total_dict[year_month] = []
    retweet_total_dict[year_month] = []
    favorite_total_dict[year_month] = []

    # The file holds one JSON document per line; read it fully so the tweet
    # count can be reported before processing starts.
    with open(file, "r") as f:
        everyLines = f.readlines()
    num_of_tweets = len(everyLines)
    print('length of tweets in month %s: %d' % (year_month, num_of_tweets))

    for line in everyLines:
        try:
            lineInJson = json.loads(line)
        except ValueError:
            # json.JSONDecodeError is a ValueError. Skip the line: falling
            # through would re-process the previous tweet (or hit an undefined
            # name on the first line).
            print('Cannot load the line: ', line)
            continue

        text_inLine = lineInJson["text"]
        # Remove t.co short links (prefix plus the next 10 characters). A raw
        # string avoids invalid-escape warnings; the pattern itself is unchanged.
        text_afterRE = re.sub(r'https?:\/\/t\.co\/[\s\S]{10}', '', text_inLine)

        # get sentiment
        sentiment_sentence = analyzeSentimentByVader_one_sentence(text_afterRE)
        if sentiment_sentence == -2:
            # Skip the tweet in every list; recording text/retweet/favorite
            # without a sentiment would shift all later entries out of alignment.
            print('Something wrong. The sentiment cannot be -2')
            continue

        # get retweet and favorite
        id_inLine = lineInJson['id']
        retweet_inLine, favorite_inLine = get_tweet_retweet_favorite_by_id(id_inLine)

        # Append to all four lists together so they stay index-aligned.
        text_total_dict[year_month].append(text_afterRE)
        sentiment_total_dict[year_month].append(sentiment_sentence)
        retweet_total_dict[year_month].append(retweet_inLine)
        favorite_total_dict[year_month].append(favorite_inLine)

print('length of text_total_dict: ', len(text_total_dict))
print('length of sentiment_total_dict: ', len(sentiment_total_dict))
print('sentiment_total_dict: ', sentiment_total_dict)
print('retweet_total_dict: ', retweet_total_dict)
print('favorite_total_dict: ', favorite_total_dict)

length of tweets in month 2016_04: 132
length of tweets in month 2016_05: 148
length of tweets in month 2016_06: 127
length of tweets in month 2016_07: 139
length of tweets in month 2016_08: 107
length of tweets in month 2016_09: 128
length of tweets in month 2016_10: 119
length of tweets in month 2016_11: 115
length of tweets in month 2016_12: 127
length of tweets in month 2017_01: 59
length of tweets in month 2017_02: 65
length of tweets in month 2017_03: 90
length of tweets in month 2017_04: 59
length of tweets in month 2017_05: 67
length of tweets in month 2017_06: 88
length of tweets in month 2017_07: 74
length of tweets in month 2017_08: 89
length of tweets in month 2017_09: 47
length of tweets in month 2017_10: 50
length of tweets in month 2017_11: 76
length of tweets in month 2017_12: 122
length of tweets in month 2018_01: 111
length of tweets in month 2018_02: 103
length of tweets in month 2018_03: 102
length of tweets in month 2018_04: 99
length of tweets in month 2018_05: 81

### Get the total number of retweets and favorites per sentiment class for each month (the cell below computes sums; averages for the Overleaf table must be derived from them)

In [13]:
# One running total per month (same order as year_month_key_list) for each
# sentiment class: 1 = positive, 0 = neutral, -1 = negative.
positive_retweet_num_list = [0] * len(year_month_key_list)
neutral_retweet_num_list = [0] * len(year_month_key_list)
negative_retweet_num_list = [0] * len(year_month_key_list)

positive_favorite_num_list = [0] * len(year_month_key_list)
neutral_favorite_num_list = [0] * len(year_month_key_list)
negative_favorite_num_list = [0] * len(year_month_key_list)

for month_pos, month_key in enumerate(year_month_key_list):
    # The per-month lists are read with a shared position, so entry `row` of
    # each list is treated as the same tweet.
    for row, label in enumerate(sentiment_total_dict[month_key]):
        retweets = retweet_total_dict[month_key][row]
        favorites = favorite_total_dict[month_key][row]

        if label == 1:
            positive_retweet_num_list[month_pos] += retweets
            positive_favorite_num_list[month_pos] += favorites
        elif label == 0:
            neutral_retweet_num_list[month_pos] += retweets
            neutral_favorite_num_list[month_pos] += favorites
        elif label == -1:
            negative_retweet_num_list[month_pos] += retweets
            negative_favorite_num_list[month_pos] += favorites
        else:
            print('sentiment cannot be other nunmber besides 1, 0, -1')

print('positive_retweet_num_list: ', positive_retweet_num_list)
print('neutral_retweet_num_list: ', neutral_retweet_num_list)
print('negative_retweet_num_list: ', negative_retweet_num_list)

print('positive_favorite_num_list: ', positive_favorite_num_list)
print('neutral_favorite_num_list: ', neutral_favorite_num_list)
print('negative_favorite_num_list: ', negative_favorite_num_list)

positive_retweet_num_list:  [15, 13, 25, 17, 7, 7, 11, 4, 30, 15, 7, 11, 19, 9, 0, 5, 7, 1, 1, 3, 2, 35, 20, 5, 17, 7, 13, 12, 14, 11, 2, 149, 21, 2, 10, 3, 7]
neutral_retweet_num_list:  [26, 15, 41, 9, 6, 7, 5, 3, 3, 6, 7, 15, 3, 4, 2, 8, 35, 0, 1, 18, 5, 5, 21, 9, 67, 4, 83, 9, 11, 101, 116, 136, 25, 11, 43, 8, 3]
negative_retweet_num_list:  [0, 5, 3, 5, 0, 16, 0, 5, 19, 0, 0, 5, 0, 0, 4, 4, 0, 2, 18, 0, 2, 4, 1, 6, 0, 6, 23, 0, 0, 3, 1, 54, 8, 3, 7, 2, 31]
positive_favorite_num_list:  [60, 52, 74, 105, 43, 161, 64, 67, 355, 58, 20, 112, 66, 40, 38, 40, 102, 11, 26, 50, 80, 148, 265, 89, 87, 73, 232, 91, 183, 114, 51, 680, 150, 82, 129, 168, 161]
neutral_favorite_num_list:  [118, 158, 147, 64, 90, 72, 48, 51, 137, 31, 30, 49, 64, 36, 96, 29, 116, 13, 23, 166, 54, 42, 131, 63, 367, 36, 555, 157, 98, 1358, 895, 565, 90, 246, 206, 75, 99]
negative_favorite_num_list:  [15, 64, 31, 22, 5, 103, 7, 71, 82, 8, 19, 38, 7, 0, 43, 48, 11, 4, 67, 24, 19, 34, 46, 82, 17, 46, 84, 36, 60, 39, 43, 2

### weight = retweet + favorite

In [14]:
# Monthly engagement weight (retweets + favorites) summed per sentiment class,
# one slot per entry of year_month_key_list.
positive_weight_num_list = [0] * len(year_month_key_list)
neutral_weight_num_list = [0] * len(year_month_key_list)
negative_weight_num_list = [0] * len(year_month_key_list)

for month_pos, month_key in enumerate(year_month_key_list):
    for row, label in enumerate(sentiment_total_dict[month_key]):
        # Weight of a single tweet: its retweet count plus its favorite count.
        tweet_weight = retweet_total_dict[month_key][row] + favorite_total_dict[month_key][row]
        if label == 1:
            positive_weight_num_list[month_pos] += tweet_weight
        elif label == 0:
            neutral_weight_num_list[month_pos] += tweet_weight
        elif label == -1:
            negative_weight_num_list[month_pos] += tweet_weight
        else:
            print('sentiment cannot be other nunmber besides 1, 0, -1')

print('positive_weight_num_list: ', positive_weight_num_list)
print('neutral_weight_num_list: ', neutral_weight_num_list)
print('negative_weight_num_list: ', negative_weight_num_list)

positive_weight_num_list:  [75, 65, 99, 122, 50, 168, 75, 71, 385, 73, 27, 123, 85, 49, 38, 45, 109, 12, 27, 53, 82, 183, 285, 94, 104, 80, 245, 103, 197, 125, 53, 829, 171, 84, 139, 171, 168]
neutral_weight_num_list:  [144, 173, 188, 73, 96, 79, 53, 54, 140, 37, 37, 64, 67, 40, 98, 37, 151, 13, 24, 184, 59, 47, 152, 72, 434, 40, 638, 166, 109, 1459, 1011, 701, 115, 257, 249, 83, 102]
negative_weight_num_list:  [15, 69, 34, 27, 5, 119, 7, 76, 101, 8, 19, 43, 7, 0, 47, 52, 11, 6, 85, 24, 21, 38, 47, 88, 17, 52, 107, 36, 60, 42, 44, 339, 58, 82, 87, 31, 215]


### Debug - dates after March 2017

In [61]:
def get_tweet_retweet_favorite_temp(tweet_id):
    """Debug variant of the tweet lookup with no error handling.

    Fetches the tweet through the shared ``api`` client and returns
    ``(retweet_count, favorite_count)``. Any exception raised by the lookup
    propagates to the caller, so the underlying failure is visible.
    """
    payload = api.get_status(tweet_id)._json
    return payload['retweet_count'], payload['favorite_count']

# Debug run: look up the first parsable tweet of one month with the unguarded
# helper, so that any API error surfaces instead of being turned into zeros.
for year_month in ['2017_04']:
    # e-cigarette
    file_name = 'juliana_SF_allECigarette_'+year_month+'_categoryKeyWord.json'

    # tobacco
    # file_name = 'juliana_allSF_tobacco_'+year_month+'.json'

    # flavored tobacco
    # file_name = 'juliana_allSF_flavored_tobacco_'+year_month+'.json'

    file = file_path + file_name
    with open(file, "r") as f:
        everyLines = f.readlines()
    num_of_tweets = len(everyLines)
    print('length of tweets in month %s: %d' % (year_month, num_of_tweets))

    for line in everyLines:
        try:
            lineInJson = json.loads(line)
        except ValueError:
            # json.JSONDecodeError is a ValueError. Move on to the next line;
            # falling through would use an undefined or stale record.
            print('Cannot load the line: ', line)
            continue

        # get retweet and favorite
        id_inLine = lineInJson['id']
        retweet_inLine, favorite_inLine = get_tweet_retweet_favorite_temp(id_inLine)
        print('favorite_inLine: ', favorite_inLine)
        # Only one successful lookup is needed for this check.
        break

length of tweets in month 2017_04: 59
favorite_inLine:  0


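The Twitter API used through Tweepy is rate-limited. Bulk lookups can exceed the limit within a 15-minute window, in which case `get_tweet_retweet_favorite_by_id` falls back to 0 retweets and 0 favorites for the affected tweets.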

### e-cig weight list

In [60]:
# Hard-coded e-cigarette weight totals per sentiment class.
# NOTE(review): presumably one entry per month of year_month_key_list (37 values
# each), pasted from a run of the weight cell - confirm the source run.
ecig_positive_weight_list = [
    75, 66, 99, 122, 59, 168, 75, 71, 388, 73,
    27, 123, 85, 49, 38, 46, 109, 12, 27, 53,
    82, 183, 285, 94, 104, 80, 245, 103, 197, 125,
    53, 830, 171, 84, 139, 171, 168,
]
ecig_neutral_weight_list = [
    144, 173, 188, 73, 96, 79, 53, 54, 140, 37,
    37, 64, 67, 40, 98, 37, 151, 13, 24, 184,
    59, 47, 152, 72, 434, 40, 638, 166, 109, 1459,
    1010, 701, 115, 257, 250, 83, 102,
]
ecig_negative_weight_list = [
    15, 69, 34, 27, 5, 119, 7, 76, 101, 8,
    19, 43, 7, 0, 47, 52, 11, 6, 85, 24,
    21, 37, 47, 88, 17, 52, 107, 36, 60, 42,
    44, 339, 58, 82, 87, 31, 215,
]

# Sanity check: report the length of each list.
print('length of ecig_positive_weight_list: ', len(ecig_positive_weight_list))
print('length of ecig_neutral_weight_list: ', len(ecig_neutral_weight_list))
print('length of ecig_negative_weight_list: ', len(ecig_negative_weight_list))

length of ecig_positive_weight_list:  37
length of ecig_neutral_weight_list:  37
length of ecig_negative_weight_list:  37
