In [11]:
import os
import json
import pandas as pd
import numpy as np
import requests
import time

import tweepy

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 
import re

from __private import CONSUMER_API_KEY
from __private import CONSUMER_API_KEY_SECRET
from __private import ACCESS_TOKEN
from __private import ACCESS_TOKEN_SECRET

In [4]:
# tweepy API
# Authenticate with the credentials imported from the private module and build
# the API client that the tweet-lookup helpers in this notebook share.
auth = tweepy.OAuthHandler(CONSUMER_API_KEY, CONSUMER_API_KEY_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
# wait_on_rate_limit=True makes tweepy wait for the rate limit to replenish
# instead of raising. Without it, a rate-limit error raised inside
# get_tweet_retweet_favorite_by_id is swallowed and recorded as (0, 0),
# which silently under-counts retweets and favorites.
api = tweepy.API(auth, wait_on_rate_limit=True)
    
def get_tweet_retweet_favorite_by_id(tweet_id):
    """Look up a tweet and return its retweet and favorite counts.

    Args:
        tweet_id: ID of the tweet, passed to ``api.get_status``.

    Returns:
        tuple: ``(retweet_count, favorite_count)`` read from the status JSON,
        or ``(0, 0)`` when the lookup raises any ordinary exception.
    """
    try:
        tweet = api.get_status(tweet_id)._json
        return tweet['retweet_count'], tweet['favorite_count']
    except Exception:
        # Best-effort lookup: a failed request counts as zero engagement.
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still stop a long-running loop.
        return 0, 0

# get_tweet_retweet_favorite_by_id(726061616869961728)

In [5]:
# sentiment analysis
# Module-level VADER analyzer instance; analyzeSentimentByVader_one_sentence
# calls analyzer.polarity_scores() on it for every sentence.
analyzer = SentimentIntensityAnalyzer()

def analyzeSentimentByVader_one_sentence(sentence):
    """Classify one sentence from its VADER compound score.

    Args:
        sentence: Text passed to ``analyzer.polarity_scores``.

    Returns:
        int: ``1`` when compound >= 0.05 (positive), ``-1`` when
        compound <= -0.05 (negative), ``0`` for scores strictly in between
        (neutral), and ``-2`` if none of the comparisons hold.
    """
    compound = analyzer.polarity_scores(sentence)['compound']

    if compound >= 0.05:
        label = 1  # positive
    elif -0.05 < compound <= 0.05:
        label = 0  # neutral
    elif compound <= -0.05:
        label = -1  # negative
    else:
        label = -2  # should not be in here
    return label

In [12]:
# Month keys in 'YYYY_MM' form, in chronological order from 2016_04 through
# 2019_04 inclusive (37 months). They are used to build the input file names.
year_month_key_list = [
    '%d_%02d' % (year, month)
    for year in range(2016, 2020)
    for month in range(1, 13)
    if (2016, 4) <= (year, month) <= (2019, 4)
]
# year_month_key_list = ['2016_04']
# Directory prefix that is concatenated with each monthly file name.
file_path = "/mnt/volume-5T/result/"

In [13]:
def get_weight(year_month):
    """Sum retweet + favorite counts per sentiment class for one month.

    Reads the module-level ``sentiment_total_dict``, ``retweet_total_dict``
    and ``favorite_total_dict`` and pairs their entries for ``year_month``
    by position.

    Args:
        year_month: Key into the three per-month dictionaries.

    Returns:
        tuple: ``(positive_weight, neutral_weight, negative_weight)``.
        The three totals are also printed.
    """
    pos_total = 0
    neu_total = 0
    neg_total = 0

    for idx, label in enumerate(sentiment_total_dict[year_month]):
        # A tweet's weight is its combined retweet and favorite count.
        engagement = (retweet_total_dict[year_month][idx]
                      + favorite_total_dict[year_month][idx])
        if label == 1:
            pos_total += engagement
        elif label == 0:
            neu_total += engagement
        elif label == -1:
            neg_total += engagement
        else:
            print('sentiment cannot be other nunmber besides 1, 0, -1')

    print('positive_weight: ', pos_total)
    print('neutral_weight: ', neu_total)
    print('negative_weight: ', neg_total)
    return pos_total, neu_total, neg_total

In [14]:
# Per-month accumulators keyed by 'YYYY_MM'. Within one month the four lists
# are index-aligned: entry i of each list describes the same tweet, which is
# what get_weight relies on when it pairs sentiments with counts by position.
text_total_dict = {}
sentiment_total_dict = {}
retweet_total_dict = {}
favorite_total_dict = {}

for year_month in year_month_key_list:

    # Wait for 20 mins
    time.sleep(1200)

    # e-cigarette
#     file_name = 'juliana_SF_allECigarette_'+year_month+'_categoryKeyWord.json'

    # tobacco
    file_name = 'juliana_allSF_tobacco_'+year_month+'.json'

    # flavored tobacco
#     file_name = 'juliana_allSF_flavored_tobacco_'+year_month+'.json'

    file = file_path + file_name

    # initialization
    text_total_dict[year_month] = []
    sentiment_total_dict[year_month] = []
    retweet_total_dict[year_month] = []
    favorite_total_dict[year_month] = []

    # Read the whole file up front so it is closed before the slow API calls.
    with open(file, "r") as f:
        everyLines = f.readlines()
    num_of_tweets = len(everyLines)
    print('length of tweets in month %s: %d' % (year_month, num_of_tweets))

    for line in everyLines:
        try:
            lineInJson = json.loads(line)
        except ValueError:
            # json.JSONDecodeError is a ValueError. Skip the line: carrying on
            # would reuse the previous tweet's data, or raise NameError if
            # this is the first line.
            print('Cannot load the line: ', line)
            continue

        text_inLine = lineInJson["text"]
        # Strip t.co short links (the prefix plus the next 10 characters).
        text_afterRE = re.sub(r'https?:\/\/t\.co\/[\s\S]{10}', '', text_inLine)

        # get sentiment
        sentiment_sentence = analyzeSentimentByVader_one_sentence(text_afterRE)
        if sentiment_sentence == -2:
            # Skip the tweet entirely so the four lists stay index-aligned.
            print('Something wrong. The sentiment cannot be -2')
            continue

        # get retweet and favorite
        id_inLine = lineInJson['id']
        retweet_inLine, favorite_inLine = get_tweet_retweet_favorite_by_id(id_inLine)

        # Append to all four lists together, only once every value is known.
        text_total_dict[year_month].append(text_afterRE)
        sentiment_total_dict[year_month].append(sentiment_sentence)
        retweet_total_dict[year_month].append(retweet_inLine)
        favorite_total_dict[year_month].append(favorite_inLine)

    get_weight(year_month)

#     break

print('length of text_total_dict: ', len(text_total_dict))
print('length of sentiment_total_dict: ', len(sentiment_total_dict))
print('sentiment_total_dict: ', sentiment_total_dict)
print('retweet_total_dict: ', retweet_total_dict)
print('favorite_total_dict: ', favorite_total_dict)

length of tweets in month 2016_04: 907
positive_weight:  6784
neutral_weight:  747
negative_weight:  502
length of tweets in month 2016_05: 930
positive_weight:  301
neutral_weight:  800
negative_weight:  299
length of tweets in month 2016_06: 974
positive_weight:  309
neutral_weight:  810
negative_weight:  221
length of tweets in month 2016_07: 899
positive_weight:  261
neutral_weight:  535
negative_weight:  165
length of tweets in month 2016_08: 888
positive_weight:  240
neutral_weight:  1181
negative_weight:  785
length of tweets in month 2016_09: 710
positive_weight:  216
neutral_weight:  1386
negative_weight:  216
length of tweets in month 2016_10: 775
positive_weight:  284
neutral_weight:  611
negative_weight:  253
length of tweets in month 2016_11: 731
positive_weight:  259
neutral_weight:  1606
negative_weight:  341
length of tweets in month 2016_12: 663
positive_weight:  117
neutral_weight:  557
negative_weight:  104
length of tweets in month 2017_01: 626
positive_weight:  176

### weight = retweet + favorite

In [15]:
# One running total per month for each sentiment class; position k in each
# list belongs to year_month_key_list[k].
positive_weight_num_list = [0] * len(year_month_key_list)
neutral_weight_num_list = [0] * len(year_month_key_list)
negative_weight_num_list = [0] * len(year_month_key_list)

for year_month_index, year_month in enumerate(year_month_key_list):
    for i, sentiment in enumerate(sentiment_total_dict[year_month]):
        # A tweet's weight is its combined retweet and favorite count.
        weight = retweet_total_dict[year_month][i] + favorite_total_dict[year_month][i]
        if sentiment == 1:
            positive_weight_num_list[year_month_index] += weight
        elif sentiment == 0:
            neutral_weight_num_list[year_month_index] += weight
        elif sentiment == -1:
            negative_weight_num_list[year_month_index] += weight
        else:
            print('sentiment cannot be other nunmber besides 1, 0, -1')

print('positive_weight_num_list: ', positive_weight_num_list)
print('neutral_weight_num_list: ', neutral_weight_num_list)
print('negative_weight_num_list: ', negative_weight_num_list)

positive_weight_num_list:  [6784, 301, 309, 261, 240, 216, 284, 259, 117, 176, 142, 417, 332, 270, 245, 1375, 540, 86, 650, 231, 166, 240, 289, 169, 683, 1111, 385, 305, 174, 358, 421, 1141, 198, 165, 609, 262, 306]
neutral_weight_num_list:  [747, 800, 810, 535, 1181, 1386, 611, 1606, 557, 1297, 1227, 1632, 904, 1288, 760, 1701, 642, 1414, 852, 1983, 341, 523, 670, 655, 976, 784, 933, 4394, 787, 776, 2727, 2264, 413, 4178, 846, 494, 1598]
negative_weight_num_list:  [502, 299, 221, 165, 785, 216, 253, 341, 104, 167, 187, 309, 125, 639, 280, 3115, 329, 151, 225, 347, 482, 238, 189, 313, 433, 405, 392, 206, 340, 414, 370, 517, 336, 839, 298, 523, 1082]


### Debug - date after March, 2017 

In [61]:
def get_tweet_retweet_favorite_temp(tweet_id):
    """Debug variant of the tweet lookup with no error handling.

    Any exception raised by ``api.get_status`` or by the key lookups
    propagates to the caller.

    Args:
        tweet_id: ID of the tweet, passed to ``api.get_status``.

    Returns:
        tuple: ``(retweet_count, favorite_count)`` from the status JSON.
    """
    payload = api.get_status(tweet_id)._json
    return payload['retweet_count'], payload['favorite_count']

# Debug run: fetch the counts for the first parseable tweet of one month,
# using the lookup variant that does not swallow API errors.
for year_month in ['2017_04']:
    # e-cigarette
    file_name = 'juliana_SF_allECigarette_'+year_month+'_categoryKeyWord.json'

    # tobacco
#     file_name = 'juliana_allSF_tobacco_'+year_month+'.json'

    # flavored tobacco
#     file_name = 'juliana_allSF_flavored_tobacco_'+year_month+'.json'

    file = file_path + file_name
    with open(file, "r") as f:
        everyLines = f.readlines()
        num_of_tweets = len(everyLines)
        print('length of tweets in month %s: %d' % (year_month, num_of_tweets))
        for line in everyLines:
            try:
                lineInJson = json.loads(line)
            except ValueError:
                # json.JSONDecodeError is a ValueError. Move on to the next
                # line instead of falling through to lineInJson['id'] with an
                # undefined or stale value.
                print('Cannot load the line: ', line)
                continue

            # get retweet and favorite
            id_inLine = lineInJson['id']
            retweet_inLine, favorite_inLine = get_tweet_retweet_favorite_temp(id_inLine)
            print('favorite_inLine: ', favorite_inLine)
            # Only one successful lookup is needed for this check.
            break

length of tweets in month 2017_04: 59
favorite_inLine:  0


The Tweepy API has a rate limit, and we may exceed it within each 15-minute window.

### tobacco weight list

In [16]:
# Hard-coded per-month weight totals for the tobacco files. The numbers are the
# same as the positive/neutral/negative_weight_num_list values printed by the
# aggregation cell earlier in this notebook, so they can be used without
# re-running the slow collection loop.
# NOTE(review): presumably index k corresponds to year_month_key_list[k]
# (2016_04 .. 2019_04) — confirm before reusing.
tobacco_positive_weight_list = [6784, 301, 309, 261, 240, 216, 284, 259, 117, 176, 142, 417, 332, 270, 245, 1375, 540, 86, 650, 231, 166, 240, 289, 169, 683, 1111, 385, 305, 174, 358, 421, 1141, 198, 165, 609, 262, 306]
tobacco_neutral_weight_list = [747, 800, 810, 535, 1181, 1386, 611, 1606, 557, 1297, 1227, 1632, 904, 1288, 760, 1701, 642, 1414, 852, 1983, 341, 523, 670, 655, 976, 784, 933, 4394, 787, 776, 2727, 2264, 413, 4178, 846, 494, 1598]
tobacco_negative_weight_list = [502, 299, 221, 165, 785, 216, 253, 341, 104, 167, 187, 309, 125, 639, 280, 3115, 329, 151, 225, 347, 482, 238, 189, 313, 433, 405, 392, 206, 340, 414, 370, 517, 336, 839, 298, 523, 1082]

# Sanity check: each list should hold one value per month.
print('length of tobacco_positive_weight_list: ', len(tobacco_positive_weight_list))
print('length of tobacco_neutral_weight_list: ', len(tobacco_neutral_weight_list))
print('length of tobacco_negative_weight_list: ', len(tobacco_negative_weight_list))

length of tobacco_positive_weight_list:  37
length of tobacco_neutral_weight_list:  37
length of tobacco_negative_weight_list:  37


### get the average number for favorite and retweet - build table in Overleaf

In [19]:
# Total retweets and total favorites for each month, in the same order as
# year_month_key_list.
retweet_per_month_sum_list = [
    sum(retweet_total_dict[month_key]) for month_key in year_month_key_list
]
favorite_per_month_sum_list = [
    sum(favorite_total_dict[month_key]) for month_key in year_month_key_list
]

In [20]:
# Display the per-month retweet totals (one entry per key in year_month_key_list).
retweet_per_month_sum_list

[3828,
 252,
 276,
 112,
 393,
 201,
 195,
 216,
 179,
 276,
 258,
 362,
 210,
 398,
 133,
 678,
 144,
 444,
 375,
 221,
 93,
 135,
 150,
 106,
 347,
 277,
 221,
 267,
 130,
 201,
 245,
 703,
 118,
 288,
 188,
 100,
 318]

In [21]:
# Display the per-month favorite totals (one entry per key in year_month_key_list).
favorite_per_month_sum_list

[4205,
 1148,
 1064,
 849,
 1813,
 1617,
 953,
 1990,
 599,
 1364,
 1298,
 1996,
 1151,
 1799,
 1152,
 5513,
 1367,
 1207,
 1352,
 2340,
 896,
 866,
 998,
 1031,
 1745,
 2023,
 1489,
 4638,
 1171,
 1347,
 3273,
 3219,
 829,
 4894,
 1565,
 1179,
 2668]