In [1]:
import os
import json
import pandas as pd
import numpy as np
import requests
import time

import tweepy

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer 
import re

from __private import CONSUMER_API_KEY
from __private import CONSUMER_API_KEY_SECRET
from __private import ACCESS_TOKEN
from __private import ACCESS_TOKEN_SECRET

In [2]:
# tweepy API
# Authenticate with the OAuth credentials imported from __private.
auth = tweepy.OAuthHandler(CONSUMER_API_KEY, CONSUMER_API_KEY_SECRET)
auth.set_access_token(ACCESS_TOKEN, ACCESS_TOKEN_SECRET)
# wait_on_rate_limit=True asks tweepy to pause until the rate limit replenishes
# instead of failing the request. Without it, every lookup made while throttled
# fails and get_tweet_retweet_favorite_by_id records the tweet as 0 retweets /
# 0 favorites, which silently skews the per-month weights.
api = tweepy.API(auth, wait_on_rate_limit=True)
    
def get_tweet_retweet_favorite_by_id(tweet_id):
    """Return ``(retweet_count, favorite_count)`` for the tweet with ``tweet_id``.

    The lookup is best-effort: when ``api.get_status`` raises, or the returned
    payload lacks either count, both values fall back to 0 so the caller's
    loop keeps going.

    Only ``Exception`` subclasses are absorbed. ``KeyboardInterrupt`` and
    ``SystemExit`` propagate, so a long-running fetch loop can still be
    interrupted from the notebook.

    Args:
        tweet_id: identifier passed unchanged to ``api.get_status``.

    Returns:
        A ``(retweet, favorite)`` tuple; ``(0, 0)`` when the lookup failed.
    """
    try:
        tweet = api.get_status(tweet_id)._json
        # Read both counts before returning so a payload missing either key
        # yields (0, 0) rather than a half-filled result.
        retweet = tweet['retweet_count']
        favorite = tweet['favorite_count']
    except Exception:
        return 0, 0
    return retweet, favorite

# get_tweet_retweet_favorite_by_id(726061616869961728)

In [3]:
# sentiment analysis
# Module-level VADER analyzer instance; analyzeSentimentByVader_one_sentence
# reads it as a global, so this cell has to run before that function is called.
analyzer = SentimentIntensityAnalyzer()

def analyzeSentimentByVader_one_sentence(sentence):
    """Map the VADER compound score of ``sentence`` to a sentiment label.

    The score is taken from ``analyzer.polarity_scores(sentence)['compound']``.

    Returns:
        1  when compound >= 0.05 (positive),
        0  when -0.05 < compound < 0.05 (neutral),
        -1 when compound <= -0.05 (negative),
        -2 when none of the comparisons holds (e.g. a NaN score); callers
           treat this value as an error marker.
    """
    compound = analyzer.polarity_scores(sentence)['compound']

    if compound >= 0.05:
        return 1  # positive
    if compound <= -0.05:
        return -1  # negative
    if -0.05 < compound < 0.05:
        return 0  # neutral

    return -2  # should not be in here

In [16]:
# Month keys ('YYYY_MM') to process; each key is spliced into the input file
# names built in the collection loop.
#
# Alternative selections kept for quick switching.
# Full range:
# year_month_key_list = ['2016_04','2016_05','2016_06','2016_07','2016_08','2016_09','2016_10','2016_11','2016_12',
#                        '2017_01','2017_02', '2017_03','2017_04','2017_05','2017_06','2017_07','2017_08','2017_09',
#                        '2017_10','2017_11','2017_12', '2018_01', '2018_02', '2018_03', '2018_04', '2018_05', 
#                        '2018_06', '2018_07', '2018_08', '2018_09', '2018_10', '2018_11', '2018_12', '2019_01', 
#                        '2019_02', '2019_03', '2019_04' ]
# Single month:
# year_month_key_list = ['2016_04']

# Current selection: January through April 2019.
year_month_key_list = ['2019_%02d' % month_number for month_number in range(1, 5)]

# Directory prefix that is concatenated with each input file name.
file_path = "/mnt/volume-5T/result/"

In [5]:
def get_weight(year_month):
    """Total the engagement weight per sentiment class for one month.

    Reads the module-level ``sentiment_total_dict``, ``retweet_total_dict``
    and ``favorite_total_dict``; for ``year_month`` their lists are walked in
    parallel by index. A tweet's weight is its retweet count plus its
    favorite count, and it is added to the total matching the tweet's
    sentiment label (1, 0 or -1). Any other label is reported on stdout and
    contributes to no total.

    Args:
        year_month: key into the three module-level dictionaries.

    Returns:
        ``(positive_weight, neutral_weight, negative_weight)``. The three
        totals are also printed.
    """
    labels = sentiment_total_dict[year_month]
    positive_weight = neutral_weight = negative_weight = 0

    for position, label in enumerate(labels):
        # Both counts are fetched before the label is inspected, so a missing
        # entry raises regardless of the label's value.
        retweets = retweet_total_dict[year_month][position]
        favorites = favorite_total_dict[year_month][position]
        engagement = retweets + favorites

        if label == 1:
            positive_weight += engagement
            continue
        if label == 0:
            neutral_weight += engagement
            continue
        if label == -1:
            negative_weight += engagement
            continue
        print('sentiment cannot be other nunmber besides 1, 0, -1')

    totals = (positive_weight, neutral_weight, negative_weight)
    print('positive_weight: ', totals[0])
    print('neutral_weight: ', totals[1])
    print('negative_weight: ', totals[2])
    return totals

In [17]:
# Per-month accumulators keyed by 'YYYY_MM'. For a given month the four lists
# are kept index-aligned: entry i of each list describes the same tweet, which
# is what get_weight relies on when it walks them in parallel.
text_total_dict = {}
sentiment_total_dict = {}
retweet_total_dict = {}
favorite_total_dict = {}

for year_month in year_month_key_list:
    
    # e-cigarette
#     file_name = 'juliana_SF_allECigarette_'+year_month+'_categoryKeyWord.json'
    
    # tobacco
#     file_name = 'juliana_allSF_tobacco_'+year_month+'.json'
    
    # flavored tobacco
    file_name = 'juliana_allSF_flavored_tobacco_'+year_month+'.json'
    
    file = file_path + file_name
    
    # initialization
    text_total_dict[year_month] = []
    sentiment_total_dict[year_month] = []
    retweet_total_dict[year_month] = []
    favorite_total_dict[year_month] = []
    
    # The file holds one JSON document per line. Read it fully and close it
    # before the per-tweet API calls start.
    with open(file, "r") as f:
        everyLines = f.readlines()
    num_of_tweets = len(everyLines)
    print('length of tweets in month %s: %d' % (year_month, num_of_tweets))
#     tweets_total_list.append(num_of_tweets)

    for line in everyLines:
        try:
            lineInJson = json.loads(line)
        except json.JSONDecodeError:
            print('Cannot load the line: ', line)
            # Skip the line: carrying on would reuse the previous tweet's
            # parsed data (or hit a NameError on the first line).
            continue

        text_inLine = lineInJson["text"]
        # Strip t.co short links (the prefix plus the 10 characters after it).
        # Raw string: same pattern as before, without invalid-escape warnings.
        text_afterRE = re.sub(r'https?:\/\/t\.co\/[\s\S]{10}', '', text_inLine)

        # get sentiment
        sentiment_sentence = analyzeSentimentByVader_one_sentence(text_afterRE)
        if sentiment_sentence == -2:
            print('Something wrong. The sentiment cannot be -2')
            # Drop the tweet from every list; appending only some of its
            # fields would shift the indices of all later tweets.
            continue

        # get retweet and favorite
        id_inLine = lineInJson['id']
        retweet_inLine, favorite_inLine = get_tweet_retweet_favorite_by_id(id_inLine)

        # Append all four fields together so the lists stay index-aligned.
        text_total_dict[year_month].append(text_afterRE)
        sentiment_total_dict[year_month].append(sentiment_sentence)
        retweet_total_dict[year_month].append(retweet_inLine)
        favorite_total_dict[year_month].append(favorite_inLine)
    
    get_weight(year_month)
    
#     # Wait for 20 mins
#     time.sleep(1200)
    
#     break
    
print('length of text_total_dict: ', len(text_total_dict))
print('length of sentiment_total_dict: ', len(sentiment_total_dict))
print('sentiment_total_dict: ', sentiment_total_dict)
print('retweet_total_dict: ', retweet_total_dict)
print('favorite_total_dict: ', favorite_total_dict)

length of tweets in month 2019_01: 42
positive_weight:  19
neutral_weight:  30
negative_weight:  323
length of tweets in month 2019_02: 36
positive_weight:  4
neutral_weight:  174
negative_weight:  20
length of tweets in month 2019_03: 48
positive_weight:  121
neutral_weight:  89
negative_weight:  9
length of tweets in month 2019_04: 43
positive_weight:  8
neutral_weight:  197
negative_weight:  2
length of text_total_dict:  4
length of sentiment_total_dict:  4
sentiment_total_dict:  {'2019_01': [1, -1, 1, 0, 0, 0, 0, 1, -1, -1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, -1, 0, 0, 0, -1, 1, 1, 0, 0, 0, -1, 0, -1, 0, 0], '2019_02': [1, -1, 1, 0, 0, 0, 1, 1, 1, 0, 0, -1, 0, 0, 0, 1, 1, -1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, -1, -1, 1], '2019_03': [1, 0, 0, 0, 0, -1, 0, 1, 0, 0, -1, -1, -1, -1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, -1, 0, 1, -1, 0, 0, 0, 0, 0, 0, 0, 1, 1, -1, 0, -1, -1, 0], '2019_04': [0, 0, 0, -1, 0, 0, 1, 1, 0, 0, -1, 1, 0, 0, 0, 1, 0, -1, 0, 

### weight = retweet + favorite

In [18]:
# One running total per month and sentiment class, in the order of
# year_month_key_list. A tweet's weight is its retweet count plus its
# favorite count.
positive_weight_num_list = [0] * len(year_month_key_list)
neutral_weight_num_list = [0] * len(year_month_key_list)
negative_weight_num_list = [0] * len(year_month_key_list)

for year_month_index, year_month in enumerate(year_month_key_list):
    for i, sentiment in enumerate(sentiment_total_dict[year_month]):
        retweet = retweet_total_dict[year_month][i]
        favorite = favorite_total_dict[year_month][i]
        weight = retweet + favorite

        # Pick the list that matches the sentiment label; unknown labels are
        # reported and contribute to no total.
        if sentiment == 1:
            target_list = positive_weight_num_list
        elif sentiment == 0:
            target_list = neutral_weight_num_list
        elif sentiment == -1:
            target_list = negative_weight_num_list
        else:
            print('sentiment cannot be other nunmber besides 1, 0, -1')
            continue
        target_list[year_month_index] += weight
        
print('positive_weight_num_list: ', positive_weight_num_list)
print('neutral_weight_num_list: ', neutral_weight_num_list)
print('negative_weight_num_list: ', negative_weight_num_list)

positive_weight_num_list:  [19, 4, 121, 8]
neutral_weight_num_list:  [30, 174, 89, 197]
negative_weight_num_list:  [323, 20, 9, 2]


### Debug - dates after March 2017

In [61]:
def get_tweet_retweet_favorite_temp(tweet_id):
    """Debug lookup of ``(retweet_count, favorite_count)`` for one tweet.

    Calls ``api.get_status(tweet_id)`` and reads both counts from the
    status' ``_json`` payload. There is deliberately no try/except here, so
    any error raised by the API call or a missing key reaches the caller
    instead of being replaced by zeros.
    """
    payload = api.get_status(tweet_id)._json
    return payload['retweet_count'], payload['favorite_count']

# Debug probe: look up the first parseable tweet of one month with the
# unguarded helper, so that any API error is shown instead of being hidden.
for year_month in ['2017_04']:
    # e-cigarette
    file_name = 'juliana_SF_allECigarette_'+year_month+'_categoryKeyWord.json'
    
    # tobacco
#     file_name = 'juliana_allSF_tobacco_'+year_month+'.json'
    
    # flavored tobacco
#     file_name = 'juliana_allSF_flavored_tobacco_'+year_month+'.json'
    
    file = file_path + file_name
    with open(file, "r") as f:
        everyLines = f.readlines()
        num_of_tweets = len(everyLines)
        print('length of tweets in month %s: %d' % (year_month, num_of_tweets))
#         tweets_total_list.append(num_of_tweets)
        for line in everyLines:
            try:
                lineInJson = json.loads(line)
            except json.JSONDecodeError:
                print('Cannot load the line: ', line)
                # Move on to the next line; otherwise the lookup below would
                # use a lineInJson left over from an earlier line or cell.
                continue
            
            # get retweet and favorite
            id_inLine = lineInJson['id']
            retweet_inLine, favorite_inLine = get_tweet_retweet_favorite_temp(id_inLine)
            print('favorite_inLine: ', favorite_inLine)
            # One successful lookup is enough for this probe.
            break

length of tweets in month 2017_04: 59
favorite_inLine:  0


The Twitter API used through Tweepy is rate-limited. The limit is counted per 15-minute window, so a long run may exceed it.

### flavored tobacco weight list

In [19]:
# Hard-coded per-month engagement weights for the flavored-tobacco data set,
# one entry per month. The last four entries of each list equal the
# 2019_01..2019_04 totals printed by the aggregation cell.
# NOTE(review): the rows below presumably correspond to 2016_04-2016_12, 2017,
# 2018 and 2019_01-2019_04 (37 months) - confirm against the earlier runs.
flavored_tobacco_positive_weight_list = [
    20, 21, 14, 30, 15, 20, 27, 9, 6,
    9, 9, 34, 118, 25, 23, 10, 9, 8, 19, 27, 15,
    27, 12, 12, 15, 6, 21, 10, 10, 15, 33, 6, 45,
    19, 4, 121, 8,
]
flavored_tobacco_neutral_weight_list = [
    70, 54, 35, 82, 53, 30, 42, 40, 21,
    82, 117, 34, 108, 18, 11, 29, 16, 72, 14, 29, 32,
    68, 44, 58, 12, 26, 69, 53, 76, 98, 216, 17, 30,
    30, 174, 89, 197,
]
flavored_tobacco_negative_weight_list = [
    6, 17, 48, 0, 8, 4, 5, 9, 8,
    20, 8, 0, 9, 5, 83, 2, 113, 7, 5, 13, 10,
    8, 26, 13, 15, 8, 4, 3, 4, 18, 0, 5, 10,
    323, 20, 9, 2,
]

# Sanity check: the three lists must have the same number of months.
print('length of flavored_tobacco_positive_weight_list: ', len(flavored_tobacco_positive_weight_list))
print('length of flavored_tobacco_neutral_weight_list: ', len(flavored_tobacco_neutral_weight_list))
print('length of flavored_tobacco_negative_weight_list: ', len(flavored_tobacco_negative_weight_list))

length of flavored_tobacco_positive_weight_list:  37
length of flavored_tobacco_neutral_weight_list:  37
length of flavored_tobacco_negative_weight_list:  37


### Get the average number of favorites and retweets - build the table in Overleaf