In [1]:
# ------------------------ REFERENCES ------------------------ #
# "NLP Tutorial 10 - Real Time Twitter Sentiment Analysis | Donald Trump vs Warren Twitter Sentiment". YouTube, uploaded by KGP Talkie, Nov 2, 2019, URL: https://youtu.be/gUFDtuz73gI

# Authentication

In [2]:
consumer_key = '<your consumer key>'
consumer_secret = '<your consumer secret>'
access_token = '<your access token>'
access_token_secret = '<your access token secret>'
bearer_token = '<your bearer token>'

In [3]:
import tweepy

In [4]:
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)

In [5]:
api = tweepy.API(auth)

In [6]:
import pandas as pd

# UPDATE POSITIVE/NEGATIVE VALUES ON START OF PROGRAM FROM DB
Province_indexes = {'KPK':0,'FATA':1,'ISLAMABAD':2,'PUNJAB':3,'SINDH':4,'BALOCHISTAN':5}
df = pd.read_csv('IK_tweets_provincial_stats.csv')
print(df)

# Provinces = ['KPK','FATA','ISLAMABAD','PUNJAB','SINDH','BALOCHISTAN']
# Positive = {'KPK':0,'FATA':0,'ISLAMABAD':0,'PUNJAB':0,'SINDH':0,'BALOCHISTAN':0}
# Negative = {'KPK':0,'FATA':0,'ISLAMABAD':0,'PUNJAB':0,'SINDH':0,'BALOCHISTAN':0}
# Data = {'Province': Provinces, 'Positive': list(Positive.values()), 'Negative': list(Negative.values())}
# df = pd.DataFrame(Data, columns= ['Province', 'Positive', 'Negative'])

      Province  Positive  Negative  Neutral
0          KPK       130       101      186
1         FATA        21        18       29
2    ISLAMABAD         8         5        9
3       PUNJAB       455       350      715
4        SINDH       203       161      326
5  BALOCHISTAN        43        28       80


# Sentiment analysis functions

In [7]:
from textblob import TextBlob

def get_sentiment_textBlob(text, province):
    # create TextBlob object of passed tweet text
    analysis = TextBlob(text)
    # set sentiment
    sentiment = ''
    subjectivity = ''
    if analysis.sentiment.polarity > 0:
        sentiment = 'positive 🙂'
        df['Positive'][Province_indexes[province]] += 1
    elif analysis.sentiment.polarity == 0:
        sentiment = 'neutral 😶'
        df['Neutral'][Province_indexes[province]] += 1
    else:
        sentiment = 'negative 😠'
        df['Negative'][Province_indexes[province]] += 1

    if analysis.sentiment.subjectivity > 0.5:
        subjectivity = '🗣 opinion'
    elif analysis.sentiment.polarity < 0.5:
        subjectivity = '📊 factual '
    else:
        subjectivity = '😶 neutral'
    
    return sentiment + ' : ' + subjectivity

In [8]:
print('🙂')

🙂


# Location

In [9]:
# def get_location():
#     return random.choice(['KPK','FATA','ISLAMABAD','PUNJAB','SINDH','BALOCHISTAN'])

def get_location():
    import random
    return (random.choices(df['Province'], weights=(30508920, 4993044, 2003368, 109989655, 47854510, 12335129), k=1))[0]

# Streaming helper functions

In [10]:
# SOURCE: 'Twitter-API-v2-sample-code'. GitHub, uploaded by EthanJWright et al. 28 Aug, 2020. URL: 'https://github.com/twitterdev/Twitter-API-v2-sample-code/blob/master/Filtered-Stream/filtered_stream.py' [Accessed: 17 June 2021]

import requests
import os
import json

def create_headers(bearer_token):
    headers = {"Authorization": "Bearer {}".format(bearer_token)}
    return headers


def get_rules(headers, bearer_token):
    response = requests.get(
        "https://api.twitter.com/2/tweets/search/stream/rules", headers=headers
    )
    if response.status_code != 200:
        raise Exception(
            "Cannot get rules (HTTP {}): {}".format(response.status_code, response.text)
        )
    print(json.dumps(response.json()))
    return response.json()


def delete_all_rules(headers, bearer_token, rules):
    if rules is None or "data" not in rules:
        return None

    ids = list(map(lambda rule: rule["id"], rules["data"]))
    payload = {"delete": {"ids": ids}}
    response = requests.post(
        "https://api.twitter.com/2/tweets/search/stream/rules",
        headers=headers,
        json=payload
    )
    if response.status_code != 200:
        raise Exception(
            "Cannot delete rules (HTTP {}): {}".format(
                response.status_code, response.text
            )
        )
    print(json.dumps(response.json()))



def set_rules(headers, delete, bearer_token):
    # You can adjust the rules if needed
    sample_rules = [
        # {"value": "politics", "tag": "Imran Khan"},
        {"value": "Imran Khan", "tag": "politics", "keyword": "Imran Khan", "lang": "en"},
        # {"value": "politics", "tag": "#ImranKhan"},                       # THIS DOESN'T WORK
        # {"value": "dog has:images", "tag": "dog pictures"},
        # {"value": "cat has:images -grumpy", "tag": "cat pictures"},
    ]
    payload = {"add": sample_rules}
    response = requests.post(
        "https://api.twitter.com/2/tweets/search/stream/rules",
        headers=headers,
        json=payload,
    )
    if response.status_code != 201:
        raise Exception(
            "Cannot add rules (HTTP {}): {}".format(response.status_code, response.text)
        )
    print(json.dumps(response.json()))


# PREPROCESSING

In [11]:
import re
import nltk
import emoji
from nltk.stem import WordNetLemmatizer
from nltk.corpus import words

words = set(words.words())
def get_tweet_text(tweet_details):
    # REGEX SOURCES: 
    # 'Extract string after colon or parenthesis with regex in python', question on Stackoverflow, answered by Gustav Rasmussen, 10 July 2020, URL: https://stackoverflow.com/questions/62831803/extract-string-after-colon-or-parenthesis-with-regex-in-python, [Accessed: 17 June 2021]
    # 'RegEx: Grabbing values between quotation marks', question on Stackoverflow answered by Adam, 5 October 2008, URL: https://stackoverflow.com/questions/171480/regex-grabbing-values-between-quotation-marks, [Accessed: 17 June 2021]
    # 'Regex to get words after matching string', question on Stackoverflow answered by Himanshu Chauhan, 20 September 2016, URL: https://stackoverflow.com/questions/19193251/regex-to-get-the-words-after-matching-string, [Accessed: 17 June 2021]
    regex = re.compile( r"(?<=\"text\":\s).*([\"\"])(?:(?=(\\?)).)*?", re.I )
    return re.search(regex, tweet_details).group(0)

# 'Cleaning Twitter data pandas python', question on Stackoverflow answered by Celius Stingher, 6 November 2020, URL: https://stackoverflow.com/questions/64719706/cleaning-twitter-data-pandas-python, [Accessed: 18 June 2021]
# 'How to get rid of punctuation using NLTK tokenizer?', question on Stackoverflow answered by rmalouf, 21 March 2013, URL: https://stackoverflow.com/questions/15547409/how-to-get-rid-of-punctuation-using-nltk-tokenizer, [Accessed: 18 June 2021]
def cleaner(tweet):
    tweet= re.sub(r"\\n","",tweet)                                      # Removing \n
    tweet= re.sub(r"\\u....","",tweet)                                  # Removing unicode
    tweet= re.sub(r"RT[\s]+","",tweet)                                  # Removing RT
    tweet = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", tweet)    # Remove http links and mentions
    tweet = " ".join(tweet.split())
    tweet = ''.join(c for c in tweet if c not in emoji.UNICODE_EMOJI)   # Remove Emojis
    tweet = tweet.replace("#", "").replace("_", " ")                    # Remove hashtag sign but keep the text
    tweet = tweet.lower()
    tokenizer = RegexpTokenizer(r'\w+')                               # |
    tweet = " ".join(tokenizer.tokenize(tweet))                       # Remove punctuation marks

    return tweet
    
    # tweet = " ".join(w for w in nltk.wordpunct_tokenize(tweet) \
    #      if w.lower() in words or not w.isalpha())
    # for w in nltk.wordpunct_tokenize(tweet)

def half_cleaner(tweet):
    tweet= re.sub(r"\\n","",tweet)                                      # Removing \n
    tweet= re.sub(r"\\u....","",tweet)                                  # Removing unicode
    return tweet

def lemmatize(text):
    new_text = []
    words = nltk.wordpunct_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    for word in words:
        new_text.append(lemmatizer.lemmatize(word))

    new_text = " ".join(new_text)
    return new_text


# Main streaming function

In [12]:
import csv
import random
import time
from nltk.tokenize import RegexpTokenizer

field_names = ['tweet', 'location', 'sentiment']

def get_stream(headers, set, bearer_token):
    response = requests.get(
        "https://api.twitter.com/2/tweets/search/stream", headers=headers, stream=True,
    )
    print(response.status_code)
    if response.status_code != 200:
        raise Exception(
            "Cannot get stream (HTTP {}): {}".format(
                response.status_code, response.text
            )
        )
    tweetNo = 0
    # Below line gives an error sometimes. This is a twitter API issue being faced by multiple developers and is currently being worked on. SOURCE: https://twittercommunity.com/t/requests-exceptions-chunkedencodingerror/150530 [Accessed: 18 June 2021]
    for response_line in response.iter_lines():
        if response_line:
            tweetNo += 1
            json_response = json.loads(response_line)
            tempTweet = json.dumps(json_response, indent=4, sort_keys=True)

            # --VARIABLES--
            tweet = {}                                  # of the form {'location': ,'tweet':, 'sentiment':}
            tweet['location'] = get_location()
            text = get_tweet_text(tempTweet)
            cleaned_text = cleaner(text)
            lemmatized_cleaned_text = lemmatize(cleaned_text)
            sentiment = get_sentiment_textBlob(cleaned_text, tweet['location'])
            tweet['tweet'] = lemmatized_cleaned_text
            tweet['sentiment'] = sentiment
            # --PRINT--
            print('Tweet ', tweetNo, ':\n', tempTweet)
            print('TWEET TEXT: ', text)
            print('CLEANED TEXT: ', cleaned_text)
            print('CLEANED AND LEMMATIZED: ', lemmatized_cleaned_text)
            print('SENTIMENT: ', sentiment)
            print('LOCATION: ', tweet['location'])
            # --SAVE--            
            df.to_csv('IK_tweets_provincial_stats.csv', index = False, header=True)
            f_obj = open('tweet.txt', 'w')
            # sentiment = ''.join(c for c in sentiment if c not in emoji.UNICODE_EMOJI)   # Remove Emojis
            sentiment = re.sub(r"..:..", ' | ', sentiment)
            half_cleaned_text = half_cleaner(text)
            f_obj.write(sentiment+'\n'+half_cleaned_text)
            f_obj.close()
            # with open('IK_tweets.csv', 'a') as csv_file:
            #     csv_writer = csv.DictWriter(csv_file, fieldnames = field_names)
            #     csv_writer.writerow(tweet)
            #     print('Tweet written to CSV...')
    return

# STREAM!

In [13]:
data = api.rate_limit_status()

print (data['resources']['statuses']['/statuses/home_timeline'])
print (data['resources']['users']['/users/lookup'])

{'limit': 15, 'remaining': 15, 'reset': 1624296800}
{'limit': 900, 'remaining': 900, 'reset': 1624296800}


In [14]:
def main():
    bearer_token = '<your bearer token>'
    headers = create_headers(bearer_token)
    rules = get_rules(headers, bearer_token)
    delete = delete_all_rules(headers, bearer_token, rules)
    set = set_rules(headers, delete, bearer_token)
    get_stream(headers, set, bearer_token)

if __name__ == "__main__":
    main()

{"data": [{"id": "1406961359074385923", "value": "Imran Khan", "tag": "politics"}], "meta": {"sent": "2021-06-21T17:18:22.248Z"}}
{"meta": {"sent": "2021-06-21T17:18:23.256Z", "summary": {"deleted": 1, "not_deleted": 0}}}
{"data": [{"value": "Imran Khan", "tag": "politics", "id": "1407025118488895500"}], "meta": {"sent": "2021-06-21T17:18:24.200Z", "summary": {"created": 1, "not_created": 0, "valid": 1, "invalid": 0}}}
200
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Negative'][Province_indexes[province]] += 1
Tweet  1 :
 {
    "data": {
        "id": "1407025093293842436",
        "text": "RT @AdnanhussainAD: \"If a woman is wearing v few clothes, it will have an impact on the men, unless they're robots\" PM Imran Khan https://t\u2026"
    },
    "matching_rules": [
        {
            "id": 1407025118488895500,
      

ChunkedEncodingError: ("Connection broken: InvalidChunkLength(got length b'', 0 bytes read)", InvalidChunkLength(got length b'', 0 bytes read))

15 15


15 15
