<img src="https://datasciencedegree.wisconsin.edu/wp-content/themes/data-gulp/images/logo.svg" width="300">


# Final project DS710 - Aditya Nanduri

In [1]:
import tweepy
import json
import pandas as pd
import preprocessor as p
import numpy as np
import os
import re
import string
from textblob import TextBlob
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

#### Loading credentials from an external file

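The credentials live in a plain-text Python source file stored outside this notebook, elsewhere on my computer. The `%run` magic below executes that file so the variables it defines become available here.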

In [2]:
%run ~/downloads/twitter_credentials.py
# This cell evaluates silently 🙊 and prints nothing.
# That is intentional: a person with your keys can act as you on Twitter in literally every way 😟
# The script is expected to define con_key, con_secret, acc_token and acc_secret,
# which the authentication cell further down reads.

🔐 If you need to check whether the four variables, such as `con_key` have the correct value, insert a cell and print the value, then delete the cell.  Keep your credentials secret and safe!!!  

#### 3. Make an `API` object

The `tweepy.API` object handles construction of the Twitter API calls for you.  It's a convenience layer, but it's really dang convenient!

In [3]:
# Build an OAuth handler from the consumer key/secret and attach the access
# token/secret; all four values come from the credentials script run earlier.
auth = tweepy.OAuthHandler(consumer_key=con_key, consumer_secret=con_secret)
auth.set_access_token(acc_token, acc_secret)

# Connect to the Twitter API using the authentication.
# wait_on_rate_limit=True makes tweepy sleep until the rate-limit window resets
# instead of raising; the paginated search below stops on any TweepError, so
# without this a rate limit would silently cut the collected data short.
api = tweepy.API(auth, wait_on_rate_limit=True)

### The REST API

The REST API allows you to _pull_ information from Twitter, or _push_ information back to Twitter.  We'll use the REST API to run a specific search.  

In [4]:
# Search queries for the REST API static search.
# Each query ORs together a list of hashtags and then excludes retweets and
# tweets carrying media via the -filter: operators.
# NOTE(review): the -filter: terms follow the last hashtag without parentheses;
# confirm against the Twitter search operator docs that they apply to the whole
# OR group rather than only to the final hashtag.
trump_keywords = (
    '#trump OR #donaldtrump OR #republican OR #keepamericagreat OR #maga OR '
    '#conservative OR #trump2020 OR #trumpforpresident OR #fakepresident OR '
    '#dumptrump OR #impeachtrump OR #notmypresident '
    '-filter:retweets -filter:media'
)
joebiden_keywords = (
    '#biden OR #democrats OR #voteblue OR #voteforblue OR #joebidenforpresident OR '
    '#joebiden OR #joe OR #josephbiden OR #bidenforusa OR #creepyjoebiden OR '
    '#bidendropout OR #bideniscorrupt OR #biden2020 '
    '-filter:retweets -filter:media'
)
# Narrower single-hashtag alternatives, kept for reference:
#trump_keywords = '#trump2020'
#joebiden_keywords = '#biden2020'


### Cleaning text: removing stop words, emoticons and punctuation

In [5]:

# Text emoticons to strip from tweets, grouped by mood; the combined set
# `emoticons` is what tokens are checked against.
_happy_emoticons = {
    ':-)', ':)', ';)', ':o)', ':]', ':3', ':c)', ':>', '=]', '8)', '=)', ':}',
    ':^)', ':-D', ':D', '8-D', '8D', 'x-D', 'xD', 'X-D', 'XD', '=-D', '=D',
    '=-3', '=3', ':-))', ":'-)", ":')", ':*', ':^*', '>:P', ':-P', ':P', 'X-P',
    'x-p', 'xp', 'XP', ':-p', ':p', '=p', ':-b', ':b', '>:)', '>;)', '>:-)',
    '<3',
}
_sad_emoticons = {
    ':L', ':-/', '>:/', ':S', '>:[', ':@', ':-(', ':[', ':-||', '=L', ':<',
    ':-[', ':-<', '=\\', '=/', '>:(', ':(', '>.<', ":'-(", ":'(", ':\\', ':-c',
    ':c', ':{', '>:\\', ';(',
}
emoticons = _happy_emoticons | _sad_emoticons

# Unicode ranges matched by the emoji pattern; one or more consecutive
# characters from any of these ranges form a single match.
_emoji_ranges = [
    "\U0001F600-\U0001F64F",  # emoticons
    "\U0001F300-\U0001F5FF",  # symbols & pictographs
    "\U0001F680-\U0001F6FF",  # transport & map symbols
    "\U0001F1E0-\U0001F1FF",  # flags (iOS)
    "\U00002702-\U000027B0",
    # Very wide span: everything from U+24C2 through U+1F251 matches, which
    # includes far more than emoji (e.g. the CJK ideograph blocks).
    "\U000024C2-\U0001F251",
]
emoji_pattern = re.compile("[" + "".join(_emoji_ranges) + "]+", flags=re.UNICODE)

def clean_tweets(tweet):
    """Remove stop words, punctuation, emoticons and emojis from a tweet.

    The text is first scrubbed with regular expressions (colons, the
    truncation ellipsis, non-ASCII runs, emojis) and only then tokenized, so
    the scrubbing actually affects the returned text.

    Parameters
    ----------
    tweet : str
        Tweet text, typically already passed through ``p.clean``.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (empty string if none).
    """
    stop_words = set(stopwords.words('english'))

    # After tweepy preprocessing a colon is left behind where a mention or
    # the leading RT sign was removed.
    tweet = re.sub(r':', '', tweet)
    # Drop the ellipsis character (U+2026); the previous pattern here was the
    # mis-decoded byte sequence of that character and could never match it.
    tweet = re.sub('\u2026', '', tweet)
    # Replace consecutive non-ASCII characters with a space.
    tweet = re.sub(r'[^\x00-\x7F]+', ' ', tweet)
    # Remove emojis. Every range in emoji_pattern is non-ASCII, so after the
    # substitution above this is only a safety net.
    tweet = emoji_pattern.sub(r'', tweet)

    # Tokenize the *cleaned* text; tokenizing before cleaning discarded all
    # of the substitutions above.
    word_tokens = word_tokenize(tweet)

    # Keep a token unless it is a stop word, a known emoticon, or made up
    # entirely of punctuation characters (covers multi-character tokens such
    # as "..." that a plain substring test against string.punctuation misses).
    # This helps in evaluating sentiment scores in a better way.
    filtered_tweet = [
        w for w in word_tokens
        if w not in stop_words
        and w not in emoticons
        and not all(ch in string.punctuation for ch in w)
    ]
    return ' '.join(filtered_tweet)

### Get Cleaned Tweets with Sentiment score

In [6]:
def get_clean_tweets(search_keyword,num_of_tweets):
    """Search Twitter, clean each tweet and score its sentiment.

    Tweets are fetched in pages of up to 100, walking backwards in time via
    ``max_id``, until at least ``num_of_tweets`` have been collected, the
    search runs dry, or the API raises an error.

    Parameters
    ----------
    search_keyword : str
        Query string passed to ``api.search``.
    num_of_tweets : int
        Minimum number of tweets to collect. Whole pages are kept, so the
        result may contain up to one page more than requested.

    Returns
    -------
    pandas.DataFrame
        One row per tweet with the columns Index, Text, screen_name,
        created_at, Is_retweeted, retweet_count, favorite_count, country,
        Sentiment, polarity and subjectivity. Empty (but with those columns)
        when no tweets were retrieved.
    """
    tweet_list = []
    last_id = None  # id of the oldest tweet seen so far; None before the first page
    while len(tweet_list) < num_of_tweets:
        search_args = {'q': search_keyword, 'count': 100, 'lang': 'en'}
        if last_id is not None:
            # Only page backwards once there is a real id to page from; the
            # first request must not carry a made-up max_id.
            search_args['max_id'] = str(last_id - 1)
        try:
            new_tweets = api.search(**search_args)
        except tweepy.TweepError as e:
            print("Error", e)
            break
        if not new_tweets:
            print("Could not find any more tweets!")
            break
        tweet_list.extend(new_tweets)
        last_id = new_tweets[-1].id

    column_names = ['Index','Text', 'screen_name','created_at','Is_retweeted','retweet_count','favorite_count','country','Sentiment','polarity','subjectivity']

    # Extract the fields we need from each tweet object.
    # NOTE(review): this reads tweet.text; confirm whether the search should
    # request extended mode if untruncated tweet text is required.
    all_tweet_list = []
    for i, tweet in enumerate(tweet_list):
        filtered_tweet = clean_tweets(p.clean(tweet.text))
        sentiment = TextBlob(filtered_tweet).sentiment
        # place is only set on some tweets; record NaN when it is missing
        country = tweet.place.country_code if tweet.place else np.nan
        all_tweet_list.append([i,
                               filtered_tweet,
                               tweet.user.screen_name,
                               tweet.created_at,
                               tweet.retweeted,
                               tweet.retweet_count,
                               tweet.favorite_count,
                               country,
                               sentiment,
                               sentiment.polarity,
                               sentiment.subjectivity])

    # Passing the column names to the constructor also works for zero rows,
    # whereas assigning them afterwards raises on an empty frame.
    return pd.DataFrame(all_tweet_list, columns=column_names)

In [7]:
# Collect and score the Trump-related tweets, then show how many rows came back.
trump_df = get_clean_tweets(search_keyword=trump_keywords, num_of_tweets=14000)
trump_df.shape[0]

14009

In [9]:
# Collect and score the Biden-related tweets, then show how many rows came back.
biden_df = get_clean_tweets(search_keyword=joebiden_keywords, num_of_tweets=14000)
biden_df.shape[0]

14033

In [10]:
# Label each tweet by the sign of its polarity score: exactly 0 is Neutral,
# above 0 is Positive, and everything else falls through to Negative.
trump_polarity = trump_df['polarity']
trump_df['attitude'] = np.select(
    [trump_polarity == 0, trump_polarity > 0],
    ['Neutral', 'Positive'],
    default='Negative',
)
trump_df['candidate'] = 'Trump'


In [11]:
# Label each tweet by the sign of its polarity score: exactly 0 is Neutral,
# above 0 is Positive, and everything else falls through to Negative.
biden_polarity = biden_df['polarity']
biden_df['attitude'] = np.select(
    [biden_polarity == 0, biden_polarity > 0],
    ['Neutral', 'Positive'],
    default='Negative',
)
biden_df['candidate'] = 'Biden'

In [12]:
import csv

# Write each candidate's labelled tweets to its own CSV file, without the
# DataFrame index column.
for frame, csv_name in (
    (trump_df, "Trumprelatedtweets.csv"),
    (biden_df, "Bidenrelatedtweets.csv"),
):
    frame.to_csv(csv_name, index=False)