In [1]:
#Import Libraries
import tweepy
import pandas as pd 
import numpy as np
from textblob import TextBlob
import jsonpickle
import json
import codecs, csv
import html
import re
import sys
import os

In [2]:
#Load credentials from external file
%run ~/Documents/twitter_credentials.py
# this cell will evaluate silently 🙊, and not print anything.  
# This is desired, because a person with your keys can act as you on Twitter in literally every way 😟

In [3]:
#Use tweepy.OAuthHandler to create an authentication using the given key and secret
auth = tweepy.OAuthHandler(consumer_key=con_key, consumer_secret=con_secret)
auth.set_access_token(acc_token, acc_secret)

#Connect to the Twitter API using the authentication
#Waits until code has reached my max count
#Notifies me when it's reach my rate limit
api = tweepy.API(auth, wait_on_rate_limit=True,wait_on_rate_limit_notify=True)

In [4]:
#Create csv file with column headers
with codecs.open('FoxNews_CNN.csv', 'w') as f:
    writer = csv.writer(f)
    writer.writerow(['ID_String','Text'])

In [5]:
tweetCount = 0

#Open csv file to append
with open('FoxNews_CNN.csv', 'a') as f:

    #Tell the Cursor method that we want to use the Search API (api.search)
    #Also tell Cursor our query, filter on retweets, filter on tweets from the 2 news orgs,
    #and the maximum number of tweets to return
    for tweet in tweepy.Cursor(api.search,q='@foxnews OR @cnn -filter:retweets -from:foxnews -from:cnn').items(10000) :    
                 
        #Write to the csv file, and add one to the number of tweets we've collected
        writer = csv.writer(f)
        writer.writerow([tweet.id_str, tweet.text])
        tweetCount += 1

    #Display how many tweets we have collected
    print("Downloaded {0} tweets".format(tweetCount))

Rate limit reached. Sleeping for: 464
Rate limit reached. Sleeping for: 867
Rate limit reached. Sleeping for: 868


Downloaded 10000 tweets


In [6]:
#read foxnews_cnn file into a df
news_df = pd.read_csv('FoxNews_CNN.csv')
news_df

Unnamed: 0,ID_String,Text
0,1071924021799866370,@TessatTys @davidaxelrod @ggreenwald @RahmEman...
1,1071924017911791624,@realDonaldTrump @TheRickWilson @CNN @TheRickW...
2,1071924010211000320,@DeeBlog @Dennis2Clive @CNN Not so- we like hi...
3,1071924009388797953,@BoSnerdley @CNNPolitics There is a wealth of ...
4,1071924007224688640,@AppleWithAFace @paperbackwrit3r @HuffPost @Fo...
5,1071924003042930690,@DIVINE_VISUALS @CNN It's all right. Everybody...
6,1071923989256245248,@CNN There is no fact in this statement. It is...
7,1071923988732018689,@AlsoUnicat @CNN 2. Priebus and Kelly. But Tru...
8,1071923985137360898,@AndrewCMcCarthy @FoxNews @POTUS @foxandfriend...
9,1071923972017668097,@dennismm59 @HeatherChilders @TomFitton @Judic...


In [7]:
#Labels each tweet FoxNews, CNN, or nan
def news_org(row):
    #nan if tweet mentions both foxnews and cnn
    if '@FoxNews' in row['Text'] and '@CNN' in row['Text']:
        return np.nan
    elif '@FoxNews' in row["Text"] :
        return 'FoxNews'
    elif '@CNN' in row["Text"]:
        return 'CNN'
    else:
        return np.nan

In [8]:
#Adds a new column for the labels
news_df['News_Org'] = news_df.apply(news_org, axis=1)

In [9]:
#Shows first 10 rows
display(news_df.head(10))

Unnamed: 0,ID_String,Text,News_Org
0,1071924021799866370,@TessatTys @davidaxelrod @ggreenwald @RahmEman...,CNN
1,1071924017911791624,@realDonaldTrump @TheRickWilson @CNN @TheRickW...,CNN
2,1071924010211000320,@DeeBlog @Dennis2Clive @CNN Not so- we like hi...,CNN
3,1071924009388797953,@BoSnerdley @CNNPolitics There is a wealth of ...,CNN
4,1071924007224688640,@AppleWithAFace @paperbackwrit3r @HuffPost @Fo...,FoxNews
5,1071924003042930690,@DIVINE_VISUALS @CNN It's all right. Everybody...,CNN
6,1071923989256245248,@CNN There is no fact in this statement. It is...,CNN
7,1071923988732018689,@AlsoUnicat @CNN 2. Priebus and Kelly. But Tru...,CNN
8,1071923985137360898,@AndrewCMcCarthy @FoxNews @POTUS @foxandfriend...,FoxNews
9,1071923972017668097,@dennismm59 @HeatherChilders @TomFitton @Judic...,FoxNews


In [10]:
#Decode html to regular text
def html_decoding(text):
    return html.unescape(text)

#remove @mentions
def remove_mentions(text):
    #re.sub function is used to replace substring. The pattern is re.sub(pattern,repl,string).
    #The prefix 'r' lets python know that the expression is a raw string so escape sequences are not parsed.
    #The pattern we are looking for is any string that begins with an @
    #"S" matches any non-whitespace character
    #The plus sign is used as repition operator allowing the preceding token, "any non-whitespace character", 
        #to repeat one or more times
    #"S+" combined will match all non-whitespace characters (the end of the mention)
    #Will replace pattern with an empty string, ''
    #The string is the text used as the paramater for this function.
    return re.sub(r"@\S+", '', text)

#remove url links
def remove_urls(text):
    #The pattern we are looking for is any string that begins with http
    return re.sub(r"http\S+", '', text)

#remove non letters
def remove_nonletters(text):
    #The pattern we are looking for is any character this is not a letter
    #"^" matches any character that is not in the character class of what follows it,in this case, "a-zA-Z"
    return re.sub("[^a-zA-Z]", " ", text)

#cleans the tweet
def clean_tweet(text):
    decoded = html_decoding(text)
    no_mentions = remove_mentions(decoded)
    no_urls = remove_urls(no_mentions)
    only_letters = remove_nonletters(no_urls)
    no_extra_spaces = ' '.join(only_letters.split())
    return no_extra_spaces
    


In [11]:
#Runs sentiment analysis for each tweet
#Categorizes by polarity
def sentiment_analysis(text):
    analysis = TextBlob(clean_tweet(text))
    if analysis.sentiment.polarity > 0:
        return 1
    elif analysis.sentiment.polarity == 0:
        return 0
    else:
        return -1

In [12]:
#Add polarity column
news_df['SA'] = np.array([sentiment_analysis(text) for text in news_df['Text']])

In [13]:
display(news_df.head(10))

Unnamed: 0,ID_String,Text,News_Org,SA
0,1071924021799866370,@TessatTys @davidaxelrod @ggreenwald @RahmEman...,CNN,1
1,1071924017911791624,@realDonaldTrump @TheRickWilson @CNN @TheRickW...,CNN,1
2,1071924010211000320,@DeeBlog @Dennis2Clive @CNN Not so- we like hi...,CNN,1
3,1071924009388797953,@BoSnerdley @CNNPolitics There is a wealth of ...,CNN,0
4,1071924007224688640,@AppleWithAFace @paperbackwrit3r @HuffPost @Fo...,FoxNews,1
5,1071924003042930690,@DIVINE_VISUALS @CNN It's all right. Everybody...,CNN,1
6,1071923989256245248,@CNN There is no fact in this statement. It is...,CNN,-1
7,1071923988732018689,@AlsoUnicat @CNN 2. Priebus and Kelly. But Tru...,CNN,0
8,1071923985137360898,@AndrewCMcCarthy @FoxNews @POTUS @foxandfriend...,FoxNews,0
9,1071923972017668097,@dennismm59 @HeatherChilders @TomFitton @Judic...,FoxNews,0


In [14]:
#Replace NaNs with an empty string
news_df.fillna("", inplace=True)
display(news_df.head(10))

Unnamed: 0,ID_String,Text,News_Org,SA
0,1071924021799866370,@TessatTys @davidaxelrod @ggreenwald @RahmEman...,CNN,1
1,1071924017911791624,@realDonaldTrump @TheRickWilson @CNN @TheRickW...,CNN,1
2,1071924010211000320,@DeeBlog @Dennis2Clive @CNN Not so- we like hi...,CNN,1
3,1071924009388797953,@BoSnerdley @CNNPolitics There is a wealth of ...,CNN,0
4,1071924007224688640,@AppleWithAFace @paperbackwrit3r @HuffPost @Fo...,FoxNews,1
5,1071924003042930690,@DIVINE_VISUALS @CNN It's all right. Everybody...,CNN,1
6,1071923989256245248,@CNN There is no fact in this statement. It is...,CNN,-1
7,1071923988732018689,@AlsoUnicat @CNN 2. Priebus and Kelly. But Tru...,CNN,0
8,1071923985137360898,@AndrewCMcCarthy @FoxNews @POTUS @foxandfriend...,FoxNews,0
9,1071923972017668097,@dennismm59 @HeatherChilders @TomFitton @Judic...,FoxNews,0


In [15]:
# Save the data frame to a .csv file
# Specifying index=False prevents us from writing a column of row numbers

news_df.to_csv("FoxNews_CNN_clean.csv", index=False)