Sentiment Analysis - Twitter

In [None]:
import tweepy
import json
import pandas as pd
import csv
import re

In [None]:
# Execute the local credentials script inside this kernel's namespace.
# The authentication cell reads con_key, con_secret, acc_token and acc_secret,
# which are not defined anywhere in this notebook - presumably this script sets them (TODO confirm).
%run ~/twitter_credentials.py

In [None]:
# Build the OAuth handler from the consumer key/secret, then attach the access token/secret.
# con_key, con_secret, acc_token and acc_secret are not defined in this notebook;
# presumably they come from the twitter_credentials.py script run in the previous cell.
auth = tweepy.OAuthHandler(consumer_key=con_key, consumer_secret=con_secret)
auth.set_access_token(acc_token, acc_secret)
# Create the API client from the authenticated handler; the search cell below calls api.search on it.
api = tweepy.API(auth)

In [None]:
#Loop through to gather around 2k tweets and filter out retweets and replies
# Pages backwards through the search results, up to 100 tweets per request, until
# at least num_needed tweets are collected, a page comes back empty, or the API errors.
# Whole pages are appended, so the final count can exceed num_needed.
# NOTE(review): api.search and tweepy.TweepError are the tweepy 3.x names; tweepy 4
# renamed them to api.search_tweets and tweepy.TweepyException.
num_needed = 2000
tweet_list = []
last_id = -1 # id of the oldest tweet seen so far; -1 means no page has been fetched yet
while len(tweet_list) < num_needed:
    search_params = {
        'q': 'climate change AND -filter:retweets AND -filter:replies',
        'count': 100,
        'tweet_mode': 'extended',
        'lang': 'en',
    }
    # Only add max_id once a tweet has been seen, so the first request is
    # unconstrained instead of carrying a meaningless negative id.
    if last_id > 0:
        search_params['max_id'] = str(last_id - 1)
    try:
        new_tweets = api.search(**search_params)
    except tweepy.TweepError as e:
        print("Error", e)
        break
    else:
        if not new_tweets:
            print("Could not find any more tweets!")
            break
        tweet_list.extend(new_tweets)
        # Use the smallest id on the page rather than the last element, so the
        # cursor stays correct even if the page is not sorted by descending id.
        last_id = min(tweet.id for tweet in new_tweets)

In [None]:
# Number of tweets actually collected; the loop above appends whole result pages, so this can exceed num_needed
len(tweet_list) 

In [None]:
# Reduce every collected tweet to two fields - the author's screen name and the
# tweet's full_text - and load them into a two-column DataFrame.
t = [
    [tweet.user.screen_name, tweet.full_text]
    for tweet in tweet_list
]
tweet_text = pd.DataFrame(t, columns=["user", "text"])
# Save the raw (uncleaned) table to disk
tweet_text.to_csv("tweets.csv")

In [None]:
#Define a function to remove https links, special characters, punctuations, convert the text to lowercase
# I used regular expressions here
def fnc_remove_spec_characters(df,col):
    """Return a cleaned, lower-cased copy of every string in ``df[col]``.

    Each of the following is replaced by a single space:
      * @-mentions (letters, digits and underscores after the ``@``),
      * any character that is not an ASCII letter, digit, space or tab,
      * URLs of the form ``scheme://...`` up to the next whitespace.
    Runs of spaces produced by the replacements are not collapsed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column; it is not modified.
    col : str
        Name of the column to clean. Its values must be strings.

    Returns
    -------
    list of str
        One cleaned string per row, in row order.
    """
    # Raw string so the regex escapes reach the re module untouched; the
    # underscore in the mention class keeps handles such as @some_user from
    # leaving fragments behind.
    pattern = re.compile(r"(@[A-Za-z0-9_]+)|([^0-9A-Za-z \t])|(\w+://\S+)")
    # Iterate over the values themselves: looking rows up by integer label
    # fails whenever the frame does not carry a default 0..n-1 index.
    return [pattern.sub(" ", text).lower() for text in df[col]]
# After cleaning up, a new column named cleaned_text is added to the dataframe by calling the function.
# The column is added to tweet_text in place; the original "text" column is kept alongside it.
tweet_text['cleaned_text'] = fnc_remove_spec_characters(tweet_text,"text")
# Preview rows at positions 1 through 9 (note: the slice starts at 1, so the first row is not shown)
tweet_text[1:10]

In [None]:
# Keyword lists used to label tweets: phrases and hashtags (written without '#',
# in lowercase) associated with people who believe in climate change versus
# people who deny it.

search_words_believers = [
    'is real',
    'action',
    'urgent',
    'actonclimate',
    'climateemergency',
    'climateaction',
    'urges',
    'urge',
    'greennewdeal',
    'stop',
    'policy',
    'threat',
    'terrifying',
    'impacts',
    'climateactionnow',
    'earthday',
    'earthdayeveryday',
    'chaotic',
    'extinctionrebellion',
    'climatecatastrophe',
    'climatechangethefacts',
    'suffering',
]
search_words_nonbelievers = [
    'is not real',
    'hoax',
    'misinformation',
    'lies',
    'fake',
    'fakenews',
    'refuse',
    'suppressing',
    'misleading',
    'false',
    'climatehoax',
    'skeptic',
    'denied',
    'deny',
]

#Define a function to categorize tweets using the keywords above and create a new column named "sentiment" to be added to the dataframe
def fnc_identify_sentiment(df,col,believer_words=None,denier_words=None):
    """Label every string in ``df[col]`` as Believer, Denier or Not determinable.

    A text is labelled 'Believer' if it contains any believer keyword as a
    substring; otherwise 'Denier' if it contains any denier keyword; otherwise
    'Not determinable'. Believer keywords are checked first, so a text that
    matches both lists is labelled 'Believer'. Matching is plain substring
    containment, not whole-word matching.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column; it is not modified.
    col : str
        Name of the column to classify. Its values must be strings.
    believer_words : iterable of str, optional
        Keywords for the 'Believer' label. Defaults to the module-level
        ``search_words_believers``.
    denier_words : iterable of str, optional
        Keywords for the 'Denier' label. Defaults to the module-level
        ``search_words_nonbelievers``.

    Returns
    -------
    list of str
        One label per row, in row order.
    """
    # Fall back to the notebook-level keyword lists so existing two-argument
    # calls behave as before.
    if believer_words is None:
        believer_words = search_words_believers
    if denier_words is None:
        denier_words = search_words_nonbelievers

    sentiment_list = []
    # Iterate over the values themselves: looking rows up by integer label
    # fails whenever the frame does not carry a default 0..n-1 index.
    for text in df[col]:
        if any(word in text for word in believer_words):
            sentiment_list.append('Believer')
        elif any(word in text for word in denier_words):
            sentiment_list.append('Denier')
        else:
            sentiment_list.append('Not determinable')
    return sentiment_list
# Label every cleaned tweet and store the label in a new "sentiment" column
tweet_text['sentiment'] = fnc_identify_sentiment(tweet_text, 'cleaned_text')

# Independent copy holding only the columns needed for the deliverables:
# user name, cleaned text and sentiment label
final_tweet_df = tweet_text.loc[:, ['user', 'cleaned_text', 'sentiment']].copy()

# Full cleaned data set, to be imported into R for further analysis
final_tweet_df.to_csv("finalcleaned.csv")

# First 100 rows only - the subset file to be submitted for the assignment
final_tweet_df.head(100).to_csv("100tweetsfile.csv")