In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.classify import NaiveBayesClassifier
import csv
import re
import os

Data Analysis

To analyze our data, we will be looking at the csv files containing the tweets from each state before and after the election and performing sentiment analysis on them. We will go about this process by:
1. Scraping the tweet text from the csv files
2. Cleaning tweets to remove stopwords and other unneccesary parts
3. Training a Naive Bayes classifier to classify tweets as positive or negative
4. Running the classifier on our cleaned tweets to determine average sentiment
We scrape the tweets from the csv file using scrape_tweets.

In [3]:
#scrape tweets from csv file
def scrape_tweets(filename):
    tweets = []
    with open(filename) as f:
        reader = csv.reader(f)
        next(reader) #skip header
        data = [r for r in reader]
        for item in data:
            tweets.append(item[3])
    return tweets   

Next, we want to clean our scraped tweets. We do this using clean_tweets where we remove stopwords and other non-essential parts of the tweet.

In [4]:
#clean scraped tweets
def clean_tweets(tweets):
    cleaned_tweets = []
    stop_words = set(stopwords.words('english'))
    for tweet in tweets:
        cleaned_tweet = re.sub("[^a-zA-Z]", " ", tweet).lower()
        for w in tweet.split(" "):
            if w not in stop_words:
                cleaned_tweet += w + " "
        cleaned_tweet = cleaned_tweet.strip()
        cleaned_tweets += [cleaned_tweet]
    return cleaned_tweets

Now we want to train a naive bayes classifier to tag a tweet as positive or negative. We use NLTK's NaiveBayesClassifier to do so in train_naive_bayes using training data from pos_tweet.txt and neg_tweets.txt. 

In [5]:
#train naive bayes classifier
#some of this code has been adapted from https://www.twilio.com/blog/2017/09/sentiment-analysis-python-messy-data-nltk.html
def format_sentence(s):
    return({word: True for word in nltk.word_tokenize(s)})

def train_naive_bayes():
    positive = []
    with open("./pos_tweets.txt") as f:
        for i in f: 
            positive.append([format_sentence(i), 'positive'])
    negative = []
    with open("./neg_tweets.txt") as f:
        for i in f: 
            negative.append([format_sentence(i), 'negative'])  
    training = positive + negative
    classifier = NaiveBayesClassifier.train(training)
    return classifier

Now that we have a classifer from the previous step, we can evaluate our cleaned tweets. We determine average sentiment using evaluate_sentiment where we calculate the ratio of positive tweets to total tweets.

In [6]:
#determine average sentiment accross tweets
def evaluate_sentiment(tweets, classifier):
    total_tweets = len(tweets)
    positive_tweets = 0
    for tweet in tweets:
        sentiment = classifier.classify(format_sentence(tweet))
        if sentiment == "positive":
            positive_tweets += 1
    average_happiness = positive_tweets/total_tweets
    return average_happiness

Since we have all of the individual steps hammered down now, we can put them together to perform sentiment analysis on our 100 csv files. We do this using perform_sentiment_analysis. This returns a dictionary with the average sentiment of each state before and after the election.

In [9]:
#determine average sentiment accross all states
def perform_sentiment_analysis(csv_files):
    results = dict()
    classifier = train_naive_bayes() #only need to train once
    for file in csv_files:
        tweets = scrape_tweets(file)
        cleaned_tweets = clean_tweets(tweets)
        average_happiness = evaluate_sentiment(cleaned_tweets, classifier)
        key = file.replace(".csv", "")
        results[key] = average_happiness
    return results

In [14]:
csv_files = []
for root, dirs, files in os.walk('data'):
    for file in files:
        if file.endswith('.csv'):
            csv_files.append(file)
print(csv_files)

['ak_after.csv', 'ak_before.csv', 'al_after.csv', 'al_before.csv', 'ar_after.csv', 'ar_before.csv', 'az_after.csv', 'az_before.csv', 'ca_after.csv', 'ca_before.csv', 'co_after.csv', 'co_before.csv', 'ct_after.csv', 'ct_before.csv', 'de_after.csv', 'de_before.csv', 'fl_after.csv', 'fl_before.csv', 'ga_after.csv', 'ga_before.csv', 'hi_after.csv', 'hi_before.csv', 'ia_after.csv', 'ia_before.csv', 'id_after.csv', 'id_before.csv', 'il_after.csv', 'il_before.csv', 'in_after.csv', 'in_before.csv', 'ks_after.csv', 'ks_before.csv', 'ky_after.csv', 'ky_before.csv', 'la_after.csv', 'la_before.csv', 'ma_after.csv', 'ma_before.csv', 'md_after.csv', 'md_before.csv', 'me_after.csv', 'me_before.csv', 'mi_after.csv', 'mi_before.csv', 'mn_after.csv', 'mn_before.csv', 'mo_after.csv', 'mo_before.csv', 'ms_after.csv', 'ms_before.csv', 'mt_after.csv', 'mt_before.csv', 'nc_after.csv', 'nc_before.csv', 'nd_after.csv', 'nd_before.csv', 'ne_after.csv', 'ne_before.csv', 'nh_after.csv', 'nh_before.csv', 'nj_after

In [15]:
print(perform_sentiment_analysis(csv_files))

FileNotFoundError: [Errno 2] No such file or directory: 'ak_after.csv'