In [None]:
import pandas as pd
import numpy as np


# Get a pandas DataFrame object of all the data in the csv file:
df = pd.read_csv('tweets.csv')

# Get pandas Series object of the "tweet text" column:
text = df['tweet_text']

# Get pandas Series object of the "emotion" column:
target = df['is_there_an_emotion_directed_at_a_brand_or_product']

# The rows of  the "emotion" column have one of three strings:
# 'Positive emotion'
# 'Negative emotion'
# 'No emotion toward brand or product'

# Remove the blank rows from the series:
target = target[pd.notnull(text)]
text = text[pd.notnull(text)]

# Perform feature extraction:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
count_vect.fit(text)
counts = count_vect.transform(text)

In [17]:
# BAD MISTAKE: DO NOT USE YOUR TRAINING DATA AS TESTING DATA!!!
#   We are still gettingonly close to 80% accuracy even though we are predicting using training data.
#   the reason we are not getting 100% is because the machine is trying to generalize

# Train with this data with a Naive Bayes classifier:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(counts, target)

# See what the classifier predicts for some new tweets:
predictions = nb.predict(counts)

correct_predictions = sum(predictions == target)
incorrect_predictions = 9092 - correct_predictions  # (there are 9,092 tweets in the csv)
print('# of correct predictions: ' + str(correct_predictions))
print('# of incorrect predictions: ' + str(incorrect_predictions))
print('Percent correct: ' + str(100.0 * correct_predictions / (correct_predictions + incorrect_predictions)))

# of correct predictions: 7229
# of incorrect predictions: 1863
Percent correct: 79.5094588649


In [18]:
# Train with this data with a Naive Bayes classifier:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(counts[0:6000], target[0:6000])

# See what the classifier predicts for some new tweets:
predictions = nb.predict(counts[6000:9092])
correct_predictions = sum(predictions == target[6000:9092])
incorrect_predictions = (9092 - 6000) - correct_predictions

print('# of correct predictions: ' + str(correct_predictions))
print('# of incorrect predictions: ' + str(incorrect_predictions))
print('Percent correct: ' + str(100.0 * correct_predictions / (correct_predictions + incorrect_predictions)))

# of correct predictions: 2053
# of incorrect predictions: 1039
Percent correct: 66.3971539457


In [19]:
# Using another classifier type
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier()
clf.fit(counts, target)


# See what the classifier predicts for some new tweets:
predictions = clf.predict(counts[6000:9092])

correct_predictions = sum(predictions == target[6000:9092])
incorrect_predictions = (9092 - 6000) - correct_predictions
print('# of correct predictions: ' + str(correct_predictions))
print('# of incorrect predictions: ' + str(incorrect_predictions))
print('Percent correct: ' + str(100.0 * correct_predictions / (correct_predictions + incorrect_predictions)))

# of correct predictions: 2411
# of incorrect predictions: 681
Percent correct: 77.9754204398


In [7]:
# Just simply guessing
from sklearn.dummy import DummyClassifier
nb = DummyClassifier(strategy='most_frequent')

# (Tweets 0 to 5999 are used for training data)
nb.fit(counts[0:6000], target[0:6000])

# See what the classifier predicts for some new tweets:
# (Tweets 6000 to 9091 are used for testing)
predictions = nb.predict(counts[6000:9092])
correct_predictions = sum(predictions == target[6000:9092])
incorrect_predictions = (9092 - 6000) - correct_predictions
print('# of correct predictions: ' + str(correct_predictions))
print('# of incorrect predictions: ' + str(incorrect_predictions))
print('Percent correct: ' + str(100.0 * correct_predictions / (correct_predictions + incorrect_predictions)))



# of correct predictions: 1890
# of incorrect predictions: 1202
Percent correct: 61.1254851229
