In [None]:
'''Notebook to demonstrate basic feature extraction steps'''

In [1]:
import pandas as pd

df = pd.read_csv('sample_tweets.csv', encoding='latin-1')

In [2]:
# Remove mentions: might constitute noisy data
# since mentions are relatively unique
mention = r'^(.*\s)?@\w+'
df.text = df.text.str.replace(mention, '')

In [4]:
df.text.head()

0     http://twitpic.com/2y1zl - Awww, that's a bum...
1    is upset that he can't update his Facebook by ...
2     I dived many times for the ball. Managed to s...
3      my whole body feels itchy and like its on fire 
4     no, it's not behaving at all. i'm mad. why am...
Name: text, dtype: object

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
CountVectorizer()

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [12]:
word_vectorizer = CountVectorizer(ngram_range=(1, 3))
s = 'We are really cool and awesome and boys suck'
word_vectorizer.fit_transform([s])
word_vectorizer.vocabulary_

{'and': 0,
 'and awesome': 1,
 'and awesome and': 2,
 'and boys': 3,
 'and boys suck': 4,
 'are': 5,
 'are really': 6,
 'are really cool': 7,
 'awesome': 8,
 'awesome and': 9,
 'awesome and boys': 10,
 'boys': 11,
 'boys suck': 12,
 'cool': 13,
 'cool and': 14,
 'cool and awesome': 15,
 'really': 16,
 'really cool': 17,
 'really cool and': 18,
 'suck': 19,
 'we': 20,
 'we are': 21,
 'we are really': 22}

In [13]:
from stop_words import get_stop_words
stop_words = get_stop_words('en')

In [15]:
stop_words.remove('not')

In [16]:
word_vectorizer = CountVectorizer(ngram_range=(1,3), stop_words=stop_words)

In [18]:
target = df.sentiment
tweets = df.text

In [20]:
word_doc_matrix = word_vectorizer.fit_transform(tweets)

In [21]:
word_doc_matrix

<9999x122701 sparse matrix of type '<class 'numpy.int64'>'
	with 194060 stored elements in Compressed Sparse Row format>

In [22]:
word_vectorizer.vocabulary_

{'thanks enjoying time': 103285,
 'iphone 3bar gprs': 52549,
 'gear wana': 38313,
 'stack pointer': 97840,
 'love pictures': 62828,
 'sklov': 94551,
 'thinking earlier': 104445,
 'annoying weights 20': 5734,
 'finish tomorrow yesssss': 34899,
 'just pure junk': 54886,
 'dragons': 28040,
 'flex sample code': 35434,
 'topshop bc': 107729,
 'pospect 84': 81969,
 'trash can': 108375,
 'seriously good best': 92147,
 'album september': 3996,
 'lbs amp': 57900,
 'spamming people tickets': 97183,
 'regret not mercy': 87119,
 'monster loca mocha': 68571,
 'gee insomnia sucks': 38329,
 'night throwing': 72952,
 'bloggers now': 12469,
 'morning tired lol': 69119,
 '70000': 1888,
 'tweetdeck http': 109642,
 'encouraging': 30116,
 'editing software timelines': 29471,
 'wouldnt go far': 120271,
 'breakfast banana': 13718,
 'last half hour': 57259,
 'night japan': 72827,
 'redo sleeping patterns': 86971,
 'lollypop': 61474,
 'blogs don forget': 12483,
 'wnt able': 118400,
 'near future none': 70955,


In [25]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer(use_idf=True)
word_doc_tfidf_matrix = tfidf_transformer.fit_transform(word_doc_matrix)

In [26]:
word_doc_tfidf_matrix

<9999x122701 sparse matrix of type '<class 'numpy.float64'>'
	with 194060 stored elements in Compressed Sparse Row format>

In [48]:
from sklearn.naive_bayes import GaussianNB

subset = df.sample(n=1000, random_state=1234)

word_vectorizer = CountVectorizer(ngram_range=(1,3), stop_words=stop_words)
tfidf_transformer = TfidfTransformer(use_idf=True)

word_doc_matrix = word_vectorizer.fit_transform(subset.text)
features = tfidf_transformer.fit_transform(word_doc_matrix)

model = GaussianNB()

# Train the model 
classifier = model.fit(features.toarray(), subset.sentiment)

In [49]:
def predict_sentiment(new_tweets):
    counts = word_vectorizer.transform(new_tweets)
    tfidfs = tfidf_transformer.transform(counts)
    predictions = classifier.predict(tfidfs.toarray())
    return predictions

In [50]:
predict_sentiment(['i hate life'])

array([0])

In [51]:
predict_sentiment(['i am happy!!!'])

array([4])

In [52]:
predict_sentiment(['i like cats', 'i like tea', 'the weather is terrible', 'i do not want to go to work', 'life is wonderful'])

array([4, 4, 0, 0, 4])