In [1]:
'''Notebook to demonstrate basic feature extraction steps'''

'Notebook to demonstrate basic feature extraction steps'

In [4]:
# We saw during EDA how to find the top words related to sentiment
# But how can we turn these into features that can be used to train
# a model? 

In [12]:
import pandas as pd

# Read in data. Use latin-1 encoding (single-byte encoding scheme).
# utf-8 is multi-byte encoding scheme.
df = pd.read_csv('sample_tweets.csv', encoding='latin-1')

In [17]:
# Some basic cleaning: lower, remove stopwords, remove hashtags, mentions, urls

## TODO: don't remove hashtags
## TODO: maybe look at hashtags in EDA

# Lower and remove stopwords
from stop_words import get_stop_words
stop_words = set(get_stop_words('en'))
df.text = df.text.str.lower().str.split().apply(lambda words: ' '.join([w for w in words if w not in stop_words]))

In [21]:
df.text.head(10)

0    @switchfoot http://twitpic.com/2y1zl - awww, b...
1    upset update facebook texting it... might cry ...
2    @kenichan dived many times ball. managed save ...
3                     whole body feels itchy like fire
4    @nationwideclass no, behaving all. mad. here? ...
5                                 @kwesidei whole crew
6                                             need hug
7    @loltrish hey long time see! yes.. rains bit ,...
8                                      @tatiana_k nope
9                                @twittera que muera ?
Name: text, dtype: object

In [23]:
# Remove hashtags, mentions, urls
## TODO don't remove hashtags
mention = r'^(.*\s)?@\w+'
hashtag = r'^(.*\s)?#\w+'
url = r'^((http[s]?|ftp):\/)?\/?([^:\/\s]+)((\/\w+)*\/)([\w\-\.]+[^#?\s]+)(.*)?(#[\w\-]+)?$'
df.text = df.text.str.replace(mention, '').str.replace(hashtag, '').str.replace(url, '')

In [25]:
# TODO: url not working?
df.text.head(10)

0     http://twitpic.com/2y1zl - awww, bummer. shou...
1    upset update facebook texting it... might cry ...
2     dived many times ball. managed save 50% rest ...
3                     whole body feels itchy like fire
4              no, behaving all. mad. here? see there.
5                                           whole crew
6                                             need hug
7     hey long time see! yes.. rains bit ,only bit ...
8                                                 nope
9                                          que muera ?
Name: text, dtype: object

In [28]:
# We obviously want to use words as features. But what might be
# more information is using SEQUENCE of words. These are called
# n-grams. For example:

s = 'I have studied many philosophers and many cats. The wisdom of cats is infinitely superior.'

# Show complete list in presentation
unigram = ['studied', 'many', 'philosophers', 'and', 'many', 'cats']
bigrams = ['many philosophers', 'many cats', 'wisdom of', 'infinitely superior']
trigrams = ['I have studied', 'studied many philosophers', 'wisdom of cats', 'is infinitely superior']

# As n increases, the more informative the string of words becomes
# But as n increases, the less common the sequence becomes
# We want to find n > 1 that still captures information

In [29]:
# Use scikit-learn's CountVectorizer to find n-gram counts
from sklearn.feature_extraction.text import CountVectorizer

In [31]:
# Initialize word vectorizer: we'll find n-grams with n == 1, 2, 3 
word_vectorizer = CountVectorizer(ngram_range=(1, 3), analyzer='word')

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [None]:
## Illustrate vocabulary and so on
word_vectorizer.fit_transform([s]).vocabulary_

In [35]:
# We don't have to do some of the preprocessing from earlier - can use options here
# Lowercased by default. Can also pass in a preprocessor to address our domain-specific
# cleaning.
def preprocess(word):
    return word.replace(mention, '').replace(url, '')
    
word_vectorizer = CountVectorizer(ngram_range=(1, 3), analyzer='word', preprocessor=preprocess, stop_words=stop_words)
word_vectorizer

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3),
        preprocessor=<function preprocess at 0x104516f28>,
        stop_words={'all', 'until', "we'd", "we've", 'the', 'above', 'be', 'most', "when's", "where's", 'yourselves', 'very', 'when', 'its', "hadn't", 'for', 'have', 'on', 'was', 'each', 'would', "shan't", "there's", "didn't", "won't", 'itself', 'he', 'own', 'but', 'i', 'am', 'too', 'an', 'her', 'those', "w...th', "shouldn't", "let's", "weren't", "can't", 'so', 'or', 'myself', 'same', "haven't", 'is', 'any'},
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [40]:
# Isolate text and target
target = df.polarity
tweets = df.text

In [60]:
# Create word by document matrix
word_doc_matrix = word_vectorizer.fit_transform(tweets)

# rows: documents (tweets)
# columns: words -- all words across ALL tweets
# entry i, j: number of times word(j) occurs in document(i)
word_doc_matrix

<9999x116252 sparse matrix of type '<class 'numpy.int64'>'
	with 181840 stored elements in Compressed Sparse Row format>

In [51]:
# View the learned vocabulary of n-grams
word_vectorizer.vocabulary_

{'renting cheap': 82183,
 'well tired': 110232,
 'watched tv time': 108790,
 'melbourne': 62936,
 'tomorrow drinking coffee': 101441,
 'new student short': 68476,
 'exams killed': 29774,
 'nights rest': 69358,
 'lipstick': 56990,
 'bio hello kills': 11150,
 'birthday finished': 11239,
 'gonna start twittering': 39491,
 'now 199': 69953,
 'girl options none': 37454,
 'cant commit': 15556,
 'dead now': 23453,
 'bad spills tuesday': 9063,
 'probably cranky': 78117,
 'revisions feel': 82726,
 'today ready weekend': 101052,
 'nah people': 67009,
 'eating ana candy': 27271,
 'javelins though': 50437,
 'missing bff': 64463,
 '40 mins blew': 1362,
 'wolmyeong getting ready': 112167,
 'right thing': 83043,
 'quot came': 79383,
 'employ': 28012,
 'prayer': 77604,
 'tak faham': 95942,
 'better rec': 10802,
 'use different': 106045,
 'wonderful conversation thnx': 112258,
 'gosh made': 40351,
 'crying inside please': 21498,
 'better normal mode': 10776,
 'feel sick much': 31591,
 'freaking right n

In [52]:
# View a word frequency across the entire corpus
word_vectorizer.vocabulary_.get('pancakes')

73937

In [57]:
word_vectorizer.vocabulary_.get('feel sad')

31584

In [61]:
# What's the problem with using plain word counts?
# Let's say we have a tweet with unknown sentiment,
# and we want to classify it. If we just use overall
# word frequency, our model is going to weight words
# that are common OVERALL as being more important. But
# what if a word is important within the context of a 
# certain document? Or what if a word appears a lot in
# document A (high document frequency) but doesn't
# appear a lot overall (low term frequency)? We might
# end up misclassifying documents that are very similar
# to document A. To address this, we want to weight a
# word by how important it is to an individual document,
# and offset by how important it is overall. So words
# that are very frequent across the entire corpus have
# lower importance, which makes sense since they're less
# informative. 

# TF(t) = term frequency = How frequently does term occur in document?
#       = (number of times term t appears in a tweet) / (total number of terms in the tweet)
# IDF(t) = inverse document frequency = how important is the term overall?
#        = log(total number of tweets / number of tweets with term t in it).
# TF-IDF(t) = TF(t) * IDF(t)

# We use the word term instead of word because we might have n-grams.

## TODO: in lecture, talk about how this is used in search engines

In [70]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer(use_idf=True).fit(word_doc_matrix)
word_doc_tfidf_matrix = tfidf_transformer.fit_transform(word_doc_matrix)

In [71]:
# This is our set of features
# We will use these for training
word_doc_tfidf_matrix

<9999x116252 sparse matrix of type '<class 'numpy.float64'>'
	with 181840 stored elements in Compressed Sparse Row format>

In [72]:
tfidf_transformer.idf_

array([ 7.90775528,  9.51719319,  9.51719319, ...,  9.51719319,
        9.51719319,  9.51719319])

In [None]:
# DONE WITH FEATURE EXTRACTION!

# TODO: What did we get out of this? A set of features