In [16]:
from sklearn.base import TransformerMixin
from nltk.tokenize import word_tokenize 

class NLTKBOW(TransformerMixin):
    """Bag-of-words transformer built on NLTK's ``word_tokenize``.

    Each input document is turned into a ``{token: True}`` dictionary that
    records which tokens occur in it (presence only, no counts).
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self``; this transformer has no state to learn.

        Parameters
        ----------
        X : iterable of str
            Ignored.
        y : object, optional
            Ignored.

        Returns
        -------
        NLTKBOW
            This instance, unchanged.
        """
        return self

    def transform(self, X):
        """Tokenize every document and report which tokens it contains.

        Parameters
        ----------
        X : iterable of str
            Raw documents (e.g. tweet texts).

        Returns
        -------
        list of dict
            One dictionary per document, in input order: the first dictionary
            describes the first document, and so on.
            Key: a token produced by ``word_tokenize``.
            Value: always ``True`` (tokens that do not occur are simply absent).
        """
        return [{word: True for word in word_tokenize(document)} for document in X]

In [17]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import BernoulliNB

In [18]:
import os

# Both input files sit side by side in ~/Data/research.
research_dir = os.path.join(os.path.expanduser("~"), "Data", "research")
input_filename = os.path.join(research_dir, "python_tweets.json")
labels_filename = os.path.join(research_dir, "python_classes.json")

In [19]:
# We're interested in the tweets themselves (not the IDs)
import json

tweets = []
# The file holds one JSON document per line. Decode it explicitly as UTF-8
# rather than relying on the platform's default encoding, which may not be
# able to read non-ASCII tweet text.
with open(input_filename, encoding="utf-8") as inf:
    for line in inf:
        # Skip blank (whitespace-only) lines; they carry no record.
        if not line.strip():
            continue
        # Keep only the tweet's text field.
        tweets.append(json.loads(line)['text'])

In [20]:
# The whole file is a single JSON document. Decode it explicitly as UTF-8
# instead of depending on the platform's default encoding.
with open(labels_filename, encoding="utf-8") as inf:
    labels = json.load(inf)

In [21]:
""" 
Create a pipeline that has the three components 
1. The NLTKBOW transformer that has been created
2. A DictVectorizer transformer
3. A BernoulliNB classifier
"""

from sklearn.pipeline import Pipeline

# Steps run in order: tokenize to {word: True} dicts, turn the dicts into a
# feature matrix, then classify with Bernoulli naive Bayes.
pipeline_steps = [
    ('bag-of-words', NLTKBOW()),
    ('vectorizer', DictVectorizer()),
    ('naive-bayes', BernoulliNB()),
]
pipeline = Pipeline(pipeline_steps)

In [22]:
import numpy as np

# ``sklearn.cross_validation`` was deprecated in scikit-learn 0.18 and removed
# in 0.20; ``sklearn.model_selection`` is its replacement. The legacy import is
# kept only as a fallback for very old installations.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# Cross-validate the whole pipeline on the raw tweets and report the mean F1
# score across folds.
scores = cross_val_score(pipeline, tweets, labels, scoring='f1')
print("Score: {:.3f}".format(np.mean(scores)))

Score: 0.950


In [23]:
""" 
Score: 0.642 -> 0.826
Tweets: 100
Comments: a lot of non-English tweets that could not be accurately classified
"""

' \nScore: 0.642 -> 0.826\nTweets: 100\nComments: a lot of non-English tweets that could not be accurately classified\n'

In [24]:
# Which features matter most when deciding whether a tweet is relevant?

# Fit the pipeline on all tweets and labels (no held-out split here).
# NOTE(review): `model` is whatever `pipeline.fit` returns -- presumably the same, now fitted, Pipeline object; confirm.

model = pipeline.fit(tweets, labels)

In [25]:
# Pull the classifier step out of the fitted pipeline by its step name.
nb = model.named_steps['naive-bayes']

In [26]:
# Per-class log probabilities of each feature, as learned by the classifier.
feature_probabilities = nb.feature_log_prob_
# Negating before sorting gives descending order; keep the indices of the 50
# highest-probability features in row 1 (the second class).
top_features = (-feature_probabilities[1]).argsort()[:50]

In [27]:
# Map the feature indices (from the previous step) back to their names.
# The vectorizer step of the fitted pipeline holds that index -> name lookup.

dv = model.named_steps['vectorizer']

In [28]:
# Print rank, feature name and probability for each top feature. np.exp turns
# the stored log probability back into a plain probability.
second_class_log_probs = feature_probabilities[1]
for i, feature_index in enumerate(top_features):
    feature_name = dv.feature_names_[feature_index]
    probability = np.exp(second_class_log_probs[feature_index])
    print(i, feature_name, probability)

0 : 0.820143884892
1 http 0.73381294964
2 Python 0.661870503597
3 # 0.402877697842
4 and 0.31654676259
5 with 0.309352517986
6 the 0.294964028777
7 , 0.273381294964
8 a 0.26618705036
9 to 0.258992805755
10 @ 0.244604316547
11 for 0.237410071942
12 https 0.215827338129
13 . 0.201438848921
14 python 0.201438848921
15 in 0.165467625899
16 is 0.158273381295
17 ... 0.143884892086
18 I 0.136690647482
19 How 0.129496402878
20 Automate 0.122302158273
21 Boring 0.122302158273
22 Stuff 0.122302158273
23 - 0.122302158273
24 you 0.115107913669
25 Scapy 0.107913669065
26 Build 0.107913669065
27 Stealth 0.107913669065
28 Port 0.107913669065
29 're 0.107913669065
30 As 0.107913669065
31 we 0.107913669065
32 ? 0.107913669065
33 Scanner 0.107913669065
34 The 0.0791366906475
35 ! 0.0791366906475
36 from 0.0791366906475
37 ( 0.0719424460432
38 ) 0.0719424460432
39 via 0.0647482014388
40 can 0.0647482014388
41 Learn 0.0647482014388
42 framework 0.0575539568345
43 Data 0.0575539568345
44 it 0.0575539568345

In [29]:
# ``sklearn.externals.joblib`` was deprecated in scikit-learn 0.21 and removed
# in 0.23. Prefer the standalone ``joblib`` package (a scikit-learn dependency
# since then) and fall back to the vendored copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Destination for the persisted model; the bare expression below displays it.
output_filename = os.path.join(os.path.expanduser("~"), "Models", "twitter", "python_context.pkl")
output_filename

'/Users/lpan/Models/twitter/python_context.pkl'

In [30]:
# joblib.dump does not create missing parent directories, so make sure the
# target directory exists before writing.
os.makedirs(os.path.dirname(output_filename), exist_ok=True)
# NOTE: pickling stores custom classes by reference, so the NLTKBOW class must
# be defined or importable in any process that later loads this file.
joblib.dump(model, output_filename)

['/Users/lpan/Models/twitter/python_context.pkl',
 '/Users/lpan/Models/twitter/python_context.pkl_01.npy',
 '/Users/lpan/Models/twitter/python_context.pkl_02.npy',
 '/Users/lpan/Models/twitter/python_context.pkl_03.npy',
 '/Users/lpan/Models/twitter/python_context.pkl_04.npy',
 '/Users/lpan/Models/twitter/python_context.pkl_05.npy']