In [146]:
# Labelling the class values for the twitter dataset.
import os
input_filename = os.path.join(os.path.expanduser("~"), "Data", "twitter", "python_tweets.json")
classes_filename = os.path.join(os.path.expanduser("~"), "Data", "twitter", "python_classes.json")
# probabilities_filename = os.path.join(os.path.expanduser("~"), "Data", "twitter", "replicable_dataset.json")
# print(probabilities_filename)
# with open(probabilities_filename) as inf:
#     feature_probabilities = json.load(inf)
# feature_probabilities = np.array(feature_probabilities,dtype=int)
# feature_probabilities[:,0]

In [147]:
import json
tweets = []
with open(input_filename) as inf:
    for line in inf:
        if len(line.strip()) == 0:
            continue
        tweets.append(json.loads(line)['text'])
print("Loaded {} tweets".format(len(tweets)))

Loaded 98 tweets


In [148]:
with open(classes_filename) as inf:
    labels = json.load(inf)

In [149]:
n_samples = min(len(tweets), len(labels))
n_samples

98

In [150]:
sample_tweets = [t.lower() for t in tweets[:n_samples]]
print(tweets[2:3])
labels = labels[:n_samples]
print(labels)

['研究にはfortranなんだけど、トレンド的にはpythonかなぁ。これから数年でマスターできるのか']
[0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]


In [151]:
import numpy as np
y_true = np.array(labels)

In [152]:
print("{:.1f}% have class 1".format(np.mean(y_true == 1) * 100))

52.0% have class 1


In [153]:
from sklearn.base import TransformerMixin
from nltk import word_tokenize

class NLTKBOW(TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [{word: True for word in word_tokenize(document)}
                 for document in X]

print([{word: True for word in word_tokenize(document)}
                 for document in '研究にはfortranなんだけど、トレンド的にはpythonかなぁ。これから数年でマスターできるのか'])

[{'研': True}, {'究': True}, {'に': True}, {'は': True}, {'f': True}, {'o': True}, {'r': True}, {'t': True}, {'r': True}, {'a': True}, {'n': True}, {'な': True}, {'ん': True}, {'だ': True}, {'け': True}, {'ど': True}, {'、': True}, {'ト': True}, {'レ': True}, {'ン': True}, {'ド': True}, {'的': True}, {'に': True}, {'は': True}, {'p': True}, {'y': True}, {'t': True}, {'h': True}, {'o': True}, {'n': True}, {'か': True}, {'な': True}, {'ぁ': True}, {'。': True}, {'こ': True}, {'れ': True}, {'か': True}, {'ら': True}, {'数': True}, {'年': True}, {'で': True}, {'マ': True}, {'ス': True}, {'タ': True}, {'ー': True}, {'で': True}, {'き': True}, {'る': True}, {'の': True}, {'か': True}]


In [154]:
from sklearn.feature_extraction import DictVectorizer

In [155]:
from sklearn.naive_bayes import BernoulliNB

In [156]:
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer


In [157]:
pipeline = Pipeline([('bag-of-words', NLTKBOW()),
                     ('vectorizer', DictVectorizer()),
                     ('naive-bayes', BernoulliNB())
                     ])
scores = cross_val_score(pipeline, sample_tweets, y_true, cv=10, scoring='f1')
print("Score: {:.3f}".format(np.mean(scores)))

Score: 0.556


In [158]:
scores

array([ 0.66666667,  0.61538462,  0.5       ,  0.71428571,  0.33333333,
        0.54545455,  0.57142857,  0.44444444,  0.5       ,  0.66666667])

In [159]:
model = pipeline.fit(tweets, labels)

In [45]:
nb = model.named_steps['naive-bayes']
print(nb.feature_log_prob_)

[[-3.8918203  -1.32687094 -3.19867312 ..., -3.8918203  -3.8918203
  -3.8918203 ]
 [-2.360854   -1.57239664 -3.27714473 ..., -3.27714473 -3.27714473
  -3.27714473]]


In [56]:
top_features = np.argsort(-nb.feature_log_prob_[1])[:50]
print(top_features)

[100 459 103 260 267 548  11   1 338   9 102 101 435 240 520   0  12  10
   8   7   3 613 198 524 466 660 438 673 606 287 403 346   4 337 204 530
 117 540 110 552 257 301 559  92  90 584 588 596 528 525]


In [66]:
dv = model.named_steps['vectorizer']
for i, feature_index in enumerate(top_features):
    print(i,dv.feature_names_[feature_index],np.exp(nb.feature_log_prob_[1][feature_index]))

0 : 0.679245283019
1 https 0.603773584906
2 @ 0.528301886792
3 Python 0.283018867925
4 RT 0.245283018868
5 python 0.22641509434
6 . 0.22641509434
7 # 0.207547169811
8 a 0.207547169811
9 , 0.169811320755
10 ? 0.132075471698
11 ; 0.11320754717
12 for 0.0943396226415
13 Monty 0.0943396226415
14 of 0.0943396226415
15 ! 0.0943396226415
16 ... 0.0943396226415
17 - 0.0943396226415
18 ) 0.0943396226415
19 ( 0.0943396226415
20 & 0.0943396226415
21 to 0.0943396226415
22 I 0.0943396226415
23 on 0.0754716981132
24 in 0.0754716981132
25 with 0.0754716981132
26 from 0.0754716981132
27 … 0.0754716981132
28 the 0.0754716981132
29 Science 0.0754716981132
30 de 0.0754716981132
31 amp 0.0754716981132
32 '' 0.0754716981132
33 `` 0.0754716981132
34 Infocentrospais 0.0566037735849
35 osdalym9 0.0566037735849
36 Anyone 0.0566037735849
37 plan 0.0566037735849
38 An 0.0566037735849
39 pythonによる画像処理入門 0.0566037735849
40 Programming 0.0566037735849
41 SpecViz 0.0566037735849
42 release 0.0566037735849
43 2017 0.

In [160]:
model_filename = os.path.join(os.path.expanduser("~"), "Models", "twitter", "python_context.pkl")
from sklearn.externals import joblib
joblib.dump(model,model_filename)
#context_classifier = joblib.load(model)

['/Users/yt/Models/twitter/python_context.pkl']

In [161]:
context_classifier = joblib.load(model)

TypeError: expected str, bytes or os.PathLike object, not Pipeline