In [3]:
import nltk

In [4]:
import pandas as pd

In [5]:
messages_neg = [line.rstrip() for line in open('data/neg.txt')]

In [6]:
messages_pos = [line.rstrip() for line in open('data/pos.txt')]

In [7]:
print(len(messages_neg))

276512


In [8]:
#generate dataframes for pos and neg

In [9]:
messages_neg = pd.DataFrame({'label': 'negative', 'message': messages_neg})

In [10]:
messages_pos = pd.DataFrame({'label': 'positive', 'message': messages_pos})

In [11]:
messages_neg = messages_neg.drop(messages_neg[messages_neg['message'] == 'na'].index)

In [12]:
messages_pos = messages_pos.drop(messages_pos[messages_pos['message'] == 'na'].index)

In [13]:
import string

In [14]:
messages = messages_pos.append(messages_neg)

In [15]:
messages.head()

Unnamed: 0,label,message
0,positive,东西很好哦
1,positive,没有描述
2,positive,可穿在钥匙扣随身携带。
3,positive,文曲星 E638过级王 内置剑桥高阶双解词典 （白色) good
4,positive,看上去非常坚固。


In [16]:
import jieba

In [17]:
from nltk.corpus import stopwords

In [18]:
ch_stopwords = stopwords.words('chinese')

In [20]:
def split_words(words):
    return " ".join(jieba.cut(words)).split()

In [21]:
# split words for message column

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

In [23]:
bow_transformer = CountVectorizer(analyzer=split_words).fit(messages['message'])

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/9d/mbkk7ws15wq3mmm9t5b52spc0000gn/T/jieba.cache
Loading model cost 0.640 seconds.
Prefix dict has been built succesfully.


In [24]:
print(len(bow_transformer.vocabulary_))

43635


In [25]:
message4 = messages['message'][3]
print(message4)

文曲星 E638过级王 内置剑桥高阶双解词典 （白色) good


In [26]:
bow4 = bow_transformer.transform([message4])
print(bow4)
print(bow4.shape)

  (0, 16)	1
  (0, 2069)	1
  (0, 3447)	1
  (0, 10138)	1
  (0, 11577)	1
  (0, 12965)	1
  (0, 24829)	1
  (0, 30898)	1
  (0, 31908)	1
  (0, 37775)	1
  (0, 39610)	1
  (0, 43280)	1
  (0, 43589)	1
(1, 43635)


In [27]:
print(bow_transformer.get_feature_names()[16])
print(bow_transformer.get_feature_names()[2069])

)
E638


In [28]:
messages_bow = bow_transformer.transform(messages['message'])

In [29]:
print('Shape of Sparse Matrix: ', messages_bow.shape)
print('Amount of Non-Zero occurences: ', messages_bow.nnz)

Shape of Sparse Matrix:  (313512, 43635)
Amount of Non-Zero occurences:  2044595


In [30]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer().fit(messages_bow)
tfidf4 = tfidf_transformer.transform(bow4)
print(tfidf4)

  (0, 43589)	0.183617385928
  (0, 43280)	0.333655645291
  (0, 39610)	0.304464065732
  (0, 37775)	0.254955891021
  (0, 31908)	0.209163350472
  (0, 30898)	0.301664497671
  (0, 24829)	0.272472918113
  (0, 12965)	0.326011552655
  (0, 11577)	0.333655645291
  (0, 10138)	0.251928241553
  (0, 3447)	0.239696354077
  (0, 2069)	0.320082333768
  (0, 16)	0.217441765268


In [31]:
print(tfidf_transformer.idf_[bow_transformer.vocabulary_['文曲星']])

10.2543987238


In [32]:
messages_tfidf = tfidf_transformer.transform(messages_bow)
print(messages_tfidf.shape)

(313512, 43635)


In [33]:
from sklearn.naive_bayes import MultinomialNB
spam_detect_model = MultinomialNB().fit(messages_tfidf, messages['label'])

In [34]:
print('predicted:', spam_detect_model.predict(tfidf4)[0])
print('expected:', messages.label[3])

predicted: positive
expected: positive


In [35]:
all_predictions = spam_detect_model.predict(messages_tfidf)
print(all_predictions)

['positive' 'positive' 'positive' ..., 'negative' 'negative' 'negative']


In [36]:
from sklearn.metrics import classification_report
print (classification_report(messages['label'], all_predictions))

             precision    recall  f1-score   support

   negative       0.90      0.93      0.92    200000
   positive       0.88      0.82      0.85    113512

avg / total       0.89      0.89      0.89    313512



In [37]:
from sklearn.model_selection import train_test_split

In [38]:
msg_train, msg_test, label_train, label_test = \
train_test_split(messages['message'], messages['label'], test_size=0.2)

In [41]:
print(len(msg_train), len(msg_test), len(msg_train) + len(msg_test))

250809 62703 313512


In [43]:
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('bow', CountVectorizer(analyzer=split_words)),  # strings to token integer counts
    ('tfidf', TfidfTransformer()),  # integer counts to weighted TF-IDF scores
    ('classifier', MultinomialNB()),  # train on TF-IDF vectors w/ Naive Bayes classifier
])

In [44]:
pipeline.fit(msg_train,label_train)

Pipeline(memory=None,
     steps=[('bow', CountVectorizer(analyzer=<function split_words at 0x1a16b19488>, binary=False,
        decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None,...f=False, use_idf=True)), ('classifier', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [45]:
predictions = pipeline.predict(msg_test)

In [46]:
print(classification_report(predictions,label_test))

             precision    recall  f1-score   support

   negative       0.93      0.89      0.91     41848
   positive       0.80      0.87      0.84     20855

avg / total       0.89      0.89      0.89     62703

