In [3]:
import tensorflow as tf
import re
from pprint import pprint
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.neural_network import MLPClassifier
import pickle
from os import listdir
from nltk.corpus import stopwords
from nltk.stem.snowball import EnglishStemmer

import matplotlib.pyplot as plot

tf.logging.set_verbosity(tf.logging.WARN)

In [4]:
stemmer = EnglishStemmer()
stop_words = stopwords.words('english')
my_stop_words='to and http https com co www'
stop_words=stop_words+my_stop_words.split()
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

### Preprocess

In [5]:
def preprocess(text):
    text=[tweet.lower() for tweet in text if tweet.count('#')<=3]
    text=[re.sub('[^ ]+\.[^ ]+','',tweet) for tweet in text]
#     text=[re.sub('#[^ ]+','',tweet) for tweet in text]
    text=[re.sub('[^a-zA-Z0-9.!? ]',' ',(tweet)) for tweet in text]
    text=[' '.join([word for word in tweet.strip().split() if word not in stop_words]) for tweet in text]
    text=[stemmer.stem(tweet.strip()) for tweet in text]
#     return list(set(text))
    return text

### Load Good and Bad

In [7]:
# x_good=pd.Index(preprocess(pd.read_csv('dataset/csv/good_bad/good.csv').values.ravel()))
# x_bad=pd.Index(preprocess(pd.read_csv('dataset/csv/good_bad/bad.csv').values.ravel()))
x_good=pd.Index(preprocess(pd.read_csv('dataset/csv/good_bad/final_dset/good.csv').values.ravel()))
x_bad=pd.Index(preprocess(pd.read_csv('dataset/csv/good_bad/final_dset/bad.csv').values.ravel()))
x_onews=pd.Index(preprocess(pd.read_csv('dataset/csv/good_bad/final_dset/other_news.csv').values.ravel()))
x_spam=pd.Index(preprocess(pd.read_csv('dataset/csv/good_bad/final_dset/spam.csv').values.ravel()))
x_less_good=pd.Index(preprocess(pd.read_csv('dataset/csv/good_bad/final_dset/less_good.csv').values.ravel()))
x_less_bad=pd.Index(preprocess(pd.read_csv('dataset/csv/good_bad/final_dset/less_bad.csv').values.ravel()))


y_good = np.concatenate((np.ones((x_good.shape[0],1)),np.zeros((x_bad.shape[0]+x_onews.shape[0]+x_spam.shape[0]+x_less_good.shape[0]+x_less_bad.shape[0],1))),axis=0)
y_bad = np.concatenate((np.zeros((x_good.shape[0],1)),np.ones((x_bad.shape[0],1)),np.zeros((x_onews.shape[0],1)),np.zeros((x_spam.shape[0],1)),np.zeros((x_less_good.shape[0],1)),np.zeros((x_less_bad.shape[0],1))),axis=0)
y_onews = np.concatenate((np.zeros((x_good.shape[0],1)),np.zeros((x_bad.shape[0],1)),np.ones((x_onews.shape[0],1)),np.zeros((x_spam.shape[0],1)),np.zeros((x_less_good.shape[0],1)),np.zeros((x_less_bad.shape[0],1))),axis=0)
y_spam = np.concatenate((np.zeros((x_good.shape[0],1)),np.zeros((x_bad.shape[0],1)),np.zeros((x_onews.shape[0],1)),np.ones((x_spam.shape[0],1)),np.zeros((x_less_good.shape[0],1)),np.zeros((x_less_bad.shape[0],1))),axis=0)
y_less_good = np.concatenate((np.zeros((x_good.shape[0],1)),np.zeros((x_bad.shape[0],1)),np.zeros((x_onews.shape[0],1)),np.zeros((x_spam.shape[0],1)),np.ones((x_less_good.shape[0],1)),np.zeros((x_less_bad.shape[0],1))),axis=0)
y_less_bad = np.concatenate((np.zeros((x_good.shape[0],1)),np.zeros((x_bad.shape[0],1)),np.zeros((x_onews.shape[0],1)),np.zeros((x_spam.shape[0],1)),np.zeros((x_less_good.shape[0],1)),np.ones((x_less_bad.shape[0],1))),axis=0)


In [8]:
x_test=np.concatenate((x_good,x_bad,x_onews,x_spam,x_less_good,x_less_bad),axis=0)
x_train=np.concatenate((x_good,x_bad,x_onews,x_spam,x_less_good,x_less_bad),axis=0)

y_test=np.concatenate((y_good,y_bad,y_onews,y_spam,y_less_good,y_less_bad),axis=1)
y_train=np.concatenate((y_good,y_bad,y_onews,y_spam,y_less_good,y_less_bad),axis=1)

### Classifier

In [9]:
# svd = TruncatedSVD(algorithm='randomized', n_components=1, n_iter=1, random_state=42, tol=0.0)

nn=MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(50,50), random_state=1)
# classifier = make_pipeline(CountVectorizer(),TfidfTransformer(),svd,Normalizer(copy=False),nn)
classifier = make_pipeline(CountVectorizer(ngram_range=(1, 3)),TfidfTransformer(),nn)

In [10]:
%time classifier.fit(x_train, y_train)

CPU times: user 1min 15s, sys: 6.02 s, total: 1min 21s
Wall time: 1min 20s


Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 3), preprocessor=None, stop_words=None,
  ...      solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False))])

### Prediction

In [11]:
predicted = classifier.predict_proba(x_test)
# predicted = predicted.reshape(predicted.shape[0],2)
pred_index = np.argmax(predicted,axis=1).reshape(x_test.shape[0],1)
# pred_index = np.argmax(predicted,axis=1).reshape(x_test.shape[0],1)

np.concatenate((predicted,pred_index),axis=1)

array([[9.99999030e-01, 2.38830493e-05, 1.41422119e-06, ...,
        5.76127049e-11, 4.14659322e-06, 0.00000000e+00],
       [9.99996878e-01, 6.54333688e-04, 4.76365625e-08, ...,
        4.14518789e-12, 2.96299192e-07, 0.00000000e+00],
       [9.99999826e-01, 1.44033370e-06, 2.08139065e-08, ...,
        3.38862547e-08, 3.86581272e-06, 0.00000000e+00],
       ...,
       [6.84446278e-06, 3.47827920e-07, 2.46715731e-04, ...,
        2.13390821e-09, 9.99939700e-01, 5.00000000e+00],
       [7.59386084e-05, 4.44864627e-01, 1.16670116e-07, ...,
        4.57940920e-09, 4.98509206e-01, 5.00000000e+00],
       [4.34697514e-06, 7.02188824e-06, 3.60657252e-06, ...,
        6.58428729e-11, 9.99966939e-01, 5.00000000e+00]])

In [12]:
np.mean(pred_index == np.argmax(y_test,axis=1).reshape(x_test.shape[0],1))  

0.9881535583860337

In [13]:
print(np.hstack((np.argmax(y_test,axis=1).reshape(x_test.shape[0],1),pred_index)))

[[0 0]
 [0 0]
 [0 0]
 ...
 [5 5]
 [5 5]
 [5 5]]


### Confusion Matrix

In [14]:
confusion_matrix(np.argmax(y_test,axis=1).reshape(x_test.shape[0],1),pred_index)
#
#   true positive  |  false positive
# _________________|________________
#  false negative  | true negative
#                  |

array([[1490,    1,    3,    7,    1,    0],
       [   3, 1419,    3,    7,    0,   15],
       [   0,    3, 1716,   20,    6,    1],
       [   5,    3,   15, 4061,    3,    5],
       [   2,    1,    8,    9, 1415,    0],
       [   2,    5,    0,    5,    0,  993]])

### Test

In [15]:
new_text=["""Coinbase Hires Silicon Valley Dealmaker as Firm Explores  Potential Acquisitions http://www.google.com/ - CCN: Bitcoin, Ethereum, ICO, Blockchain & Cryptocurrency News http://dlvr.it/QJwjKW pic.twitter.com/T8JLIwUsew"""]
new_text=preprocess(new_text)
new_text

['coinbase hires silicon valley dealmaker firm explores potential acquisitions ccn bitcoin ethereum ico blockchain cryptocurrency new']

In [16]:
classifier.predict_proba(new_text)

array([[9.14341929e-07, 2.70664534e-08, 7.09825593e-05, 4.85531802e-04,
        1.78769039e-01, 6.12167174e-06]])

### Save Trained Model

In [17]:
filename = 'saved_classifier/good_bad_classifier.sav'
pickle.dump(classifier, open(filename, 'wb'))

In [18]:
new_text=["""Coinbase    #we   #hehe        SEC                   Hires Silicon Valley Dealmaker as Firm Explores Potential Acquisitions - CCN: Bitcoin, Ethereum, ICO, Blockchain & Cryptocurrency News http://dlvr.it/QJwjKW pic.twitter.com/T8JLIwUsew"""]
new_text=preprocess(new_text)
print(new_text)

['coinbase hehe sec hires silicon valley dealmaker firm explores potential acquisitions ccn bitcoin ethereum ico blockchain cryptocurrency new']


In [None]:
[{'text':'CBOE Nudges SEC to Allow Bitcoin ETFs in New Letter','timestamp':'Tue Mar 27 13:39:58 +0000 2018','index':0}]

### Rough

In [115]:
a=['see sees sees run runs runs saws']
[stemmer.stem(w) for w in a]

['see sees sees run runs runs saw']

In [11]:
np.zeros((1,2))

array([[0., 0.]])

In [497]:
from datetime import datetime

In [502]:
datetime.strptime('Tue Mar 27 13:39:58 +0000 2018','%a %b %d %H:%M:%S +0000 %Y')

datetime.datetime(2018, 3, 27, 13, 39, 58)