##### This project creates a trained Neural Network capable of separating relevant tweets from irrevalent ones and save it into a dump file, which can be loaded by for live work use. For adding datasets, tweets should be added to `true.csv` as well as `everything.csv`, `false` dataset is extracted by set difference of the two.

### Filter Tweet

In [3]:
from sklearn.neural_network import MLPClassifier
import tensorflow as tf
import re
from pprint import pprint
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
import pickle
from nltk.corpus import stopwords
from nltk.stem.snowball import EnglishStemmer

tf.logging.set_verbosity(tf.logging.WARN)

In [4]:
stemmer = EnglishStemmer()
stop_words = stopwords.words('english')
my_stop_words='to and http https com co www'
stop_words=stop_words+my_stop_words.split()
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

### Preprocessing Data

In [5]:
def preprocess(text):
    text=[tweet for tweet in text if tweet.count('\n')<=3]
    text=[tweet for tweet in text if tweet.count('#')<=3]
    text=[re.sub('[^ ]+\.[^ ]+','',tweet) for tweet in text]
    text=[re.sub('#[^ ]+','',tweet) for tweet in text]
    text=[re.sub('[^a-zA-Z0-9 ]',' ',(tweet)) for tweet in text]
    text=[' '.join([word.lower() for word in tweet.strip().split() if word.lower() not in stop_words]) for tweet in text]
    text=[stemmer.stem(tweet.strip()) for tweet in text]
    return list(set(text))

In [6]:
def generate_false(everything,true):
    return np.array(list(set(everything)-set(true)))

In [38]:
everything = pd.Index(preprocess(pd.read_csv('dataset/csv/filter_dataset/everything.csv').values.ravel()))
x_true = pd.Index(preprocess(pd.read_csv('dataset/csv/filter_dataset/true.csv').values.ravel()))
x_false = generate_false(everything,x_true)
# x_false = x_false[0:880]
print(x_true.shape,x_false.shape,everything.shape)

y_true = np.ones((x_true.shape[0],1))
y_false = np.zeros((x_false.shape[0],1))

print(y_true.shape,y_false.shape)

(790,) (1631,) (2421,)
(790, 1) (1631, 1)


In [24]:
# test_size=10
# x_test=np.concatenate((x_true[-test_size:],x_false[-2*test_size:]),axis=0)
# x_train=np.concatenate((x_true[:-test_size],x_.lfalse[:-2*test_size]),axis=0)

# y_test=np.concatenate((y_true[-test_size:],y_false[-2*test_size:]),axis=0)
# y_train=np.concatenate((y_true[:-test_size],y_false[:-2*test_size]),axis=0)

In [46]:
x_test=np.concatenate((x_true,x_false),axis=0)
x_train=np.concatenate((x_true,x_false),axis=0)

y_test=np.concatenate((y_true,y_false),axis=0)
y_train=np.concatenate((y_true,y_false),axis=0)

### Classifier

In [47]:
svd = TruncatedSVD(algorithm='randomized', n_components=5, n_iter=7, random_state=42, tol=0.0)
clf=MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(100, 100), random_state=1)
classifier = make_pipeline( CountVectorizer(), TfidfTransformer(),svd,Normalizer(copy=False), clf)

In [48]:
%time classifier.fit(x_train, y_train)

  y = column_or_1d(y, warn=True)


CPU times: user 2.43 s, sys: 313 ms, total: 2.75 s
Wall time: 2.05 s


Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
  ...      solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False))])

### Prediction

In [49]:
predicted = classifier.predict(x_test)
predicted = predicted.reshape(predicted.shape[0],1)

In [50]:
np.mean(predicted == y_test)  

0.7959520859149112

In [51]:
print(np.hstack((y_test,predicted)))

[[1. 0.]
 [1. 0.]
 [1. 1.]
 ...
 [0. 0.]
 [0. 1.]
 [0. 0.]]


### Confusion Matrix

In [52]:
confusion_matrix(y_test,predicted)
#
#   true positive  |  false positive
# _________________|________________
#  false negative  | true negative
#                  |

array([[1470,  161],
       [ 333,  457]])

### Test

In [53]:
new_text=["""Coinbase Hires Silicon Valley Dealmaker as Firm Explores  Potential Acquisitions http://www.google.com/ - CCN: Bitcoin, Ethereum, ICO, Blockchain & Cryptocurrency News http://dlvr.it/QJwjKW pic.twitter.com/T8JLIwUsew"""]
new_text=preprocess(new_text)
new_text

['coinbase hires silicon valley dealmaker firm explores potential acquisitions ccn bitcoin ethereum ico blockchain cryptocurrency new']

In [33]:
classifier.predict(new_text)

array([1.])

#### Known flaws
Occurance of the word `to` will result in a tweet being classified as a `good tweet`. This is maybe because most of the tweets with true cases have the word `to`. 

In [34]:
classifier.predict(preprocess(["to"]))

array([0.])

In [35]:
classifier.predict(["what the fuck is this to bitcoin"])

array([0.])

### Saving the trained model

In [36]:
pickle.dump(classifier, open('saved_classifier/filter_model.sav', 'wb'))

### Load saved model

In [21]:
classifier = pickle.load(open('saved_classifier/filter_model.sav', 'rb'))

In [22]:
classifier.predict(new_text)

array([1.])

### Rough

In [23]:
a=['apple','ball','cat','dog','elephant']
c=[3,4,5,6,7]
d=pd.DataFrame({'a':a})
d['c']=c
d['a']=d['a'].apply(lambda x:x if x.count('a')>0 else '')
d[d['a']!='']
# d.iloc[[1,2]]

Unnamed: 0,a,c
0,apple,3
1,ball,4
2,cat,5
4,elephant,7
