##### This project uses a saved, trained model to filter relevant crypto tweets from the dataset. The results are saved as `CSV` files.

### Filter Tweets

In [1]:
import pickle
import pandas as pd
import numpy as np
from pprint import pprint
import re
from nltk.corpus import stopwords
from nltk.stem.snowball import EnglishStemmer

In [2]:
# Snowball stemmer instance used by the preprocessing function.
stemmer = EnglishStemmer()

# NLTK's English stop words extended with a few extra tokens
# (two common words plus URL fragments such as "http" and "www").
my_stop_words = 'to and http https com co www'
stop_words = [*stopwords.words('english'), *my_stop_words.split()]
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [3]:
def _clean_tweet(tweet, stop_set):
    """Normalise a single raw tweet into the text form fed to the classifier.

    Returns '' when the tweet is missing, has more than 3 line breaks, or
    contains more than 3 '#' characters; callers drop such rows afterwards.
    """
    # Missing values would otherwise be stringified to 'nan'/'None' and
    # classified as if that were real tweet text.
    if pd.isna(tweet):
        return ''
    text = str(tweet)
    if text.count('\n') > 3 or text.count('#') > 3:
        return ''
    # Raw strings keep the patterns identical while avoiding the invalid
    # '\.' escape warning that non-raw literals trigger.
    text = re.sub(r'[^ ]+\.[^ ]+', '', text)    # tokens containing a dot (URLs, domains)
    text = re.sub(r'#[^ ]+', '', text)          # hashtags
    text = re.sub(r'[^a-zA-Z0-9 ]', ' ', text)  # any other non-alphanumeric character -> space
    words = [word.lower() for word in text.split() if word.lower() not in stop_set]
    # NOTE(review): stem() receives the whole space-joined tweet rather than
    # individual words, so interior words are left unstemmed. Presumably the
    # saved classifier was trained on text prepared this way - confirm before
    # switching to per-word stemming.
    return stemmer.stem(' '.join(words))


def preprocess(_df):
    """Clean the 'text' column of ``_df`` and return the same DataFrame.

    Parameters
    ----------
    _df : pandas.DataFrame
        Must contain a 'text' column. The column is overwritten in place, so
        pass a copy if the raw text is still needed.

    Returns
    -------
    pandas.DataFrame
        ``_df`` itself, with 'text' replaced by the cleaned string; rejected
        or missing tweets become ''.
    """
    # Build the lookup once per call: set membership is O(1) versus scanning
    # the module-level stop_words list for every word.
    stop_set = frozenset(stop_words)
    _df['text'] = _df['text'].apply(lambda tweet: _clean_tweet(tweet, stop_set))
    return _df

### Load saved model

In [4]:
# NOTE: unpickling can execute arbitrary code - only load model files from a trusted source.
# The context manager closes the file handle once the model has been read.
with open('saved_classifier/filter_model.sav', 'rb') as model_file:
    classifier = pickle.load(model_file)

In [5]:
# One hand-picked tweet to try the preprocessing + classifier on.
sample_tweets = ["""Coinbase                                       Hires Silicon Valley Dealmaker as Firm Explores Potential Acquisitions - CCN: Bitcoin, Ethereum, ICO, Blockchain & Cryptocurrency News http://dlvr.it/QJwjKW pic.twitter.com/T8JLIwUsew"""]
sample_df = preprocess(pd.DataFrame({'text': sample_tweets}))

# Keep only rows whose text is non-empty after preprocessing.
new_text = sample_df[sample_df['text'] != '']
new_text

Unnamed: 0,text
0,coinbase hires silicon valley dealmaker firm e...


In [6]:
# Predict the label of the cleaned sample tweet; the returned array is the cell's displayed value.
sample_text = new_text['text']
classifier.predict(sample_text)

array([1.])

### Load Dataset

In [7]:
%time df=pd.read_csv('dataset/csv/bitcoin_full_3.csv',encoding = 'utf8')
print(df.shape)
df.tail()

CPU times: user 1.44 s, sys: 175 ms, total: 1.62 s
Wall time: 1.63 s
(100153, 10)


Unnamed: 0,fullname,html,id,likes,replies,retweets,text,timestamp,url,user
100148,Alparslan,"<p class=""TweetTextSize js-tweet-text tweet-te...",969339185122500617,0,0.0,0.0,Hey I just received 500 GRAM Tokens as a bonus...,2018-03-01T22:30:45,/Responsense/status/969339185122500617,Responsense
100149,SunBro420,"<p class=""TweetTextSize js-tweet-text tweet-te...",969339184950534144,0,0.0,0.0,It occurred to me this tariff on steel and alu...,2018-03-01T22:30:45,/BTC_Ron/status/969339184950534144,BTC_Ron
100150,CRDZ,"<p class=""TweetTextSize js-tweet-text tweet-te...",969339178289807361,0,0.0,0.0,“Japan currently ranks as the largest #Bitcoin...,2018-03-01T22:30:44,/CRDZ_TA/status/969339178289807361,CRDZ_TA
100151,Arnaldo Queiroz R .F,"<p class=""TweetTextSize js-tweet-text tweet-te...",969339168835989504,1,0.0,0.0,http://bit.ly/2HWjosV Alemanha reconhecendo b...,2018-03-01T22:30:41,/arnekk/status/969339168835989504,arnekk
100152,Chuck Kichler,"<p class=""TweetTextSize js-tweet-text tweet-te...",969339165727973376,1,0.0,2.0,Arm Cortex-M3 processor - the core of the Inte...,2018-03-01T22:30:41,/cloudubq/status/969339165727973376,cloudubq


### Preprocess 
#### Preprocess a copy of the dataset

In [8]:
# preprocess() overwrites the 'text' column, so hand it a copy and keep `df` untouched.
raw_copy = df.copy()
p_df = preprocess(raw_copy)
p_df.shape

(100153, 10)

#### Drop tweets left empty by preprocessing (the original index is kept to map back to the raw data)

In [9]:
# Discard rows whose text is empty after preprocessing; the remaining rows
# keep their original index labels.
has_text = p_df['text'] != ''
p_df = p_df[has_text]
print(p_df.shape)
p_df.tail()

(80474, 10)


Unnamed: 0,fullname,html,id,likes,replies,retweets,text,timestamp,url,user
100146,Chirac ⚡️$BCH ⚡️,"<p class=""TweetTextSize js-tweet-text tweet-te...",969339190574972936,1,0.0,0.0,never ask something politely twitter least kin...,2018-03-01T22:30:47,/Panther_BTC/status/969339190574972936,Panther_BTC
100148,Alparslan,"<p class=""TweetTextSize js-tweet-text tweet-te...",969339185122500617,0,0.0,0.0,hey received 500 gram tokens bonus telegram ic...,2018-03-01T22:30:45,/Responsense/status/969339185122500617,Responsense
100149,SunBro420,"<p class=""TweetTextSize js-tweet-text tweet-te...",969339184950534144,0,0.0,0.0,occurred tariff steel aluminum seemed like tru...,2018-03-01T22:30:45,/BTC_Ron/status/969339184950534144,BTC_Ron
100150,CRDZ,"<p class=""TweetTextSize js-tweet-text tweet-te...",969339178289807361,0,0.0,0.0,japan currently ranks largest market share 61 ...,2018-03-01T22:30:44,/CRDZ_TA/status/969339178289807361,CRDZ_TA
100151,Arnaldo Queiroz R .F,"<p class=""TweetTextSize js-tweet-text tweet-te...",969339168835989504,1,0.0,0.0,alemanha reconhecendo bitcoin como meio de pag...,2018-03-01T22:30:41,/arnekk/status/969339168835989504,arnekk


### Classification

In [10]:
%time prediction=classifier.predict(p_df['text'])
probability = classifier.predict_proba(p_df['text'])
print(prediction[0:3])
print(probability[0:3])

CPU times: user 1.02 s, sys: 93.9 ms, total: 1.11 s
Wall time: 1.02 s
[1. 0. 0.]
[[2.43140526e-004 9.99756859e-001]
 [1.00000000e+000 4.65053873e-179]
 [1.00000000e+000 1.79008596e-064]]


In [11]:
# Positions (0-based, relative to p_df's row order) of tweets predicted as class 1.
good_tweet_index = np.flatnonzero(np.asarray(prediction) == 1).tolist()

# Stricter variant: positions where the second probability column exceeds 0.99.
proba_good_tweet_index = np.flatnonzero(np.asarray(probability)[:, 1] > 0.99).tolist()

In [12]:
# The index lists hold row positions within p_df, hence positional (iloc) selection.
filtered_df = p_df.iloc[good_tweet_index, :]
proba_filtered_df = p_df.iloc[proba_good_tweet_index, :]

filtered_df.tail()

Unnamed: 0,fullname,html,id,likes,replies,retweets,text,timestamp,url,user
100102,Roger Ver,"<p class=""TweetTextSize js-tweet-text tweet-te...",969339323538604034,970,242.0,325.0,order bitpay card pay bitcoin cash load card d...,2018-03-01T22:31:18,/rogerkver/status/969339323538604034,rogerkver
100119,#Bitcoin #News #BTC,"<p class=""TweetTextSize js-tweet-text tweet-te...",969339280396111879,0,0.0,0.0,regional us bank latest admit cryptocurrency t...,2018-03-01T22:31:08,/1BitcoinNews/status/969339280396111879,1BitcoinNews
100126,John Jones,"<p class=""TweetTextSize js-tweet-text tweet-te...",969339262855516162,0,0.0,0.0,australians purchase btc eth across 1 200 news...,2018-03-01T22:31:04,/Testie403/status/969339262855516162,Testie403
100127,@criptomonedas365,"<p class=""TweetTextSize js-tweet-text tweet-te...",969339258719948800,0,0.0,0.0,goldman sachs sigue criticando bitcoin pero ha...,2018-03-01T22:31:03,/cripto365/status/969339258719948800,cripto365
100136,Michael Afolayan,"<p class=""TweetTextSize js-tweet-text tweet-te...",969339227367493634,0,0.0,0.0,court st petersburg strikes ban bitcoin sites ...,2018-03-01T22:30:55,/Theophelat/status/969339227367493634,Theophelat


In [13]:
# Recover the raw (un-preprocessed) rows of the selected tweets.
# filtered_df.index holds index *labels* carried over from df, so select by
# label with .loc; .iloc would treat them as positions, which is only correct
# while df has a default 0..n-1 RangeIndex.
actual_df = df.loc[filtered_df.index]
proba_actual_df = df.loc[proba_filtered_df.index]

actual_df.tail()

Unnamed: 0,fullname,html,id,likes,replies,retweets,text,timestamp,url,user
100102,Roger Ver,"<p class=""TweetTextSize js-tweet-text tweet-te...",969339323538604034,970,242.0,325.0,“You can order a @BitPay Card for $9.95 and pa...,2018-03-01T22:31:18,/rogerkver/status/969339323538604034,rogerkver
100119,#Bitcoin #News #BTC,"<p class=""TweetTextSize js-tweet-text tweet-te...",969339280396111879,0,0.0,0.0,Regional US Bank Latest to Admit Cryptocurrenc...,2018-03-01T22:31:08,/1BitcoinNews/status/969339280396111879,1BitcoinNews
100126,John Jones,"<p class=""TweetTextSize js-tweet-text tweet-te...",969339262855516162,0,0.0,0.0,"Australians Can Now Purchase BTC, ETH Across 1...",2018-03-01T22:31:04,/Testie403/status/969339262855516162,Testie403
100127,@criptomonedas365,"<p class=""TweetTextSize js-tweet-text tweet-te...",969339258719948800,0,0.0,0.0,"Goldman Sachs sigue criticando a Bitcoin, pero...",2018-03-01T22:31:03,/cripto365/status/969339258719948800,cripto365
100136,Michael Afolayan,"<p class=""TweetTextSize js-tweet-text tweet-te...",969339227367493634,0,0.0,0.0,Court in St. Petersburg Strikes Down Ban on Bi...,2018-03-01T22:30:55,/Theophelat/status/969339227367493634,Theophelat


### Save filtered data

In [14]:
# Write both selections (label-based and probability-based) without the index column.
for frame, csv_path in (
    (actual_df, 'dataset/csv/filter_dataset/filtered_dataset.csv'),
    (proba_actual_df, 'dataset/csv/filter_dataset/proba_filtered_dataset.csv'),
):
    frame.to_csv(csv_path, sep=',', index=False)

In [15]:
# Compare how many rows each selection kept.
label_shape, proba_shape = actual_df.shape, proba_actual_df.shape
print(label_shape, proba_shape)

(11157, 10) (10773, 10)


### Rough

In [16]:
# Scratch experiment on a toy frame.
a = ['apple', 'ball', 'cat', 'dog', 'elephant']
c = [3, 4, 5, 6, 7]
d = pd.DataFrame({'a': a, 'c': c})

# Blank out words that do not contain the letter 'a'.
d['a'] = d['a'].map(lambda word: word if 'a' in word else '')

# This filtered view is evaluated but neither stored nor displayed
# (only the cell's last expression is shown).
d[d['a'] != '']

# Positional selection of the 2nd and 3rd rows of the unfiltered frame.
d.iloc[[1, 2]]

Unnamed: 0,a,c
1,ball,4
2,cat,5
