In [1]:
!pip install transformers



## Import necessary packages

In [2]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
from tokenizers import BertWordPieceTokenizer
from tqdm.notebook import tqdm
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import backend as K
import transformers
from transformers import TFAutoModel, AutoTokenizer
import matplotlib.pyplot as plt

## Configure TPU

In [3]:
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver() 
    print('Running on TPU ', tpu.master())
except ValueError:
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    strategy = tf.distribute.get_strategy()

INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0


Running on TPU  grpc://10.59.187.154:8470




## Get tweets training data from github 
This data was originally posted on Kaggle

In [4]:
!wget https://github.com/parthchhabra0611/tweet-data-kaggle/blob/master/train.csv?raw=true

--2020-08-25 12:52:31--  https://github.com/parthchhabra0611/tweet-data-kaggle/blob/master/train.csv?raw=true
Resolving github.com (github.com)... 140.82.112.3
Connecting to github.com (github.com)|140.82.112.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github.com/parthchhabra0611/tweet-data-kaggle/raw/master/train.csv [following]
--2020-08-25 12:52:31--  https://github.com/parthchhabra0611/tweet-data-kaggle/raw/master/train.csv
Reusing existing connection to github.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://raw.githubusercontent.com/parthchhabra0611/tweet-data-kaggle/master/train.csv [following]
--2020-08-25 12:52:32--  https://raw.githubusercontent.com/parthchhabra0611/tweet-data-kaggle/master/train.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443..

In [5]:
# load the data with pandas
train=pd.read_csv('/content/train.csv?raw=true')

The motive is to classify tweets into real disaster(target=1) and no disaster(target=0) with the help of Bert transformer

In [6]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [7]:
train.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

## Removing unnecessary feature columns

In [8]:
# dropping id, location column due to large no of Nan.

train.drop(['id','location'],axis=1,inplace=True)

In [9]:
x=0  # counter for rows containing 'ablaze' keyword and target=1
y=0  # counter for total rows having keyword 'ablaze'

In [10]:
for i in range(len(train)):
    if (train['keyword'].iloc[i]=='ablaze'):
        x+=train['target'].iloc[i]
        y+=1

In [11]:
x,y

(13, 36)

This means rows containing 'ablaze' keyword and target=1 are 13 and rows containing 'ablaze' keyword and target=0 are 23. This clearly wouldn't help our data to classify the target properly. Therefore, dropping the keyword column.

In [12]:
train.drop(['keyword'],axis=1,inplace=True)

In [13]:
train['target'].value_counts()

0    4342
1    3271
Name: target, dtype: int64

## Cleaning the data
Install clean-text for cleaning the tweets which might contain urls, numbers etc. which will not be helpful for our model.

In [14]:
!pip install clean-text[gpl]

Collecting clean-text[gpl]
  Downloading https://files.pythonhosted.org/packages/e3/a4/cb7b851f1f7ae68a128482cd57ff0c4c96b64083f41ca5e9608e2a2889a5/clean_text-0.2.1-py3-none-any.whl
Collecting ftfy<6.0,>=5.8
[?25l  Downloading https://files.pythonhosted.org/packages/ff/e2/3b51c53dffb1e52d9210ebc01f1fb9f2f6eba9b3201fa971fd3946643c71/ftfy-5.8.tar.gz (64kB)
[K     |████████████████████████████████| 71kB 3.5MB/s 
[?25hCollecting unidecode<2.0.0,>=1.1.1; extra == "gpl"
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
[K     |████████████████████████████████| 245kB 10.7MB/s 
Building wheels for collected packages: ftfy
  Building wheel for ftfy (setup.py) ... [?25l[?25hdone
  Created wheel for ftfy: filename=ftfy-5.8-cp36-none-any.whl size=45612 sha256=630c73c43a5e52d5a503ab1ca2235d72869bff6d0c13b130642b3a18c5a635cb
  Stored in directory: /root/.cache/pip/wheels/ba

In [15]:
from cleantext import clean

In [16]:
def text_cleaning(text):
    text=clean(text,
    fix_unicode=True,               # fix various unicode errors
    to_ascii=True,                  # transliterate to closest ASCII representation
    lower=True,                     # lowercase text
    no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
    no_urls=True,                  # replace all URLs with a special token
    no_emails=True,                # replace all email addresses with a special token
    no_phone_numbers=True,         # replace all phone numbers with a special token
    no_numbers=True,               # replace all numbers with a special token
    no_digits=True,                # replace all digits with a special token
    no_currency_symbols=True,      # replace all currency symbols with a special token
    no_punct=True,                 # fully remove punctuation
    replace_with_url="<URL>",
    replace_with_email="<EMAIL>",
    replace_with_phone_number="<PHONE>",
    replace_with_number="<NUMBER>",
    replace_with_digit="0",
    replace_with_currency_symbol="<CUR>",
    lang="en"                       # set to 'de' for German special handling
    )
    return text

In [17]:
for i in range(len(train)):
    train['text'].iloc[i]=text_cleaning(train['text'].iloc[i])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer(indexer, value)


In [18]:
train['text']

0       our deeds are the reason of this earthquake ma...
1                   forest fire near la ronge sask canada
2       all residents asked to shelter in place are be...
3       <number> people receive wildfires evacuation o...
4       just got sent this photo from ruby alaska as s...
                              ...                        
7608    two giant cranes holding a bridge collapse int...
7609    ariaahrary thetawniest the out of control wild...
7610    m000 <number><number> utc0km s of volcano hawa...
7611    police investigating after an ebike collided w...
7612    the latest more homes razed by northern califo...
Name: text, Length: 7613, dtype: object

## Modeling

In [19]:
def build_model(transformer, max_len=512): 
    input_word_ids = Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    sequence_output = transformer(input_word_ids)[0]
    cls_token = sequence_output[:, 0, :]
    x = tf.keras.layers.Dropout(0.35)(cls_token)
    out = Dense(1, activation='sigmoid')(x)
    
    model = Model(inputs=input_word_ids, outputs=out)
    model.compile(Adam(lr=3e-5), loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC()])
    
    return model

In [20]:
with strategy.scope():
    transformer_layer = transformers.TFBertModel.from_pretrained('bert-base-uncased')
    model = build_model(transformer_layer, max_len=512)
model.summary()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…




- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
If your task is similar to the task the model of the ckeckpoint was trained on, you can already use TFBertModel for predictions without further training.


Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_word_ids (InputLayer)  [(None, 512)]             0         
_________________________________________________________________
tf_bert_model (TFBertModel)  ((None, 512, 768), (None, 109482240 
_________________________________________________________________
tf_op_layer_strided_slice (T [(None, 768)]             0         
_________________________________________________________________
dropout_37 (Dropout)         (None, 768)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 769       
Total params: 109,483,009
Trainable params: 109,483,009
Non-trainable params: 0
_________________________________________________________________


In [21]:
import transformers
tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [22]:
save_path = 'distilbert_base_uncased/'
if not os.path.exists(save_path):
    os.makedirs(save_path)
tokenizer.save_pretrained(save_path)

('distilbert_base_uncased/vocab.txt',
 'distilbert_base_uncased/special_tokens_map.json',
 'distilbert_base_uncased/added_tokens.json')

In [23]:
from tokenizers import BertWordPieceTokenizer
fast_tokenizer = BertWordPieceTokenizer('distilbert_base_uncased/vocab.txt', lowercase=True)
fast_tokenizer

Tokenizer(vocabulary_size=30522, model=BertWordPiece, unk_token=[UNK], sep_token=[SEP], cls_token=[CLS], pad_token=[PAD], mask_token=[MASK], clean_text=True, handle_chinese_chars=True, strip_accents=None, lowercase=True, wordpieces_prefix=##)

In [28]:
def fast_encode(texts, tokenizer, size=256, maxlen=512):
    tokenizer.enable_truncation(max_length=maxlen)
    tokenizer.enable_padding(length=maxlen)
    ids_full = []
    
    for i in tqdm(range(0, len(texts), size)):
        text = texts[i:i+size].tolist()
        encs = tokenizer.encode_batch(text)
        ids_full.extend([enc.ids for enc in encs])
    
    return np.array(ids_full)

Encode the tweets using fast_tokenizer

In [29]:
x = fast_encode(train.text.astype(str), fast_tokenizer, maxlen=512)

HBox(children=(FloatProgress(value=0.0, max=30.0), HTML(value='')))




In [30]:
BATCH_SIZE=64

In [31]:
y=train['target'].values

Creating dataset for bert model

In [32]:
train_dataset = (
    tf.data.Dataset 
      .from_tensor_slices((x, y))
      .repeat()
      .shuffle(2048)
      .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE) 
)

In [33]:
with strategy.scope():
    train_history = model.fit(
      train_dataset,

      steps_per_epoch=150,

      epochs=3
    )

Epoch 1/3
Epoch 2/3
Epoch 3/3


## Doing live analysis of twitter's tweets 
We have trained the model and it can be used to classify the live tweets we gather from twitter.

Install tweepy
tweepy is the python client for the official Twitter API


In [34]:
!pip install tweepy



In [35]:
import re 
import tweepy 
from tweepy import OAuthHandler 
from cleantext import clean

Make a class for getting and analysing live tweets.

In order to fetch tweets through Twitter API, one needs to register an App through their twitter account. Follow these steps for the same:

Open this link 'https://apps.twitter.com/' and click the button: ‘Create New App’
Fill the application details. You can leave the callback url field empty.
Once the app is created, you will be redirected to the app page.
Open the ‘Keys and Access Tokens’ tab.
Copy ‘Consumer Key’, ‘Consumer Secret’, ‘Access token’ and ‘Access Token Secret’.

In [38]:
  
class TwitterClient(object): 
    
    def __init__(self): 
        
        # keys and tokens from the Twitter Dev Console 
        consumer_key = ''
        consumer_secret = ''
        access_token = ''
        access_token_secret = ''
  
        # attempt authentication 
        try: 
            # create OAuthHandler object 
            self.auth = OAuthHandler(consumer_key, consumer_secret) 
            # set access token and secret 
            self.auth.set_access_token(access_token, access_token_secret) 
            # create tweepy API object to fetch tweets 
            self.api = tweepy.API(self.auth) 
        except: 
            print("Error: Authentication Failed") 
  
    def clean_tweet(self, tweet): 
        
        tweet=clean(tweet,
        fix_unicode=True,               # fix various unicode errors
        to_ascii=True,                  # transliterate to closest ASCII representation
        lower=True,                     # lowercase text
        no_line_breaks=True,           # fully strip line breaks as opposed to only normalizing them
        no_urls=True,                  # replace all URLs with a special token
        no_emails=True,                # replace all email addresses with a special token
        no_phone_numbers=True,         # replace all phone numbers with a special token
        no_numbers=True,               # replace all numbers with a special token
        no_digits=True,                # replace all digits with a special token
        no_currency_symbols=True,      # replace all currency symbols with a special token
        no_punct=True,                 # fully remove punctuation
        replace_with_url="<URL>",
        replace_with_email="<EMAIL>",
        replace_with_phone_number="<PHONE>",
        replace_with_number="<NUMBER>",
        replace_with_digit="0",
        replace_with_currency_symbol="<CUR>",
        lang="en"                       # set to 'de' for German special handling
        )
        return tweet

    # convert tweet into tokens.    
    def convert_lines(self,tweet, max_seq_length,tokenizer):
        max_seq_length -=2
        all_tokens = []

        tokens_a = tokenizer.tokenize(tweet)
        if len(tokens_a)>max_seq_length:
            tokens_a = tokens_a[:max_seq_length]
        one_token = tokenizer.convert_tokens_to_ids(["[CLS]"]+tokens_a+["[SEP]"])+[0] * (max_seq_length - len(tokens_a))
        all_tokens.append(one_token)

        return np.array(all_tokens)
    
    def get_tweet_sentiment(self, tweet): 
        
        tweet2 = self.clean_tweet(tweet)
        
        token_input2 = self.convert_lines(tweet2,maxlen,tokenizer)
        
        seg_input2 = np.zeros((token_input2.shape[0],maxlen))
        mask_input2 = np.ones((token_input2.shape[0],maxlen))
        
        hehe = model.predict([token_input2, seg_input2, mask_input2],verbose=1,batch_size=32)
        
        if hehe <= 0.5: 
            return 'no disaster'
        else: 
            return 'real disaster'
        
  
    def get_tweets(self, query, count = 10): 
        ''' 
        Main function to fetch tweets and parse them. 
        '''
        # empty list to store parsed tweets 
        tweets = [] 
  
        try: 
            # call twitter api to fetch tweets 
            fetched_tweets = self.api.search(q = query, count = count) 
  
            # parsing tweets one by one 
            for tweet in fetched_tweets: 
                # empty dictionary to store required params of a tweet 
                parsed_tweet = {} 
  
                # saving text of tweet 
                parsed_tweet['text'] = tweet.text 
                # saving sentiment of tweet 
                parsed_tweet['class'] = self.get_tweet_sentiment(tweet.text) 
  
                # appending parsed tweet to tweets list 
                if tweet.retweet_count > 0: 
                    # if tweet has retweets, ensure that it is appended only once 
                    if parsed_tweet not in tweets: 
                        tweets.append(parsed_tweet) 
                else: 
                    tweets.append(parsed_tweet) 
  
            # return parsed tweets 
            return tweets 
  
        except tweepy.TweepError as e: 
            # print error (if any) 
            print("Error : " + str(e)) 


In [None]:
api = TwitterClient() 

# input any query and tweets regarding it would come up. 
tweets = api.get_tweets(query = 'crime', count = 200) 


ptweets = [tweet for tweet in tweets if tweet['class'] == 'real disaster']  
print("Real Disaster tweets percentage: {} %".format(100*len(ptweets)/len(tweets))) 

ntweets = [tweet for tweet in tweets if tweet['class'] == 'no disaster'] 
print("No Disaster tweets percentage: {} %".format(100*len(ntweets)/len(tweets))) 


# printing first 5 positive tweets 
print("\n\n Real Disaster tweets:") 
for tweet in ptweets[:10]: 
    print(tweet['text']) 

# printing first 5 negative tweets 
print("\n\n No Disaster tweets:") 
for tweet in ntweets[:10]: 
    print(tweet['text']) 
