In [None]:
!pip install tensorflow-addons
!pip install -q -U tensorflow-text
!pip install -q -U tf-models-official
!pip install -U tfds-nightly
!pip install tensorflow-text

Collecting tensorflow-addons
  Downloading tensorflow_addons-0.15.0-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 5.5 MB/s 
Installing collected packages: tensorflow-addons
Successfully installed tensorflow-addons-0.15.0
[K     |████████████████████████████████| 4.9 MB 5.5 MB/s 
[K     |████████████████████████████████| 1.8 MB 5.1 MB/s 
[K     |████████████████████████████████| 99 kB 7.7 MB/s 
[K     |████████████████████████████████| 43 kB 1.7 MB/s 
[K     |████████████████████████████████| 47.7 MB 78 kB/s 
[K     |████████████████████████████████| 352 kB 46.0 MB/s 
[K     |████████████████████████████████| 213 kB 52.7 MB/s 
[K     |████████████████████████████████| 90 kB 8.6 MB/s 
[K     |████████████████████████████████| 1.2 MB 55.5 MB/s 
[K     |████████████████████████████████| 596 kB 53.5 MB/s 
[?25h  Building wheel for py-cpuinfo (setup.py) ... [?25l[?25hdone
  Building wheel for seqeval (setup.p

In [None]:
%tensorflow_version 2.x
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

Found GPU at: /device:GPU:0


# Setup: mount Google Drive and define the BERT fine-tuning helper

In [None]:
from google.colab import drive

# Mount Google Drive; the CSV paths used later in the notebook live under
# this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
"""
!pip install tensorflow-addons
!pip install -q -U tensorflow-text
!pip install -q -U tf-models-official
!pip install -U tfds-nightly

"""

import os
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text  # A dependency of the preprocessing model
import tensorflow_addons as tfa # For metrics
from official.nlp import optimization

# Keep TensorFlow's logger quiet: only records at ERROR level and above.
tf_logger = tf.get_logger()
tf_logger.setLevel('ERROR')

# Configure TFHub to read checkpoints directly from TFHub's Cloud Storage
# buckets. This is only recommended when running TFHub models on TPU.
# Without this setting TFHub would download the compressed file and extract
# the checkpoint locally. Attempting to load from these local files will fail
# with the following error:
#
#     InvalidArgumentError: Unimplemented: File system scheme '[local]' not implemented
#
# This is because the TPU can only read directly from Cloud Storage buckets
# (https://cloud.google.com/tpu/docs/troubleshooting#cannot_use_local_filesystem).
# Note: This setting is automatic in Colab.
os.environ.update({"TFHUB_MODEL_LOAD_FORMAT": "UNCOMPRESSED"})


class BERT_FineTune:
    """Fine-tune a TF-Hub BERT encoder as a text classifier.

    The constructor selects a `tf.distribute` strategy (TPU first, then GPU,
    otherwise none) and stores the TF-Hub handles of the encoder and of its
    preprocessing model. `tune` builds the input pipelines and the classifier,
    then trains it with Keras `fit`.
    """

    def __init__(self, handle_encoder, handle_preprocess):
        """Pick an accelerator strategy and remember the TF-Hub handles.

        Args:
          handle_encoder: TF-Hub handle (URL or path) of the BERT encoder.
          handle_preprocess: TF-Hub handle (URL or path) of the preprocessing
            model that matches the encoder.
        """
        try:
            resolver = tf.distribute.cluster_resolver.TPUClusterResolver()
            tf.config.experimental_connect_to_cluster(resolver)
            tf.tpu.experimental.initialize_tpu_system(resolver)
            strategy = tf.distribute.TPUStrategy(resolver)
            print('Using TPU')
        except Exception as tpu_error:
            # Catch `Exception` instead of a bare `except:` so that
            # KeyboardInterrupt / SystemExit still propagate, and say why the
            # TPU path was skipped before falling back.
            print('TPU not available ({}).'.format(type(tpu_error).__name__))
            if tf.config.list_physical_devices('GPU'):
                strategy = tf.distribute.MirroredStrategy()
                print('Using GPU')
            else:
                strategy = None
                print('Running on CPU is not recommended.')
        # `None` means "no distribution strategy"; callers must check for it
        # before using `strategy.scope()`.
        self.strategy = strategy
        self.handle_encoder = handle_encoder
        self.handle_preprocess = handle_preprocess

    @staticmethod
    def make_bert_preprocess_model(handle_preprocess, sentence_features, seq_length=128):
        """Returns Model mapping string features to BERT inputs.

        Args:
          handle_preprocess: TF-Hub handle of the BERT preprocessing SavedModel.
          sentence_features: a list with the names of string-valued features.
          seq_length: an integer that defines the sequence length of BERT inputs.

        Returns:
          A Keras Model that can be called on a list or dict of string Tensors
          (with the order or names, resp., given by sentence_features) and
          returns a dict of tensors for input to BERT.
        """
        # One scalar string input per feature, named after the feature so the
        # model can be fed a dict keyed by feature name.
        input_segments = [
            tf.keras.layers.Input(shape=(), dtype=tf.string, name=ft)
            for ft in sentence_features]

        # Tokenize the text to word pieces.
        bert_preprocess = hub.load(handle_preprocess)
        tokenizer = hub.KerasLayer(bert_preprocess.tokenize, name='tokenizer')
        segments = [tokenizer(s) for s in input_segments]

        # Optional: Trim segments in a smart way to fit seq_length.
        # Simple cases can skip this step and let
        # the next step apply a default truncation to approximately equal lengths.
        truncated_segments = segments

        # Pack inputs. The details (start/end token ids, dict of output tensors)
        # are model-dependent, so this gets loaded from the SavedModel.
        packer = hub.KerasLayer(bert_preprocess.bert_pack_inputs,
                                arguments=dict(seq_length=seq_length),
                                name='packer')
        model_inputs = packer(truncated_segments)
        return tf.keras.Model(input_segments, model_inputs)

    @staticmethod
    def convert_dataset(df, batch_size, bert_preprocess_model, sentence_features, label='label', shuffle=False, repeat=False):
        """Turn a DataFrame into a batched, preprocessed `tf.data.Dataset`.

        Args:
          df: frame holding the `sentence_features` columns and the `label` column.
          batch_size: number of examples per batch.
          bert_preprocess_model: model built by `make_bert_preprocess_model`.
          sentence_features: names of the text columns fed to the preprocess model.
          label: name of the label column in `df`.
          shuffle: shuffle the examples (buffer covers the whole frame).
          repeat: repeat the dataset indefinitely.

        Returns:
          A `(dataset, num_examples)` tuple. The dataset yields
          `(bert_inputs, labels)` batches; `num_examples` is the row count of `df`.
        """
        in_memory_ds = {feature: df[feature] for feature in sentence_features}
        in_memory_ds['label'] = df[label]

        dataset = tf.data.Dataset.from_tensor_slices(in_memory_ds)
        num_examples = len(in_memory_ds['label'])

        if shuffle:
            dataset = dataset.shuffle(num_examples)
        if repeat:
            dataset = dataset.repeat()
        dataset = dataset.batch(batch_size)

        def to_model_inputs(ex):
            # Hand only the text columns to the preprocess model. Passing the
            # whole example dict (including 'label') makes Keras warn about
            # input keys that match no model input.
            text_inputs = {feature: ex[feature] for feature in sentence_features}
            return bert_preprocess_model(text_inputs), ex['label']

        dataset = dataset.map(to_model_inputs)

        # Cache only finite, deterministic pipelines. Caching after `repeat()`
        # would try to hold an endless stream in memory, and caching after
        # `shuffle()` would freeze the order produced by the first pass.
        if not (shuffle or repeat):
            dataset = dataset.cache()
        dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
        return dataset, num_examples

    @staticmethod
    def build_classifier_model(encoder, num_classes, seed=0):
        """Build the classifier: encoder -> dropout -> dense logits.

        Args:
          encoder: callable (e.g. a `hub.KerasLayer`) whose output dict
            contains a "pooled_output" entry.
          num_classes: size of the output layer.
          seed: value passed to `tf.random.set_seed`; note that this sets the
            global TensorFlow seed.

        Returns:
          An uncompiled `tf.keras.Model` that outputs unnormalised logits.
        """
        tf.random.set_seed(seed)

        class Classifier(tf.keras.Model):
            def __init__(self, encoder, num_classes):
                super(Classifier, self).__init__(name="prediction")
                self.encoder = encoder
                self.dropout = tf.keras.layers.Dropout(0.1)
                # No activation: the loss used in `tune` is built with from_logits=True.
                self.dense = tf.keras.layers.Dense(num_classes)

            def call(self, preprocessed_text):
                encoder_outputs = self.encoder(preprocessed_text)
                pooled_output = encoder_outputs["pooled_output"]
                x = self.dropout(pooled_output)
                x = self.dense(x)
                return x

        model = Classifier(encoder, num_classes)
        return model

    @staticmethod
    def _steps_per_pass(num_examples, batch_size):
        """Number of full batches in one pass over the data, never less than 1.

        Plain floor division yields 0 when the frame is smaller than one
        batch, which is not a usable step count for `fit`.
        """
        return max(1, num_examples // batch_size)

    def tune(self, train_df, val_df=None, sentence_features=['sentence'], label='label', epochs=3, batch_size=32, optimizer='sgd', seq_length=128):
        """Fine-tune the encoder on `train_df` and return the trained model.

        Args:
          train_df: training frame with the text columns and the label column.
          val_df: optional validation frame with the same columns.
          sentence_features: names of the text columns (only read, never mutated).
          label: name of the label column.
          epochs: number of training epochs.
          batch_size: batch size for training and validation.
          optimizer: anything Keras `compile` accepts as an optimizer. An AdamW
            schedule can be built with `optimization.create_optimizer(...)`
            and passed in here.
          seq_length: BERT input sequence length.

        Returns:
          The fitted Keras classifier model.
        """
        # The output layer gets one unit per distinct label value seen in training.
        num_classes = len(train_df[label].unique())
        bert_preprocess_model = BERT_FineTune.make_bert_preprocess_model(
            self.handle_preprocess, sentence_features, seq_length)

        # Training data is shuffled and repeated, so `fit` needs an explicit
        # number of steps per epoch.
        train_dataset, train_data_size = BERT_FineTune.convert_dataset(
            train_df, batch_size, bert_preprocess_model, sentence_features, label, True, True)
        steps_per_epoch = BERT_FineTune._steps_per_pass(train_data_size, batch_size)

        validation_dataset = None
        validation_steps = None
        if val_df is not None:
            validation_dataset, validation_data_size = BERT_FineTune.convert_dataset(
                val_df, batch_size, bert_preprocess_model, sentence_features, label, False, False)
            validation_steps = BERT_FineTune._steps_per_pass(validation_data_size, batch_size)

        def build_and_fit():
            # Encoder, loss and model are created here so that, when a
            # strategy is in use, they are all created inside its scope.
            encoder = hub.KerasLayer(self.handle_encoder, trainable=True)
            loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
            classifier_model = BERT_FineTune.build_classifier_model(encoder, num_classes)
            classifier_model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])
            classifier_model.fit(
                x=train_dataset,
                validation_data=validation_dataset,
                steps_per_epoch=steps_per_epoch,
                epochs=epochs,
                validation_steps=validation_steps)
            return classifier_model

        if self.strategy is None:
            return build_and_fit()
        with self.strategy.scope():
            return build_and_fit()



In [None]:
import pandas as pd
import numpy as np
import glob



# glob.glob returns matches in arbitrary order; sort them so the concatenated
# row order is the same on every run.
train_csv_files = sorted(glob.glob(
    os.path.join("/content/drive/MyDrive/DS/train/", "*.csv")))
if not train_csv_files:
    raise FileNotFoundError(
        "No training CSV files found under /content/drive/MyDrive/DS/train/")

# The files have no header row; only the first two columns (label, text) are read.
data = pd.concat(
    [pd.read_csv(csv_file, header=None, usecols=[0, 1], names=['label', 'text'])
     for csv_file in train_csv_files],
    axis=0, ignore_index=True)

data.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1578627 entries, 0 to 1578626
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   label   1578627 non-null  int64 
 1   text    1578627 non-null  object
dtypes: int64(1), object(1)
memory usage: 24.1+ MB


In [None]:
# Preview the first rows of the combined training frame.
data.head()

Unnamed: 0,label,text
0,0,got right to the top floor and guess what some...
1,1,got roped into working on a fridaynight boo bo...
2,0,got rubik cube as a present now have to rememb...
3,0,got sad news at the shop today had to lay off ...
4,1,got sandwiches and at the park playing hoop wi...


In [None]:
# Random ~80/20 train/validation split. The generator is seeded so that
# Restart & Run All reproduces the same split (and therefore the same metrics).
SPLIT_SEED = 42
split_rng = np.random.default_rng(SPLIT_SEED)
msk = split_rng.random(len(data)) < 0.8

train_df = data[msk]

val_df = data[~msk]

In [None]:
# Display the validation split (the rows where the mask is False).
val_df

Unnamed: 0,label,text
0,0,got right to the top floor and guess what some...
5,0,got saturday school tomorrow blah hate saturda...
9,1,got seasons of skins marathon yeah
11,0,got sent home from work i have pink eye
13,0,got shampoo in my eye now it s all red
...,...,...
1578600,0,fergusthedog thanks for the great suggestions ...
1578606,1,ferlishious it is naked a snail is homeless
1578610,0,fernandahgarcia quot atm quot is quot at this ...
1578621,1,fernfiddlehead l amp d is a world of it s own ...


In [None]:
# Display the training split (the rows where the mask is True).
train_df

Unnamed: 0,label,text
1,1,got roped into working on a fridaynight boo bo...
2,0,got rubik cube as a present now have to rememb...
3,0,got sad news at the shop today had to lay off ...
4,1,got sandwiches and at the park playing hoop wi...
6,0,got sausage links instead of sausage patties
...,...,...
1578620,1,ferncotton please play burnin up by the jonas ...
1578622,0,fernniii alot of them are moms with kids or at...
1578623,0,fernniii aww they all went to bed early
1578624,1,fernniii ok awesome if u do check it out later...


In [None]:
# Names of the text columns passed as `sentence_features` to the BERT helpers.
feature_cols = ['text']

In [None]:
# Local copies of the same models can be used instead of the TF-Hub URLs:
#   encoder:    '/home/jupyter/BertPretained/bert_en_uncased_L-12_H-768_A-12'
#   preprocess: '/home/jupyter/BertPreprocess/bert_en_uncased_preprocess'
ENCODER_HANDLE = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3'
PREPROCESS_HANDLE = 'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3'

bert_finetune = BERT_FineTune(
    handle_encoder=ENCODER_HANDLE,
    handle_preprocess=PREPROCESS_HANDLE,
)

Using TPU


In [None]:
# Fine-tune on the training split. No validation frame is passed, so `fit`
# runs without per-epoch validation.
bert_model = bert_finetune.tune(
    train_df,
    val_df=None,
    sentence_features=feature_cols,
    label='label',
    epochs=3,
    batch_size=32,
    optimizer='sgd',
    seq_length=128,
)

  inputs = self._flatten_to_reference_inputs(inputs)


Epoch 1/3


  "shape. This may consume a large amount of memory." % value)


Epoch 2/3
Epoch 3/3


In [None]:
# Build an unshuffled, non-repeating dataset from the validation split so
# that prediction order matches the row order of val_df.
bert_preprocess_model = bert_finetune.make_bert_preprocess_model(
    handle_preprocess=bert_finetune.handle_preprocess,
    sentence_features=feature_cols,
    seq_length=128,
)
test_ds, _ = bert_finetune.convert_dataset(
    df=val_df,
    batch_size=32,
    bert_preprocess_model=bert_preprocess_model,
    sentence_features=feature_cols,
    label='label',
    shuffle=False,
    repeat=False,
)

  inputs = self._flatten_to_reference_inputs(inputs)


In [None]:
# `bert_finetune.strategy` is None on a CPU-only runtime (see
# BERT_FineTune.__init__), so only enter a distribution scope when one exists.
if bert_finetune.strategy is None:
    predictions1 = bert_model.predict(test_ds)
else:
    with bert_finetune.strategy.scope():
        predictions1 = bert_model.predict(test_ds)


In [None]:
# Show the dataset's structure (element shapes and dtypes).
test_ds

<PrefetchDataset shapes: ({input_mask: (None, 128), input_type_ids: (None, 128), input_word_ids: (None, 128)}, (None,)), types: ({input_mask: tf.int32, input_type_ids: tf.int32, input_word_ids: tf.int32}, tf.int64)>

In [None]:
# Row count, dtypes and non-null counts of the validation split.
val_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 315850 entries, 0 to 1578625
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   label   315850 non-null  int64 
 1   text    315850 non-null  object
dtypes: int64(1), object(1)
memory usage: 7.2+ MB


In [None]:
from sklearn.metrics import f1_score,accuracy_score
import numpy as np

# Score the validation logits (`predictions1`). The name `predictions` is only
# assigned later, by the tweet-inference cell, so using it here either fails on
# a fresh kernel or scores the wrong model output.
pred1 = np.argmax(predictions1, axis=1)
real = val_df['label']
assert len(pred1) == len(real), "validation predictions and labels differ in length"

f1 = f1_score(real, pred1, average='macro')
a = accuracy_score(real, pred1)
print('f1',f1)
print('accuracy',a)

f1 0.8602113517735948
accuracy 0.8602184581288587


In [None]:
TWEET_COLUMNS = ['id', 'name', 'screen name', 'text', 'num_followers',
                 'num_friends', 'location', 'geo', 'time']

# glob.glob returns matches in arbitrary order; sort them so the row order of
# the combined frame (and of the labeled output) is stable across runs.
tweet_csv_files = sorted(glob.glob(
    os.path.join("/content/drive/MyDrive/DS/test", "*.csv")))
if not tweet_csv_files:
    raise FileNotFoundError(
        "No tweet CSV files found under /content/drive/MyDrive/DS/test")

# The files have no header row, so the column names are supplied here.
tweet_test = pd.concat(
    [pd.read_csv(csv_file, header=None, names=TWEET_COLUMNS)
     for csv_file in tweet_csv_files],
    axis=0, ignore_index=True)

tweet_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163656 entries, 0 to 163655
Data columns (total 9 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             163641 non-null  object
 1   name           163621 non-null  object
 2   screen name    163632 non-null  object
 3   text           163596 non-null  object
 4   num_followers  163570 non-null  object
 5   num_friends    163570 non-null  object
 6   location       163516 non-null  object
 7   geo            163534 non-null  object
 8   time           163534 non-null  object
dtypes: object(9)
memory usage: 11.2+ MB


In [None]:
# convert_dataset reads a label column, so attach a constant placeholder (0);
# missing tweet text is replaced by a single space so every row is a string.
tweet_test = tweet_test.assign(
    label=0,
    text=tweet_test['text'].fillna(' '),
)
tweet_test1 = tweet_test.loc[:, ['label', 'text']]

In [None]:
# Display the two-column (label, text) frame that is fed to the model.
tweet_test1

Unnamed: 0,label,text
0,0,RT @AirdropStario: 💧 CoinSpaceships Airdrop 💧 ...
1,0,RT @AirdropStario: 💧 CoinSpaceships Airdrop 💧 ...
2,0,Good morning
3,0,RT @iamZatoshi: Bitcoin is very much about tim...
4,0,RT @AirdropStario: 💧 DECENT Coin Airdrop 💧 🏆 ...
...,...,...
163651,0,RT @LoxNetwork: Currently under development: o...
163652,0,RT @lopezgovlaw: #Bitcoin made in #Miami-Dade:...
163653,0,Promote it on @iconictraderss
163654,0,RT @TheCryptoLark: This year we got 3 countrie...


In [None]:
# Schema of the validation frame, for comparison with tweet_test1.
val_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 315850 entries, 0 to 1578625
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   label   315850 non-null  int64 
 1   text    315850 non-null  object
dtypes: int64(1), object(1)
memory usage: 7.2+ MB


In [None]:
# Schema of the tweet frame; it has the same label/text columns as val_df.
tweet_test1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 163656 entries, 0 to 163655
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   label   163656 non-null  int64 
 1   text    163656 non-null  object
dtypes: int64(1), object(1)
memory usage: 2.5+ MB


In [None]:


# Build an unshuffled, non-repeating dataset from the tweets so that
# prediction order matches the row order of tweet_test1.
bert_preprocess_model = bert_finetune.make_bert_preprocess_model(
    handle_preprocess=bert_finetune.handle_preprocess,
    sentence_features=feature_cols,
    seq_length=128,
)
tweet_test2, _ = bert_finetune.convert_dataset(
    df=tweet_test1,
    batch_size=32,
    bert_preprocess_model=bert_preprocess_model,
    sentence_features=feature_cols,
    label='label',
    shuffle=False,
    repeat=False,
)

  inputs = self._flatten_to_reference_inputs(inputs)


In [None]:
# Display the tweet text column.
tweet_test['text']

0         RT @AirdropStario: 💧 CoinSpaceships Airdrop 💧 ...
1         RT @AirdropStario: 💧 CoinSpaceships Airdrop 💧 ...
2                                              Good morning
3         RT @iamZatoshi: Bitcoin is very much about tim...
4         RT @AirdropStario: 💧 DECENT Coin Airdrop 💧  🏆 ...
                                ...                        
163651    RT @LoxNetwork: Currently under development: o...
163652    RT @lopezgovlaw: #Bitcoin made in #Miami-Dade:...
163653                        Promote it on @iconictraderss
163654    RT @TheCryptoLark: This year we got 3 countrie...
163655    RT @purewage5: If the Seahawks beat the Rams o...
Name: text, Length: 163656, dtype: object

In [None]:
# `bert_finetune.strategy` is None on a CPU-only runtime (see
# BERT_FineTune.__init__), so only enter a distribution scope when one exists.
if bert_finetune.strategy is None:
    predictions = bert_model.predict(tweet_test2)
else:
    with bert_finetune.strategy.scope():
        predictions = bert_model.predict(tweet_test2)

# Class id = index of the largest logit; plain NumPy, no strategy scope needed.
pred = np.argmax(predictions, axis=1)

In [None]:
# Spot-check 50 of the predicted class ids.
pred[400:450]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0])

In [None]:
# Count the tweets for comparison with the number of predictions. A separate
# name is used so that `tweet_test1` stays the (label, text) DataFrame that
# convert_dataset needs; rebinding it to a Series would break a re-run of the
# dataset-building cell.
tweet_texts = tweet_test['text']
len(tweet_texts)

163656

In [None]:
# Number of prediction rows; should equal the number of tweets.
len(predictions)

163656

In [None]:
# Confirm the container type of the predicted class ids.
type(pred)

numpy.ndarray

In [None]:
# Attach the predicted class id to each tweet as a new `sentiment` column.
tweet_test = tweet_test.assign(sentiment=pred)

In [None]:
# Persist the labeled tweets to Drive (the row index is written as well).
LABELED_TWEETS_CSV = '/content/drive/MyDrive/DS/labeled_tweets.csv'
tweet_test.to_csv(path_or_buf=LABELED_TWEETS_CSV)

In [None]:
# Display the tweet frame, now including the placeholder label and the predicted sentiment.
tweet_test

Unnamed: 0,id,name,screen name,text,num_followers,num_friends,location,geo,time,label,sentiment
0,1473523403571224579,Mohammad,Mohamma03992389,RT @AirdropStario: 💧 CoinSpaceships Airdrop 💧 ...,3,76,,,Wed Dec 22 05:18:50 +0000 2021,0,1
1,1473523404628054017,MonOntor,MonOntor3,RT @AirdropStario: 💧 CoinSpaceships Airdrop 💧 ...,1,32,Asia,,Wed Dec 22 05:18:51 +0000 2021,0,1
2,1473523405781553153,Rony Ahmed,Rony4fire,Good morning,2358,3797,,,Wed Dec 22 05:18:51 +0000 2021,0,1
3,1473523406674956289,Ikka Apriyuni,winwinyess,RT @iamZatoshi: Bitcoin is very much about tim...,37,1735,,,Wed Dec 22 05:18:51 +0000 2021,0,1
4,1473523407656390656,Md Sohel Rana,MdSohelRana1757,RT @AirdropStario: 💧 DECENT Coin Airdrop 💧 🏆 ...,25,641,,,Wed Dec 22 05:18:51 +0000 2021,0,1
...,...,...,...,...,...,...,...,...,...,...,...
163651,1473470358074572801,CoinNext88,CoinNext88,RT @LoxNetwork: Currently under development: o...,400,715,Mars,,Wed Dec 22 01:48:03 +0000 2021,0,1
163652,1473470358971981827,Miami Gives,MiamiGives,RT @lopezgovlaw: #Bitcoin made in #Miami-Dade:...,3108,732,"Miami, FL",,Wed Dec 22 01:48:03 +0000 2021,0,1
163653,1473470359546437640,Kim Taylor,KimTayl25167873,Promote it on @iconictraderss,3,6,,,Wed Dec 22 01:48:04 +0000 2021,0,1
163654,1473470361128042496,Ian Castro,IanRCastro,RT @TheCryptoLark: This year we got 3 countrie...,269,924,Neverwhere,,Wed Dec 22 01:48:04 +0000 2021,0,1
