#Imports and Installs

In [None]:
pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.27.3-py3-none-any.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m48.1 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.3-py3-none-any.whl (199 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.8/199.8 KB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.3 tokenizers-0.13.2 transformers-4.27.3


In [None]:
import pandas as pd
from transformers import DistilBertTokenizerFast
import tensorflow as tf
from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments
from sklearn.model_selection import train_test_split
import os
import math
import json
import chardet
import numpy as np

In [None]:
from google.colab import drive

# Attach Google Drive to the runtime's filesystem; the training CSV is read
# from under this mount point later in the notebook.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


#Training

In [None]:
# REPRODUCIBILITY
# Seed Python, NumPy and TensorFlow so the data split, the shuffle order and the
# newly initialised classifier head are repeatable from one run to the next.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

# SETTING UP TOKENIZER
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

# LOADING DATA
input_data = pd.read_csv('/content/drive/MyDrive/data_khaled.csv')

# SUBSETTING: only the first half of the file is used for this run.
# Take an explicit copy so the label recoding below writes to an independent
# frame rather than to a slice of another frame (SettingWithCopyWarning).
data = input_data[['Tweet Text', 'Informativeness']].iloc[:len(input_data) // 2].copy()

# Rows without text cannot be tokenized and rows without a label cannot be
# trained on, so drop them before encoding.
data = data.dropna(subset=['Tweet Text', 'Informativeness'])

# RECODING LABELS to integer class ids.
# Fail loudly on any label outside the two expected values instead of letting
# a stray string or NaN reach the tensor conversion / loss.
LABEL_TO_ID = {'not_informative': 0, 'informative': 1}
unknown_labels = set(data['Informativeness']) - set(LABEL_TO_ID)
if unknown_labels:
    raise ValueError(
        f"Unexpected Informativeness values: {sorted(map(repr, unknown_labels))}"
    )
data['Informativeness'] = data['Informativeness'].map(LABEL_TO_ID).astype('int64')
print(f"Using {len(data)} of {len(input_data)} rows")

# SPLITTING DATA INTO TRAINING, VALIDATION AND TESTING
# Train: 60%, Validation: 20%, Test: 20%
# (0.25 of the remaining 80% gives the 20% validation share.)
xtrain, xtest, ytrain, ytest = train_test_split(
    data['Tweet Text'], data['Informativeness'], test_size=0.2, random_state=SEED
)
xtrain, xval, ytrain, yval = train_test_split(
    xtrain, ytrain, test_size=0.25, random_state=SEED
)

# ENCODING DATA
train_encodings = tokenizer(xtrain.tolist(), truncation=True, padding=True)
val_encodings = tokenizer(xval.tolist(), truncation=True, padding=True)
test_encodings = tokenizer(xtest.tolist(), truncation=True, padding=True)

# LOADING DATA INTO tf.data.Dataset OBJECTS of (features dict, label) pairs
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), ytrain.tolist()))
val_dataset = tf.data.Dataset.from_tensor_slices((dict(val_encodings), yval.tolist()))
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), ytest.tolist()))

# TRAINING MODEL
# num_labels is spelled out to match the two classes in LABEL_TO_ID; the loss
# below consumes the raw logits, hence from_logits=True.
model = TFDistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=len(LABEL_TO_ID)
)
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
model.compile(optimizer=optimizer, loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), metrics=['accuracy'])
NUM_EPOCHS = 1
BATCH_SIZE = 90
NUM_TRAIN_BATCHES = int(math.ceil(len(train_dataset) / BATCH_SIZE))
NUM_VAL_BATCHES = int(math.ceil(len(val_dataset) / BATCH_SIZE))

history = model.fit(
    # Shuffle buffer spans the whole training set, i.e. a full shuffle.
    train_dataset.shuffle(len(train_dataset)).batch(BATCH_SIZE),
    validation_data=val_dataset.batch(BATCH_SIZE),
    epochs=NUM_EPOCHS,
    steps_per_epoch=NUM_TRAIN_BATCHES,
    validation_steps=NUM_VAL_BATCHES
)

# TESTING MODEL on the held-out 20%
test_loss, test_acc = model.evaluate(test_dataset.batch(BATCH_SIZE))
print("Test loss:", test_loss)
print("Test accuracy:", test_acc)


125850


Some layers from the model checkpoint at distilbert-base-uncased were not used when initializing TFDistilBertForSequenceClassification: ['vocab_projector', 'vocab_layer_norm', 'vocab_transform', 'activation_13']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier', 'dropout_19', 'pre_classifier']
You should probably TRAIN this model on a down-stream task to be able to use i

Test loss: 0.137643501162529
Test accuracy: 0.9510726928710938


Saving Model

In [None]:
# Export the trained classifier to the runtime's local disk.
SAVED_MODEL_DIR = '/content/saved_model'
model.save(SAVED_MODEL_DIR)

#Testing

Loading the collected tweets from JSON and extracting only the tweet text column

In [None]:
# Read the collected tweets. Decode explicitly as UTF-8 so the result does not
# depend on the runtime's locale default; the tweet text contains non-ASCII
# characters (e.g. curly quotes).
with open('/content/tweet_dicts.json', encoding='utf-8') as f:
    raw_data = f.read()

test_data = json.loads(raw_data)

# Tabulate the parsed records and keep only the tweet bodies for inference.
# NOTE: this rebinds `input_data`, which held the training CSV before this cell.
input_data = pd.DataFrame(test_data)
test_tweets = input_data['text'].tolist()

Running the trained model on the loaded tweets

In [None]:
# Tokenize the unlabeled tweets with the same tokenizer used for training
real_data_encodings = tokenizer(test_tweets, padding=True, truncation=True)
# Wrap the encodings in a tf.data.Dataset (features only, no labels)
dataset = tf.data.Dataset.from_tensor_slices(dict(real_data_encodings))
# Run the model over the batched dataset
predictions = model.predict(dataset.batch(BATCH_SIZE))
# First element of the output tuple is taken as the per-class scores
# (presumably the logits -- confirm against the model's output type);
# the predicted label is the index of the largest score in each row.
logits = predictions.to_tuple()[0]
predicted_labels = np.argmax(logits, axis=1)



In [None]:
#creating a dataframe of the detected tweets and their labels
d = {'Tweets':test_tweets, 'Label': predicted_labels.tolist() }
results = pd.DataFrame(d)

In [None]:
# Show only the tweets the model labelled 1
is_positive = results['Label'] == 1
results.loc[is_positive]

Unnamed: 0,Tweets,Label
9,Iran is doubling down on military support for ...,1
20,Following Donald Trump – Hungary will Move its...,1
31,Warning that energy crisis could ‘collapse’ Du...,1
45,"After fleeing war, homelessness threatens UK’s...",1
50,BREAKING NEWS FOR NO BRAINS COCK SUCKER PAK PM...,1
62,Honduran illegal immigrant who pretended to be...,1
66,Warning that energy crisis could ‘collapse’ Du...,1
68,From prison to the frontlines: Thousands of Ru...,1
71,Deputy gangs a 'cancer' within the Los Angeles...,1
74,War in Ukraine 03 03 2023a https://t.co/2UYgue...,1


In [None]:
# Full text of the tweet at row position 108 (the table view truncates it)
results['Tweets'].iloc[108]

'Who’d a guessed China might be the be ones to broker peace between Ukraine &amp;Russia? \n\nProbably driven more by the drop in productivity &amp; a global recession causing sales of their export goods to drop and their economy  to slow than some other altruistic motive, but I’ll take it. https://t.co/3UqGiR2rOx'

#Saving Model to Drive

In [None]:
!cp '/content/my_model_weights.h5' /content/drive/MyDrive/Bin_Backup/my_model_weights.h5