In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

In [2]:
# Load Twitter data from a CSV file
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Datasets/Twitter_Data.csv')

In [3]:
data.shape

(162980, 2)

In [4]:
# Check the first few rows of the dataset
data.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [5]:
# Dataset description
data.describe()

Unnamed: 0,category
count,162973.0
mean,0.225436
std,0.781279
min,-1.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [6]:
null_values = data.isnull().sum()

In [7]:
data.isnull()

Unnamed: 0,clean_text,category
0,False,False
1,False,False
2,False,False
3,False,False
4,False,False
...,...,...
162975,False,False
162976,False,False
162977,False,False
162978,False,False


In [3]:
# Fill missing values in 'clean_text' column with a placeholder
data['clean_text'].fillna("Placeholder", inplace=True)

In [4]:
# Preprocess the data
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
X = data['clean_text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True, max_length=128, truncation=True))
y = data['category']

In [5]:
# Flatten sequences and convert them to NumPy arrays
X = np.array([np.array(xi) for xi in X])

# Convert labels to 0 or 1
y = y.apply(lambda x: 0 if x == 'Negative' else 1)
y = np.array(y.tolist())

  X = np.array([np.array(xi) for xi in X])


In [6]:
# Pad sequences to a consistent length
max_length = max(len(seq) for seq in X)
X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen=max_length, padding='post')

In [7]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Load pre-trained BERT model for sequence classification
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
# Load pre-trained BERT model for sequence classification
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

In [10]:
# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((X_train, y_train)).batch(32)
test_dataset = tf.data.Dataset.from_tensor_slices((X_test, y_test)).batch(32)

In [None]:
# Train the model
history = model.fit(train_dataset, epochs=1, validation_data=test_dataset)

  96/4075 [..............................] - ETA: 45:05:33 - loss: 0.0329 - accuracy: 0.9941

In [None]:

# Evaluate the model
y_pred = model.predict(test_dataset)
y_pred_labels = np.argmax(y_pred[0], axis=1)
accuracy = accuracy_score(y_test, y_pred_labels)

print(f"Accuracy: {accuracy}")
print(classification_report(y_test, y_pred_labels))

# Plot accuracy over epochs
plt.plot(history.history['accuracy'], label='Training Accuracy')
plt.plot(history.history['val_accuracy'], label='Validation Accuracy')
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.legend()
plt.show()