In [1]:
import pandas as pd
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")

# Load the dataset
file_path = 'all-data.csv'  # Update with your file path
df = pd.read_csv(file_path, encoding='latin1')

# Rename columns for clarity
df.columns = ['Sentiment', 'Text']

# Log dataset details
logging.info(f"Dataset loaded successfully. Shape: {df.shape}")
logging.info(f"Sample data:\n{df.head()}")


2025-01-26 17:22:07,901 - INFO - Dataset loaded successfully. Shape: (4845, 2)
2025-01-26 17:22:07,915 - INFO - Sample data:
  Sentiment                                               Text
0   neutral  Technopolis plans to develop in stages an area...
1  negative  The international electronic industry company ...
2  positive  With the new production plant the company woul...
3  positive  According to the company 's updated strategy f...
4  positive  FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...


In [2]:
import re

# Define a minimal text cleaning function
def clean_text(text):
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove special characters
    return text.strip()

# Apply text cleaning
df['Cleaned_Text'] = df['Text'].apply(clean_text)
logging.info(f"Text cleaning complete. Sample cleaned data:\n{df[['Text', 'Cleaned_Text']].head()}")


2025-01-26 17:22:31,333 - INFO - Text cleaning complete. Sample cleaned data:
                                                Text  \
0  Technopolis plans to develop in stages an area...   
1  The international electronic industry company ...   
2  With the new production plant the company woul...   
3  According to the company 's updated strategy f...   
4  FINANCING OF ASPOCOMP 'S GROWTH Aspocomp is ag...   

                                        Cleaned_Text  
0  Technopolis plans to develop in stages an area...  
1  The international electronic industry company ...  
2  With the new production plant the company woul...  
3  According to the company s updated strategy fo...  
4  FINANCING OF ASPOCOMP S GROWTH Aspocomp is agg...  


In [3]:
# Map sentiments to integers
sentiment_mapping = {'positive': 2, 'neutral': 1, 'negative': 0}
df['Sentiment'] = df['Sentiment'].map(sentiment_mapping)

# Log mapping details
logging.info(f"Sentiment mapping applied: {sentiment_mapping}")


2025-01-26 17:22:43,235 - INFO - Sentiment mapping applied: {'positive': 2, 'neutral': 1, 'negative': 0}


In [4]:
from sklearn.model_selection import train_test_split

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    df['Cleaned_Text'], df['Sentiment'], test_size=0.2, random_state=42, stratify=df['Sentiment']
)

logging.info(f"Data split complete. Training samples: {len(X_train)}, Testing samples: {len(X_test)}")


2025-01-26 17:23:02,921 - INFO - Data split complete. Training samples: 3876, Testing samples: 969


In [5]:
from transformers import BertTokenizer

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenise the text data
def encode_texts(texts, tokenizer, max_len):
    return tokenizer(
        list(texts),
        max_length=max_len,
        truncation=True,
        padding='max_length',
        return_tensors='tf'
    )

# Define maximum token length
max_len = 100

# Tokenise training and testing data
train_encodings = encode_texts(X_train, tokenizer, max_len)
test_encodings = encode_texts(X_test, tokenizer, max_len)

logging.info("Tokenisation complete.")



2025-01-26 17:23:53,074 - INFO - Tokenisation complete.


In [6]:
from transformers import TFBertForSequenceClassification
import tensorflow as tf

# Load the pretrained BERT model with a classification head
model = TFBertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3  # Three sentiment classes: Positive, Neutral, Negative
)

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),  # Standard learning rate for fine-tuning
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

# Model summary
model.summary()



All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bert (TFBertMainLayer)      multiple                  109482240 
                                                                 
 dropout_37 (Dropout)        multiple                  0 (unused)
                                                                 
 classifier (Dense)          multiple                  2307      
                                                                 
Total params: 109484547 (417.65 MB)
Trainable params: 109484547 (417.65 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [7]:
# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train)).batch(16)
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(16)

logging.info("Data converted to TensorFlow datasets.")


2025-01-26 17:26:23,316 - INFO - Data converted to TensorFlow datasets.


In [8]:
# Train the model
logging.info("Starting model training...")
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=3,  # Fine-tuning typically needs only a few epochs
    batch_size=16
)
logging.info("Model training complete.")


2025-01-26 17:26:36,122 - INFO - Starting model training...


Epoch 1/3






Epoch 2/3
Epoch 3/3


2025-01-26 20:38:25,705 - INFO - Model training complete.


In [9]:
# Evaluate the model
logging.info("Evaluating model on test data...")
results = model.evaluate(test_dataset)
logging.info(f"BERT Test Accuracy: {results[1] * 100:.2f}%")


2025-01-26 20:38:37,969 - INFO - Evaluating model on test data...




2025-01-26 20:44:26,187 - INFO - BERT Test Accuracy: 84.52%


In [10]:
# Save the fine-tuned model
model.save_pretrained("bert_sentiment_model")
tokenizer.save_pretrained("bert_sentiment_model")
logging.info("Model and tokenizer saved to 'bert_sentiment_model' directory.")


2025-01-26 20:51:21,674 - INFO - Model and tokenizer saved to 'bert_sentiment_model' directory.


In [11]:
# Prediction function for BERT
def predict_sentiment_bert(text, tokenizer, model, max_len=100):
    # Tokenise and encode input text
    inputs = tokenizer(
        text,
        max_length=max_len,
        truncation=True,
        padding='max_length',
        return_tensors='tf'
    )
    
    # Make predictions
    outputs = model(inputs)
    logits = outputs.logits
    predicted_class = tf.argmax(logits, axis=1).numpy()[0]
    
    # Map predictions to sentiment labels
    sentiment_labels = {0: "Negative", 1: "Neutral", 2: "Positive"}
    return sentiment_labels[predicted_class]

# Test the model
sample_texts = [
    "The company's profits have surged this quarter.",  # Positive
    "The company is facing significant challenges this year.",  # Negative
    "The financial results are average, with no major surprises."  # Neutral
]

for text in sample_texts:
    prediction = predict_sentiment_bert(text, tokenizer, model)
    logging.info(f"Sample Text: {text}")
    logging.info(f"Predicted Sentiment: {prediction}")


2025-01-26 20:51:28,690 - INFO - Sample Text: The company's profits have surged this quarter.
2025-01-26 20:51:28,691 - INFO - Predicted Sentiment: Positive
2025-01-26 20:51:29,706 - INFO - Sample Text: The company is facing significant challenges this year.
2025-01-26 20:51:29,707 - INFO - Predicted Sentiment: Neutral
2025-01-26 20:51:31,099 - INFO - Sample Text: The financial results are average, with no major surprises.
2025-01-26 20:51:31,103 - INFO - Predicted Sentiment: Neutral
