In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification

df = pd.read_csv('tweets (6).csv')

# Hugging Face ProsusAI model
pipe = pipeline("text-classification", model="ProsusAI/finbert")

tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

# Using 'Text' column for sentiment labelling
if 'label' not in df.columns:
    def get_sentiment(text):
        result = pipe(str(text))
        return result[0]['label'].lower()
    
    df['label'] = df['Text'].apply(get_sentiment)

# Balancing the classes
label_counts = df['label'].value_counts()
min_class = label_counts.idxmin()
max_class = label_counts.idxmax()

if label_counts[min_class] / label_counts[max_class] < 0.5:  # Imbalance threshold
    oversample = RandomOverSampler()
    df_balanced, _ = oversample.fit_resample(df, df['label'])
else:
    df_balanced = df

# Splitting data into train, validation and test sets
train, temp = train_test_split(df_balanced, test_size=0.3, stratify=df_balanced['label'], random_state=42)
val, test = train_test_split(temp, test_size=0.5, stratify=temp['label'], random_state=42)

# Sanity check for label distribution
def verify_labels(data, name):
    print(f"{name} set label distribution:\n", data['label'].value_counts(normalize=True))

verify_labels(train, "Training")
verify_labels(val, "Validation")
verify_labels(test, "Test")

train.to_csv("train_data.csv", index=False)
val.to_csv("val_data.csv", index=False)
test.to_csv("test_data.csv", index=False)

print("Data processing completed")

Device set to use cpu


Training set label distribution:
 label
neutral     0.333529
negative    0.333529
positive    0.332942
Name: proportion, dtype: float64
Validation set label distribution:
 label
positive    0.334247
neutral     0.334247
negative    0.331507
Name: proportion, dtype: float64
Test set label distribution:
 label
positive    0.334247
negative    0.334247
neutral     0.331507
Name: proportion, dtype: float64
Data processing completed
