In [4]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score

# https://huggingface.co/ahmedrachid/FinancialBERT-Sentiment-Analysis


# Improved reading the dataset with encoding handling
try:
    with open('data/sentiment140.csv', 'r', encoding='utf-8') as file:
        df = pd.read_csv(file, header=None)
except UnicodeDecodeError:
    with open('data/sentiment140.csv', 'r', encoding='latin1') as file:
        df = pd.read_csv(file, header=None)

# Clean the tweets
def clean_tweet(tweet):
    tweet = str(tweet)  # Ensure the tweet is a string
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    tweet = re.sub(r'\@\w+|\#', '', tweet)
    tweet = re.sub(r'[^\w\s]', '', tweet)
    tweet = tweet.strip().lower()
    return tweet

# Assuming text data is in the second column (index 1) and labels are in the first column (index 0)
df['tweet'] = df[1].apply(clean_tweet)
df['label'] = df[0]

# Extract features using TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(df['tweet']).toarray()
y = df['label']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

# Train the XGBoost model
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', num_class=3)  # Set num_class=3 for multi-class classification
xgb_model.fit(X_train, y_train)

# Predict on the validation set
y_pred = xgb_model.predict(X_val)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy:.4f}')


ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1], got [0 4]