# Build a model to classify the sentiment of the text data into positive, negative, or neutral categories.

In [1]:
# Import necessary libraries
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Load Dataset

In [3]:
# 1. Load the dataset
# Fetch the "sentiment" configuration of the tweet_eval dataset.
dataset = load_dataset("tweet_eval", "sentiment")

# Extract texts and labels
# The 'train' and 'test' splits are pooled into one corpus here and re-split in a
# later cell; any other split the dataset provides is not used.
# Each column is copied into a plain list before concatenating so that `+` is
# always list concatenation, whatever container `datasets` returns for a column.
# NOTE(review): newer `datasets` releases appear to return a lazy column object
# rather than a list for `split[column]` — confirm against the installed version.
texts = list(dataset['train']['text']) + list(dataset['test']['text'])
labels = list(dataset['train']['label']) + list(dataset['test']['label'])

In [5]:
# 2. Split into training and testing sets
# Hold out 20% of the pooled samples for evaluation; the fixed seed keeps the
# partition identical across re-runs.
split_parts = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)
x_train, x_test, y_train, y_test = split_parts

# Preprocess (TF-IDF Vectorization)

In [7]:
# 3. Vectorize using TF-IDF
# Vocabulary is capped at 10,000 features.
vectorizer = TfidfVectorizer(max_features=10_000)

# Fit on the training texts only, then reuse the fitted vectorizer for the test
# texts so no information from the held-out set influences the features.
x_train_tfidf = vectorizer.fit_transform(x_train)
x_test_tfidf = vectorizer.transform(x_test)

# Build LR Model and Train

In [9]:
# 4. Train Logistic Regression model
# max_iter is raised to 1000 to give the solver more iterations to converge.
clf = LogisticRegression(max_iter=1000)

# Fit on the TF-IDF training matrix and the matching training labels.
clf.fit(X=x_train_tfidf, y=y_train)

# Evaluate

In [19]:
# 5. Evaluate the model
# Predict labels for the held-out TF-IDF matrix.
y_pred = clf.predict(x_test_tfidf)

# Compute the metrics first, then print them.
accuracy_pct = accuracy_score(y_test, y_pred) * 100
report = classification_report(
    y_test,
    y_pred,
    # Display names for the report rows, in label order.
    target_names=['negative', 'neutral', 'positive'],
)

print("Accuracy:", accuracy_pct, "% \n")
print("Classification Report:\n", report)

Accuracy: 66.53713298791018 % 

Classification Report:
               precision    recall  f1-score   support

    negative       0.65      0.45      0.53      2258
     neutral       0.64      0.77      0.70      5322
    positive       0.72      0.65      0.68      4000

    accuracy                           0.67     11580
   macro avg       0.67      0.62      0.64     11580
weighted avg       0.67      0.67      0.66     11580

