In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


df = pd.read_csv("train.txt", sep=';', header=None, names=['text', 'emotion'])

df['label'] = df['emotion'].astype('category').cat.codes


X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42)


vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


model = LogisticRegression(max_iter=1000)
model.fit(X_train_vec, y_train)


print(classification_report(y_test, model.predict(X_test_vec)))


              precision    recall  f1-score   support

           0       0.89      0.83      0.86       427
           1       0.87      0.80      0.83       397
           2       0.83      0.96      0.89      1021
           3       0.89      0.65      0.75       296
           4       0.90      0.94      0.92       946
           5       0.90      0.53      0.67       113

    accuracy                           0.87      3200
   macro avg       0.88      0.78      0.82      3200
weighted avg       0.87      0.87      0.87      3200

