In [30]:
# import libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, f1_score


In [31]:
# Read the training and test CSV files. Both are given the same two column
# names through `names`, so the frames expose `label` and `text` columns.
column_names = ['label', 'text']
fulltrain = pd.read_csv("fulltrain.csv", names=column_names)
balancedtest = pd.read_csv("balancedtest.csv", names=column_names)

# Sanity check: report how many rows were loaded from each file.
print(f"fulltrain:  Loaded {len(fulltrain.index)} rows")
print(f"balancedtest: Loaded {len(balancedtest.index)} rows")

fulltrain:  Loaded 48854 rows
balancedtest: Loaded 3000 rows


In [32]:
# Split the training frame into raw documents and their labels.
# The raw text gets its own name so that `trainX` only ever refers to the
# TF-IDF matrix consumed by the model-fitting cell.
train_text = fulltrain['text']
trainY = fulltrain['label']

# Feature engineering (TF-IDF): learn the vocabulary / IDF weights and build
# the document-term matrix in a single pass. fit_transform() yields the same
# matrix as fit() followed by transform(), without processing the corpus twice.
# The fitted `vectorizer` is reused later to transform the test documents.
vectorizer = TfidfVectorizer()
trainX = vectorizer.fit_transform(train_text)

# Shapes of the feature matrix and the label vector (row counts must match).
print(trainX.shape)
print(trainY.shape)

(48854, 229597)
(48854,)


In [33]:
# Fit the classifier on the TF-IDF *training* features (trainX / trainY).
# Per the original author's notes, these hyperparameters were selected with
# sklearn's GridSearchCV (the search itself is not shown in this notebook),
# and max_iter was raised to 5000 because the default left the model underfit.
logreg_params = {
    'C': 10,
    'penalty': 'l2',
    'solver': 'liblinear',
    'max_iter': 5000,
}
model = LogisticRegression(**logreg_params)
# Kept as the cell's last expression so the fitted estimator is displayed.
model.fit(trainX, trainY)


LogisticRegression(C=10, max_iter=5000, solver='liblinear')

In [34]:
# Prepare the evaluation set: labels are taken as-is, and the documents are
# mapped into feature space with the already-fitted vectorizer (transform
# only, no re-fitting on the test text).
testY = balancedtest['label']
testX = vectorizer.transform(balancedtest['text'])

In [35]:
# Evaluate the fitted model on the test features.
result = model.predict(testX)

# Per-class precision / recall / F1 table, followed by the macro-averaged F1
# (unweighted mean of the per-class F1 scores) printed as a single number.
print(classification_report(testY, result))
print(f1_score(y_true=testY, y_pred=result, average="macro"))

              precision    recall  f1-score   support

           1       0.87      0.80      0.83       750
           2       0.82      0.39      0.53       750
           3       0.57      0.83      0.68       750
           4       0.80      0.93      0.86       750

    accuracy                           0.74      3000
   macro avg       0.77      0.74      0.72      3000
weighted avg       0.77      0.74      0.72      3000

0.7247357875908803
