In [1235]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [1236]:
data = pd.DataFrame([("i love spending time with my friends and family", "positive"),
                     ("that was the best meal i've ever had in my life", "positive"),
                     ("i feel so grateful for everything i have in my life", "positive"),
                     ("i received a promotion at work and i couldn't be happier", "positive"),
                     ("watching a beautiful sunset always fills me with joy", "positive"),
                     ("my partner surprised me with a thoughtful gift and it made my day", "positive"),
                     ("i am so proud of my daughter for graduating with honors", "positive"),
                     ("listening to my favorite music always puts me in a good mood", "positive"),
                     ("i love the feeling of accomplishment after completing a challenging task", "positive"),
                     ("i am excited to go on vacation next week", "positive"),
                     ("i feel so overwhelmed with work and responsibilities", "negative"),
                     ("the traffic during my commute is always so frustrating", "negative"),
                     ("i received a parking ticket and it ruined my day", "negative"),
                     ("i got into an argument with my partner and we're not speaking", "negative"),
                     ("i have a headache and i feel terrible", "negative"),
                     ("i received a rejection letter for the job i really wanted", "negative"),
                     ("my car broke down and it's going to be expensive to fix", "negative"),
                     ("i'm feeling sad because i miss my friends who live far away", "negative"),
                     ("i'm frustrated because i can't seem to make progress on my project", "negative"),
                     ("i'm disappointed because my team lost the game", "negative")
                    ],
                    columns=['text', 'sentiment'])

In [1237]:
data.head()

Unnamed: 0,text,sentiment
0,i love spending time with my friends and family,positive
1,that was the best meal i've ever had in my life,positive
2,i feel so grateful for everything i have in my...,positive
3,i received a promotion at work and i couldn't ...,positive
4,watching a beautiful sunset always fills me wi...,positive


In [1238]:
# SHUFFLE THE DATASET
# `drop=True` - do not keep old index as a separate column.
data = data.sample(frac = 1).reset_index(drop = True)
data.head()

Unnamed: 0,text,sentiment
0,i got into an argument with my partner and we'...,negative
1,i'm feeling sad because i miss my friends who ...,negative
2,i am so proud of my daughter for graduating wi...,positive
3,i am excited to go on vacation next week,positive
4,i'm frustrated because i can't seem to make pr...,negative


In [1239]:
# PREPARE INPUTS
col_text = data['text']
col_sentiment = data['sentiment']

In [1240]:
# TEXT VECTORISATION (BAG OF WORDS)
# Each word in the dataset is represented as a feature/column with `CountVectorizer`.
countvec = CountVectorizer()
countvec_fit = countvec.fit_transform(col_text)

bag_of_words = pd.DataFrame(
    countvec_fit.toarray(),
    columns=countvec.get_feature_names_out()
)

bag_of_words.get

<bound method NDFrame.get of     accomplishment  after  always  am  an  and  argument  at  away  be  ...  \
0                0      0       0   0   1    1         1   0     0   0  ...   
1                0      0       0   0   0    0         0   0     1   0  ...   
2                0      0       0   1   0    0         0   0     0   0  ...   
3                0      0       0   1   0    0         0   0     0   0  ...   
4                0      0       0   0   0    0         0   0     0   0  ...   
5                0      0       0   0   0    1         0   1     0   1  ...   
6                1      1       0   0   0    0         0   0     0   0  ...   
7                0      0       0   0   0    1         0   0     0   1  ...   
8                0      0       0   0   0    0         0   0     0   0  ...   
9                0      0       0   0   0    1         0   0     0   0  ...   
10               0      0       1   0   0    0         0   0     0   0  ...   
11               0     

In [1241]:
# KEY STEP: SPLIT DATA INTO TRAINING AND TEST SETS
# Training set: used to train the model
# Test set: used to evaluate the model

text_train, text_test, sentiment_train, sentiment_test = train_test_split(
    bag_of_words,
    col_sentiment,
    # 30% for testing (20-30% in practice), 70% for training
    test_size=0.3,
    # set a random state for reproducibility
    random_state=7
)

# Logistic regression

In [1242]:
# `fit` means that we train the model
lr = LogisticRegression(random_state=1).fit(text_train, sentiment_train)

In [1243]:
sentiment_pred_lr = lr.predict(text_test)
print(sentiment_pred_lr)
print(sentiment_test)

['positive' 'negative' 'positive' 'negative' 'positive' 'positive']
1     negative
17    negative
2     positive
5     positive
11    positive
0     negative
Name: sentiment, dtype: object


In [1244]:
accuracy = accuracy_score(sentiment_pred_lr, sentiment_test)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.50


In [1245]:
# DETAILED CLASSIFICATION REPORT
# It shows how the model performed for each class (precision, recall, F1-score)
# F1-score is the harmonic mean of precision and recall
# recall is the ability of the model to find all the relevant cases (true positives)
# precision is the ability of the model to return only relevant cases
print(classification_report(y_test, y_pred_lr, zero_division=0))

              precision    recall  f1-score   support

    negative       0.20      1.00      0.33         1
    positive       1.00      0.20      0.33         5

    accuracy                           0.33         6
   macro avg       0.60      0.60      0.33         6
weighted avg       0.87      0.33      0.33         6



# Naive Bayes


In [1246]:
from sklearn.naive_bayes import MultinomialNB

In [1247]:
# Train the model inputting the training data
nb = MultinomialNB().fit(text_train, sentiment_train)

In [1248]:
# Create predictions
sentiment_pred_nb = nb.predict(text_test)
print(sentiment_pred_nb)

['positive' 'negative' 'positive' 'negative' 'positive' 'positive']


In [1249]:
accuracy_nb = accuracy_score(sentiment_pred_nb, sentiment_test)
print(f"Naive Bayes Accuracy: {accuracy_nb:.2f}")

Naive Bayes Accuracy: 0.50


# Linear Support Vector Machine

In [1250]:
from sklearn.linear_model import SGDClassifier

In [1251]:
# Train the model inputting the training data
svm = SGDClassifier().fit(text_train, sentiment_train)

In [1252]:
sentiment_pred_svm = svm.predict(text_test)
print(sentiment_pred_svm)

['positive' 'positive' 'positive' 'negative' 'positive' 'positive']


In [1253]:
accuracy_svm = accuracy_score(sentiment_pred_svm, sentiment_test)
print(f"SVM Accuracy: {accuracy_svm:.2f}")

SVM Accuracy: 0.33


In [1254]:
print("Logistic Regression Classification Report:")
print(classification_report(sentiment_test, sentiment_pred_svm, zero_division=0))

Logistic Regression Classification Report:
              precision    recall  f1-score   support

    negative       0.00      0.00      0.00         3
    positive       0.40      0.67      0.50         3

    accuracy                           0.33         6
   macro avg       0.20      0.33      0.25         6
weighted avg       0.20      0.33      0.25         6

