In [4]:
# Sentiment Analysis - Classical NLP Pipeline
# Models: Logistic Regression, Naive Bayes, Linear SVM
# Purpose: Demonstrate end-to-end text classification workflow

In [5]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [30]:
# Create a small labeled dataset

In [31]:
# This dataset is intentionally small to demonstrate
# the full NLP pipeline rather than optimize accuracy.
# Each row contains a short sentence and its sentiment label.

data = pd.DataFrame(
    [
        ("i really enjoyed collaborating with my team on this project", "positive"),
        ("the presentation went smoothly and i felt confident throughout", "positive"),
        ("i finally solved a problem that had been bothering me all week", "positive"),
        ("receiving positive feedback from my manager made my day", "positive"),
        ("i felt relaxed after taking a long walk in the park", "positive"),
        ("the new feature worked exactly as expected", "positive"),
        ("i was happy to help a colleague who was struggling", "positive"),
        ("finishing my tasks early gave me a sense of relief", "positive"),
        ("the meeting was productive and well organized", "positive"),
        ("i felt motivated after learning something new", "positive"),

        ("i felt exhausted after working late again", "negative"),
        ("the system kept failing and delayed my work", "negative"),
        ("i was frustrated by unclear requirements", "negative"),
        ("waiting for a response took longer than expected", "negative"),
        ("the bug reappeared even after multiple fixes", "negative"),
        ("i felt stressed trying to meet a tight deadline", "negative"),
        ("the meeting felt unproductive and confusing", "negative"),
        ("i was disappointed by the lack of communication", "negative"),
        ("dealing with repeated issues was draining", "negative"),
        ("i felt discouraged after the project was canceled", "negative"),
    ],
    columns=["text", "sentiment"]
)

In [32]:
# Shuffle the dataset to avoid any ordering bias

data = data.sample(frac=1).reset_index(drop=True)

In [33]:
# Separate features (X) and labels (y)

x = data['text']
y = data['sentiment']

In [34]:
# Convert text into numerical features using Bag-of-Words

In [35]:
# CountVectorizer converts each sentence into a vector
# where each column represents a word and each value
# represents how many times that word appears.

countvec = CountVectorizer()

In [36]:
countvec_fit = countvec.fit_transform(x)

In [37]:
bag_of_words = pd.DataFrame(
    countvec_fit.toarray(),
    columns = countvec.get_feature_names_out()
)

In [12]:
bag_of_words

Unnamed: 0,after,again,all,and,as,been,bothering,bug,by,canceled,...,walk,was,week,well,went,who,with,work,worked,working
0,0,0,0,0,0,0,0,0,0,0,...,0,2,0,0,0,1,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,1,0,1,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,1,0,0,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
8,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
9,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Train / Test split

In [38]:
# 70% training data, 30% testing data
# random_state is fixed for reproducibility

X_train, X_test, y_train, y_test = train_test_split(
    bag_of_words, 
    y, 
    test_size=0.3, 
    random_state=7
)

## Logistic Regression Model

In [49]:
# Logistic Regression learns a linear decision boundary
# by optimizing probabilities for each class.

In [50]:
lr = LogisticRegression(random_state=1).fit(X_train, y_train)

In [51]:
y_pred_lr = lr.predict(X_test)

In [52]:
accuracy_score(y_pred_lr, y_test)

0.5

In [53]:
print(classification_report(
    y_test,
    y_pred_lr,
    zero_division=0)
     )

              precision    recall  f1-score   support

    negative       0.40      1.00      0.57         2
    positive       1.00      0.25      0.40         4

    accuracy                           0.50         6
   macro avg       0.70      0.62      0.49         6
weighted avg       0.80      0.50      0.46         6



## Naive Bayes

In [54]:
# Multinomial Naive Bayes is commonly used for text classification.
# It uses word frequency statistics and assumes word independence.

In [55]:
from sklearn.naive_bayes import MultinomialNB

In [56]:
nb = MultinomialNB().fit(X_train, y_train)

In [57]:
y_pred_nb = nb.predict(X_test)

In [63]:
accuracy_score(y_pred_nb, y_test)

0.3333333333333333

## Linear Support Vector Machine (Linear SVM)

In [64]:
# SGDClassifier is used here to train a linear SVM efficiently.
# Linear SVM attempts to find the maximum-margin boundary
# between positive and negative classes.

In [65]:
from sklearn.linear_model import SGDClassifier

In [66]:
svm = SGDClassifier().fit(X_train, y_train)

In [67]:
y_pred_svm = svm.predict(X_test)

In [68]:
accuracy_score(y_pred_svm, y_test)

0.5

In [69]:
# Notes:
# - Accuracy is relatively low due to the very small dataset.
# - This is expected and highlights the importance of:
#   * More data
#   * Better feature engineering (e.g., TF-IDF, n-grams)
#   * Improved text preprocessing
# - The goal of this project is to demonstrate the full
#   NLP workflow rather than maximize model performance.