In [1]:
# Import libraries we need
# We use pandas to work with data in a table (like Excel).
# We also use NLTK for Natural Language Processing tools.

import nltk
import pandas as pd
from nltk.sentiment import SentimentIntensityAnalyzer




In [2]:
# Step 1: Create a small text dataset (a mini "corpus") for sentiment analysis


# The six example sentences that make up our mini corpus
texts = [
    "I absolutely loved the lecture today!",
    "The assignment instructions were confusing and stressful.",
    "The lab was okay, not too hard, not too easy.",
    "This course is amazing — I’m learning a lot.",
    "I hate long commutes, but the class itself is great.",
    "The quiz was unfair and I am disappointed.",
]

# Each text gets a 1-based running document id
data = {
    "doc_id": list(range(1, len(texts) + 1)),
    "text": texts,
}

# Turn the dictionary into a DataFrame: one row per document, one column per key
df = pd.DataFrame(data)

# Show the table we just built
df


Unnamed: 0,doc_id,text
0,1,I absolutely loved the lecture today!
1,2,The assignment instructions were confusing and...
2,3,"The lab was okay, not too hard, not too easy."
3,4,This course is amazing — I’m learning a lot.
4,5,"I hate long commutes, but the class itself is ..."
5,6,The quiz was unfair and I am disappointed.


In [4]:
# Step 2: Download and load VADER sentiment resources
# VADER stands for: Valence Aware Dictionary and sEntiment Reasoner.

# nltk.download(...) downloads resources (like installing a package/data file).
# It does NOT compute sentiment by itself. We still need to create an analyzer and call it on text.

# Fetch the VADER lexicon data file. This only makes the resource available;
# no text is analyzed here.
nltk.download("vader_lexicon")

# SentimentIntensityAnalyzer is imported in the imports cell at the top of the
# notebook, so every dependency is visible in one place.
# Create the sentiment analyzer object (done after the download above so the
# lexicon is available when the analyzer is built).
analyzer = SentimentIntensityAnalyzer()


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [5]:
# Step 3: Compute sentiment scores for each text
# VADER returns 4 scores:
# - 'neg' : proportion of negative sentiment
# - 'neu' : proportion of neutral sentiment
# - 'pos' : proportion of positive sentiment
# - 'compound' : overall sentiment score from -1 (very negative) to +1 (very positive)
#
# We will create new columns in our DataFrame for these scores.

# Score every text with VADER; each call returns one dictionary of scores
scores = df["text"].apply(analyzer.polarity_scores)

# 'scores' is a Series of dictionaries that carries the same index as df.
# Expand it into a DataFrame (each key becomes a column) and reuse that index
# explicitly, so every score row keeps the label of the text it came from.
scores_df = pd.DataFrame(scores.tolist(), index=scores.index)

# Join the scores back to the original df.
# pd.concat(axis=1) aligns rows by index label, not by position; because
# scores_df shares df's index, this stays correct even if df was filtered,
# sorted, or otherwise no longer has a plain 0..n-1 index.
df_scored = pd.concat([df, scores_df], axis=1)

# Display results
df_scored


Unnamed: 0,doc_id,text,neg,neu,pos,compound
0,1,I absolutely loved the lecture today!,0.0,0.471,0.529,0.6689
1,2,The assignment instructions were confusing and...,0.51,0.49,0.0,-0.6369
2,3,"The lab was okay, not too hard, not too easy.",0.191,0.555,0.254,-0.0541
3,4,This course is amazing — I’m learning a lot.,0.0,0.612,0.388,0.5859
4,5,"I hate long commutes, but the class itself is ...",0.157,0.467,0.377,0.6486
5,6,The quiz was unfair and I am disappointed.,0.554,0.446,0.0,-0.7351


In [6]:
# Step 4: Create a simple sentiment label
# A common beginner-friendly rule using VADER compound score:
# - compound >=  0.05  -> Positive
# - compound <= -0.05  -> Negative
# - otherwise          -> Neutral
#
# These thresholds are widely used as a simple default for VADER.

def label_sentiment(compound_score: float, threshold: float = 0.05) -> str:
    """Turn a VADER compound score into a coarse sentiment label.

    Args:
        compound_score: Overall sentiment score, from -1 (very negative)
            to +1 (very positive).
        threshold: Non-negative cutoff separating neutral from polar text.
            Scores at or above ``threshold`` are "Positive"; scores at or
            below ``-threshold`` are "Negative". Defaults to 0.05.

    Returns:
        "Positive", "Negative", or "Neutral".

    Raises:
        ValueError: If ``threshold`` is negative, which would make the
            positive and negative ranges overlap.
    """
    if threshold < 0:
        raise ValueError("threshold must be non-negative")

    if compound_score >= threshold:
        return "Positive"
    if compound_score <= -threshold:
        return "Negative"
    # Everything strictly inside (-threshold, threshold) lands here. A NaN score
    # also lands here, because both comparisons above are False for NaN.
    return "Neutral"

# Label each row from its compound score and store the result in a new
# 'sentiment_label' column
compound_scores = df_scored["compound"]
df_scored["sentiment_label"] = compound_scores.map(label_sentiment)

# Show the columns that matter for checking the labels (all rows)
summary_columns = ["doc_id", "text", "compound", "sentiment_label"]
df_scored[summary_columns]

Unnamed: 0,doc_id,text,compound,sentiment_label
0,1,I absolutely loved the lecture today!,0.6689,Positive
1,2,The assignment instructions were confusing and...,-0.6369,Negative
2,3,"The lab was okay, not too hard, not too easy.",-0.0541,Negative
3,4,This course is amazing — I’m learning a lot.,0.5859,Positive
4,5,"I hate long commutes, but the class itself is ...",0.6486,Positive
5,6,The quiz was unfair and I am disappointed.,-0.7351,Negative


In [7]:
# Step 5: Let's see how many texts are Positive/Neutral/Negative.

# Tally how often each label occurs (most frequent first); labels that never
# occur do not appear in the result.
sentiment_labels = df_scored["sentiment_label"]
label_counts = sentiment_labels.value_counts()
label_counts


Unnamed: 0_level_0,count
sentiment_label,Unnamed: 1_level_1
Positive,3
Negative,3
