In [54]:
import kagglehub

# Handle of the Kaggle dataset this notebook works with.
DATASET_HANDLE = "michiard/sentiment-analysis-dataset"

# Download the latest version; `path` is the local location of the dataset files.
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/sentiment-analysis-dataset


In [55]:
import os
import pandas as pd

# Show which files the dataset directory contains before loading anything.
print(os.listdir(path))

# Build the file path with os.path.join instead of string concatenation so the
# separator is handled by the standard library, however `path` happens to end.
train_df = pd.read_csv(os.path.join(path, 'train.csv'))

# Quick look at the first rows of the training data.
print(train_df.head())

['sample_submission.csv', 'train.csv', 'test.csv']
       textID                                               text  \
0  28ac06f416                        good luck with your auction   
1  92098cf9a7  Hmm..You can`t judge a book by looking at its ...   
2  7858ff28f2   Hello, yourself. Enjoy London. Watch out for ...   
3  b0c9c67f32         We can`t even call you from belgium  sucks   
4  7b36e9e7a5                                 not so good mood..   

                                       selected_text sentiment  
0                        good luck with your auction  positive  
1  Hmm..You can`t judge a book by looking at its ...   neutral  
2                                    They`re mental.  negative  
3                                            m  suck  negative  
4                                 not so good mood..  negative  


# Preprocessing

## Checking null values

In [57]:
# Count the missing values in each column (isna is the canonical name of isnull).
missing_per_column = train_df.isna().sum()
print(missing_per_column)

textID           0
text             0
selected_text    0
sentiment        0
dtype: int64


## Converting text to lowercase to ensure uniformity

In [58]:
# Lower-case every entry of the text column with the vectorised string method.
lowercased_text = train_df["text"].str.lower()
train_df["text"] = lowercased_text

## Removing irrelevant content (URLs, @mentions, #hashtags, and extra whitespace)

In [None]:
import re

def clean_text(text):
    """Strip URLs, @mentions, #hashtags and redundant whitespace from one text.

    Args:
        text: The raw text. Missing values (``None`` or a float NaN, which is
            what pandas produces for an empty CSV field) are treated as empty
            text; any other non-string value is converted with ``str()``.

    Returns:
        str: The cleaned text, with every run of whitespace collapsed to a
        single space and no leading or trailing whitespace.
    """
    if not isinstance(text, str):
        # re.sub raises TypeError on non-strings, so normalise the input first.
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ""
        text = str(text)

    text = re.sub(r"http\S+", "", text)           # Remove URLs
    text = re.sub(r"@\w+", "", text)              # Remove mentions
    text = re.sub(r"#\w+", "", text)              # Remove hashtags
    text = re.sub(r"\s+", " ", text).strip()      # Remove extra whitespaces
    return text

# Run clean_text over every entry of the text column, replacing it in place.
train_df["text"] = train_df["text"].map(clean_text)

## Tokenization

`punkt` is a pretrained sentence-tokenizer model provided by NLTK (Natural Language Toolkit). `nltk.word_tokenize` relies on it to split the text into sentences before breaking each sentence into word and punctuation tokens.

Without it, `word_tokenize` cannot run: NLTK raises a `LookupError` asking you to download the missing resource.

In [59]:
import nltk
from nltk.tokenize import word_tokenize

# Make sure the 'punkt' resource is available locally before tokenizing.
nltk.download('punkt')

# Turn every entry of the 'text' column into a list of tokens.
train_df['tokens'] = train_df['text'].map(word_tokenize)

# Show the text next to its tokens for the first few rows.
print(train_df[['text', 'tokens']].head())


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


                                                text  \
0                        good luck with your auction   
1  hmm..you can`t judge a book by looking at its ...   
2   hello, yourself. enjoy london. watch out for ...   
3         we can`t even call you from belgium  sucks   
4                                 not so good mood..   

                                              tokens  
0                  [good, luck, with, your, auction]  
1  [hmm, .., you, can, `, t, judge, a, book, by, ...  
2  [hello, ,, yourself, ., enjoy, london, ., watc...  
3  [we, can, `, t, even, call, you, from, belgium...  
4                          [not, so, good, mood, ..]  


In [60]:
!pip install nltk
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

nltk.download('vader_lexicon')




[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /usr/share/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [61]:
# Create the VADER analyzer once; get_sentiment_scores below reuses this instance.
sia = SentimentIntensityAnalyzer()
def get_sentiment_scores(text, threshold=0.05):
    """Label one text as positive/negative/neutral using the VADER compound score.

    Relies on the module-level ``sia`` analyzer.

    Args:
        text: The text to score.
        threshold: Cut-off applied to the compound score. A compound score at
            or above ``threshold`` is 'positive', at or below ``-threshold``
            is 'negative', and anything in between is 'neutral'.
            Defaults to 0.05.

    Returns:
        tuple: ``(polarity, intensity, scores)`` where ``polarity`` is the
        label described above, ``intensity`` is the absolute value of the
        compound score, and ``scores`` is the dict returned by
        ``sia.polarity_scores``.
    """
    scores = sia.polarity_scores(text)
    # scores example: {'neg': 0.0, 'neu': 0.453, 'pos': 0.547, 'compound': 0.5719}
    compound = scores['compound']

    # Determine polarity from the compound score
    if compound >= threshold:
        polarity = 'positive'
    elif compound <= -threshold:
        polarity = 'negative'
    else:
        polarity = 'neutral'

    # Intensity is the magnitude of the compound score, whatever its sign
    intensity = abs(compound)

    return polarity, intensity, scores

# Score every text once, then build all three columns with a single DataFrame
# construction. Creating a pd.Series per row inside apply is much slower and
# does not yield three columns when the frame is empty.
sentiment_columns = ['polarity', 'intensity', 'sentiment_scores']
train_df[sentiment_columns] = pd.DataFrame(
    train_df['text'].map(get_sentiment_scores).tolist(),
    index=train_df.index,  # keep the rows aligned with train_df
    columns=sentiment_columns,
)


In [62]:
# Preview the first five rows, including the new sentiment columns.
train_df.head(n=5)

Unnamed: 0,textID,text,selected_text,sentiment,tokens,polarity,intensity,sentiment_scores
0,28ac06f416,good luck with your auction,good luck with your auction,positive,"[good, luck, with, your, auction]",positive,0.7096,"{'neg': 0.0, 'neu': 0.337, 'pos': 0.663, 'comp..."
1,92098cf9a7,hmm..you can`t judge a book by looking at its ...,Hmm..You can`t judge a book by looking at its ...,neutral,"[hmm, .., you, can, `, t, judge, a, book, by, ...",neutral,0.0,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
2,7858ff28f2,"hello, yourself. enjoy london. watch out for ...",They`re mental.,negative,"[hello, ,, yourself, ., enjoy, london, ., watc...",positive,0.4939,"{'neg': 0.0, 'neu': 0.758, 'pos': 0.242, 'comp..."
3,b0c9c67f32,we can`t even call you from belgium sucks,m suck,negative,"[we, can, `, t, even, call, you, from, belgium...",negative,0.3612,"{'neg': 0.263, 'neu': 0.737, 'pos': 0.0, 'comp..."
4,7b36e9e7a5,not so good mood..,not so good mood..,negative,"[not, so, good, mood, ..]",negative,0.3865,"{'neg': 0.466, 'neu': 0.534, 'pos': 0.0, 'comp..."
