In [None]:
import pandas as pd

# Location of the raw sentiment CSV, relative to the notebook's working directory.
DATA_PATH = "../data/sentimentdataset.csv"

# Read the dataset into a DataFrame and preview its first rows.
df = pd.read_csv(DATA_PATH)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: './data/sentimentdataset.csv'

In [None]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732 entries, 0 to 731
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0.1  732 non-null    int64  
 1   Unnamed: 0    732 non-null    int64  
 2   Text          732 non-null    object 
 3   Sentiment     732 non-null    object 
 4   Timestamp     732 non-null    object 
 5   User          732 non-null    object 
 6   Platform      732 non-null    object 
 7   Hashtags      732 non-null    object 
 8   Retweets      732 non-null    float64
 9   Likes         732 non-null    float64
 10  Country       732 non-null    object 
 11  Year          732 non-null    int64  
 12  Month         732 non-null    int64  
 13  Day           732 non-null    int64  
 14  Hour          732 non-null    int64  
dtypes: float64(2), int64(6), object(7)
memory usage: 85.9+ KB


In [None]:
# Count how many rows carry each raw sentiment label.
# NOTE(review): the result lists 279 distinct labels for 732 rows, and the
# classification report further down shows labels that differ only in
# surrounding whitespace — the raw labels look padded; confirm before modeling.
df['Sentiment'].value_counts()


Sentiment
Positive               44
Joy                    42
Excitement             32
Happy                  14
Neutral                14
                       ..
Vibrancy                1
Culinary Adventure      1
Mesmerizing             1
Thrilling Journey       1
Winter Magic            1
Name: count, Length: 279, dtype: int64

In [None]:
# Count missing values per column.
df.isnull().sum()

Unnamed: 0.1    0
Unnamed: 0      0
Text            0
Sentiment       0
Timestamp       0
User            0
Platform        0
Hashtags        0
Retweets        0
Likes           0
Country         0
Year            0
Month           0
Day             0
Hour            0
dtype: int64

In [None]:
# Print the first five raw posts, each followed by a separator line.
for idx in range(5):
    raw_text = df["Text"][idx]
    print(raw_text, "------", sep="\n")

 Enjoying a beautiful day at the park!              
------
 Traffic was terrible this morning.                 
------
 Just finished an amazing workout! üí™               
------
 Excited about the upcoming weekend getaway!        
------
 Trying out a new recipe for dinner tonight.        
------


In [None]:
%pip install nltk


Note: you may need to restart the kernel to use updated packages.


In [None]:
import nltk
import string

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

**NLTK** stands for **Natural Language Toolkit**.

### What it means

* **Natural Language** → Human languages like English, Hindi, Spanish, etc.
* **Toolkit** → A collection of tools and libraries.

So, **NLTK = A toolkit for working with human language data**.

---

### What NLTK is used for in NLP

NLTK is a **Python library** used for basic and advanced NLP tasks, such as:

1. **Tokenization**
   Splitting text into words or sentences.

   ```python
   from nltk.tokenize import word_tokenize
   word_tokenize("I love NLP")
   ```

2. **Stopword removal**
   Removing common words like “the”, “is”, “and”.

3. **Stemming**
   Reducing words to root form.

   * playing → play
   * running → run

4. **Lemmatization**
   Converting words to their meaningful base form.

   * better → good
   * running → run

5. **Part-of-speech tagging**
   Identifying nouns, verbs, adjectives, etc.

---

### Simple interview definition

> **NLTK (Natural Language Toolkit)** is a Python library used for processing and analyzing human language data. It provides tools for tokenization, stemming, lemmatization, stopword removal, and other NLP tasks.

In [None]:
# Fetch the NLTK resources used by the preprocessing step:
#   punkt / punkt_tab - tokenizer models loaded by word_tokenize (recent NLTK
#                       releases look for punkt_tab; older ones use punkt)
#   stopwords         - the English stop-word list
#   wordnet           - the dictionary behind WordNetLemmatizer
# nltk.download reports packages that are already present as up to date, so
# re-running this cell is harmless.
for resource in ("punkt", "punkt_tab", "stopwords", "wordnet"):
    nltk.download(resource)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nishi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nishi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Nishi\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# English stop-word set and WordNet lemmatizer shared by every call below.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Translation table that deletes ASCII punctuation; built once rather than on
# every call to preprocess_text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text):
    """Turn one raw post into a list of cleaned tokens.

    Steps, in order: lowercase the text, delete ASCII punctuation, tokenize
    with ``word_tokenize``, drop English stop words, and lemmatize each
    remaining token with ``WordNetLemmatizer``.

    Parameters
    ----------
    text : str
        Raw text of a single post. Any non-string value (for example the
        float NaN pandas uses for a missing cell) is treated as empty text.

    Returns
    -------
    list of str
        The cleaned tokens. Callers that need a single string join them with
        spaces.

    Notes
    -----
    Only the characters in ``string.punctuation`` are removed, so other
    symbols such as emoji pass through as tokens. ``lemmatize`` is called
    without a part-of-speech argument, so verb forms such as "enjoying" are
    returned unchanged.
    """
    # Guard against missing values so Series.apply does not crash on .lower().
    if not isinstance(text, str):
        return []

    cleaned = text.lower().translate(_PUNCTUATION_TABLE)
    return [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(cleaned)
        if token not in stop_words
    ]


In [None]:
# Show the first five posts next to their preprocessed token lists.
for idx in range(5):
    raw_text = df["Text"][idx]
    print("Original:", raw_text)
    print("Preprocessed:", preprocess_text(raw_text))
    print("------")

Original:  Enjoying a beautiful day at the park!              
Preprocessed: ['enjoying', 'beautiful', 'day', 'park']
------
Original:  Traffic was terrible this morning.                 
Preprocessed: ['traffic', 'terrible', 'morning']
------
Original:  Just finished an amazing workout! üí™               
Preprocessed: ['finished', 'amazing', 'workout', 'üí™']
------
Original:  Excited about the upcoming weekend getaway!        
Preprocessed: ['excited', 'upcoming', 'weekend', 'getaway']
------
Original:  Trying out a new recipe for dinner tonight.        
Preprocessed: ['trying', 'new', 'recipe', 'dinner', 'tonight']
------


This shows:

Lowercasing

Stopwords removed

Punctuation removed

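Words normalized (lemmatized) — note that verb forms such as "enjoying" and "trying" are unchanged in the output above, because `WordNetLemmatizer.lemmatize` treats every token as a noun unless a part-of-speech tag is passed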

“What is preprocessing in NLP?”

You can say:

“It’s the process of converting raw text into a clean and normalized form using techniques like tokenization, stopword removal, and lemmatization, so that it can be used by machine learning models.”

Machine learning models cannot understand text directly.
They only understand numbers.

So now we convert text like:

"beautiful day park"


into a numerical vector using TF-IDF.

This is my first real feature engineering step in NLP!!

In [None]:
# Preprocess the first post once, then show the tokens and the container type.
sample_tokens = preprocess_text(df["Text"][0])
print(sample_tokens)
print(type(sample_tokens))


['enjoying', 'beautiful', 'day', 'park']
<class 'list'>


In [None]:
def _join_tokens(text):
    """Preprocess one post and return its tokens as a space-separated string."""
    return " ".join(preprocess_text(text))


# Store the cleaned, space-joined text next to the raw text.
df["clean_text"] = df["Text"].apply(_join_tokens)


In [None]:
# Preview the frame again, now including the clean_text column.
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,Text,Sentiment,Timestamp,User,Platform,Hashtags,Retweets,Likes,Country,Year,Month,Day,Hour,clean_text
0,0,0,Enjoying a beautiful day at the park! ...,Positive,2023-01-15 12:30:00,User123,Twitter,#Nature #Park,15.0,30.0,USA,2023,1,15,12,enjoying beautiful day park
1,1,1,Traffic was terrible this morning. ...,Negative,2023-01-15 08:45:00,CommuterX,Twitter,#Traffic #Morning,5.0,10.0,Canada,2023,1,15,8,traffic terrible morning
2,2,2,Just finished an amazing workout! üí™ ...,Positive,2023-01-15 15:45:00,FitnessFan,Instagram,#Fitness #Workout,20.0,40.0,USA,2023,1,15,15,finished amazing workout üí™
3,3,3,Excited about the upcoming weekend getaway! ...,Positive,2023-01-15 18:20:00,AdventureX,Facebook,#Travel #Adventure,8.0,15.0,UK,2023,1,15,18,excited upcoming weekend getaway
4,4,4,Trying out a new recipe for dinner tonight. ...,Neutral,2023-01-15 19:55:00,ChefCook,Instagram,#Cooking #Food,12.0,25.0,Australia,2023,1,15,19,trying new recipe dinner tonight


In [None]:
# Show the raw and cleaned text side by side for the first rows.
df[["Text", "clean_text"]].head()


Unnamed: 0,Text,clean_text
0,Enjoying a beautiful day at the park! ...,enjoying beautiful day park
1,Traffic was terrible this morning. ...,traffic terrible morning
2,Just finished an amazing workout! üí™ ...,finished amazing workout üí™
3,Excited about the upcoming weekend getaway! ...,excited upcoming weekend getaway
4,Trying out a new recipe for dinner tonight. ...,trying new recipe dinner tonight


In [None]:
%pip install scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer

Note: you may need to restart the kernel to use updated packages.


In [None]:
# Inspect the first ten cleaned posts.
df["clean_text"].head(10)


0              enjoying beautiful day park
1                 traffic terrible morning
2               finished amazing workout üí™
3         excited upcoming weekend getaway
4         trying new recipe dinner tonight
5       feeling grateful little thing life
6    rainy day call cozy blanket hot cocoa
7              new movie release mustwatch
8    political discussion heating timeline
9            missing summer vibe beach day
Name: clean_text, dtype: object

In [None]:
# Number of posts whose cleaned text is exactly the empty string.
df["clean_text"].eq("").sum()


np.int64(0)

In [None]:
# Keep only rows whose cleaned text still has content after trimming whitespace.
has_content = df["clean_text"].str.strip().ne("")
df = df.loc[has_content]


In [None]:
# Report (rows, columns) of the frame after the empty-text filter.
print(df.shape)


(732, 16)


In [None]:
# Learn the TF-IDF vocabulary and weights from the cleaned text and build the
# document-term matrix.
# NOTE(review): the vectorizer is fitted on every row, including the rows that
# the later train_test_split call holds out as the test set — confirm this is
# intended.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(df["clean_text"])

In [None]:
# Target labels. The raw Sentiment strings carry inconsistent leading/trailing
# whitespace, so the same sentiment shows up as several distinct classes;
# strip the padding so each sentiment is a single class.
y = df["Sentiment"].str.strip()


In [None]:
# Report (documents, vocabulary terms) of the TF-IDF matrix.
print(X.shape)

(732, 2293)


Why this fix is correct (interview angle)

In real NLP pipelines:

Some texts become empty after cleaning

These rows must be removed

Otherwise models fail

You can say in interviews:

“After preprocessing, I removed rows that became empty to avoid issues during vectorization.”

TF-IDF converts text into numbers using:

TF (Term Frequency): how often a word appears

IDF (Inverse Document Frequency): how rare the word is

So:

Common words → lower weight

Rare, meaningful words → higher weight

Interview explanation (very important)

If asked:

“What is TF-IDF?”

You can say:

“TF-IDF is a feature extraction technique that converts text into numerical vectors by measuring how important a word is in a document relative to the entire dataset.”

In [None]:
from sklearn.model_selection import train_test_split

# Split the cleaned text rather than an already-vectorized matrix, so that the
# TF-IDF vocabulary and IDF weights are learned from the training rows only.
# Fitting the vectorizer on the full corpus before splitting leaks test-set
# statistics into the features.
text_train, text_test, y_train, y_test = train_test_split(
    df["clean_text"], y, test_size=0.2, random_state=42
)

# Fit on the training texts, then reuse the fitted vectorizer for the test texts.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

print("Train size:", X_train.shape)
print("Test size:", X_test.shape)


Train size: (585, 2293)
Test size: (147, 2293)


In [None]:
from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier on the training features, with the
# iteration limit raised to 1000. fit() returns the estimator, so creation and
# training are chained; the bare name on the last line keeps the cell's
# estimator display.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)
model


0,1,2
,"penalty  penalty: {'l1', 'l2', 'elasticnet', None}, default='l2' Specify the norm of the penalty: - `None`: no penalty is added; - `'l2'`: add a L2 penalty term and it is the default choice; - `'l1'`: add a L1 penalty term; - `'elasticnet'`: both L1 and L2 penalty terms are added. .. warning::  Some penalties may not work with some solvers. See the parameter  `solver` below, to know the compatibility between the penalty and  solver. .. versionadded:: 0.19  l1 penalty with SAGA solver (allowing 'multinomial' + L1) .. deprecated:: 1.8  `penalty` was deprecated in version 1.8 and will be removed in 1.10.  Use `l1_ratio` instead. `l1_ratio=0` for `penalty='l2'`, `l1_ratio=1` for  `penalty='l1'` and `l1_ratio` set to any float between 0 and 1 for  `'penalty='elasticnet'`.",'deprecated'
,"C  C: float, default=1.0 Inverse of regularization strength; must be a positive float. Like in support vector machines, smaller values specify stronger regularization. `C=np.inf` results in unpenalized logistic regression. For a visual example on the effect of tuning the `C` parameter with an L1 penalty, see: :ref:`sphx_glr_auto_examples_linear_model_plot_logistic_path.py`.",1.0
,"l1_ratio  l1_ratio: float, default=0.0 The Elastic-Net mixing parameter, with `0 <= l1_ratio <= 1`. Setting `l1_ratio=1` gives a pure L1-penalty, setting `l1_ratio=0` a pure L2-penalty. Any value between 0 and 1 gives an Elastic-Net penalty of the form `l1_ratio * L1 + (1 - l1_ratio) * L2`. .. warning::  Certain values of `l1_ratio`, i.e. some penalties, may not work with some  solvers. See the parameter `solver` below, to know the compatibility between  the penalty and solver. .. versionchanged:: 1.8  Default value changed from None to 0.0. .. deprecated:: 1.8  `None` is deprecated and will be removed in version 1.10. Always use  `l1_ratio` to specify the penalty type.",0.0
,"dual  dual: bool, default=False Dual (constrained) or primal (regularized, see also :ref:`this equation `) formulation. Dual formulation is only implemented for l2 penalty with liblinear solver. Prefer `dual=False` when n_samples > n_features.",False
,"tol  tol: float, default=1e-4 Tolerance for stopping criteria.",0.0001
,"fit_intercept  fit_intercept: bool, default=True Specifies if a constant (a.k.a. bias or intercept) should be added to the decision function.",True
,"intercept_scaling  intercept_scaling: float, default=1 Useful only when the solver `liblinear` is used and `self.fit_intercept` is set to `True`. In this case, `x` becomes `[x, self.intercept_scaling]`, i.e. a ""synthetic"" feature with constant value equal to `intercept_scaling` is appended to the instance vector. The intercept becomes ``intercept_scaling * synthetic_feature_weight``. .. note::  The synthetic feature weight is subject to L1 or L2  regularization as all other features.  To lessen the effect of regularization on synthetic feature weight  (and therefore on the intercept) `intercept_scaling` has to be increased.",1
,"class_weight  class_weight: dict or 'balanced', default=None Weights associated with classes in the form ``{class_label: weight}``. If not given, all classes are supposed to have weight one. The ""balanced"" mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as ``n_samples / (n_classes * np.bincount(y))``. Note that these weights will be multiplied with sample_weight (passed through the fit method) if sample_weight is specified. .. versionadded:: 0.17  *class_weight='balanced'*",
,"random_state  random_state: int, RandomState instance, default=None Used when ``solver`` == 'sag', 'saga' or 'liblinear' to shuffle the data. See :term:`Glossary ` for details.",
,"solver  solver: {'lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'}, default='lbfgs' Algorithm to use in the optimization problem. Default is 'lbfgs'. To choose a solver, you might want to consider the following aspects: - 'lbfgs' is a good default solver because it works reasonably well for a wide  class of problems. - For :term:`multiclass` problems (`n_classes >= 3`), all solvers except  'liblinear' minimize the full multinomial loss, 'liblinear' will raise an  error. - 'newton-cholesky' is a good choice for  `n_samples` >> `n_features * n_classes`, especially with one-hot encoded  categorical features with rare categories. Be aware that the memory usage  of this solver has a quadratic dependency on `n_features * n_classes`  because it explicitly computes the full Hessian matrix. - For small datasets, 'liblinear' is a good choice, whereas 'sag'  and 'saga' are faster for large ones; - 'liblinear' can only handle binary classification by default. To apply a  one-versus-rest scheme for the multiclass setting one can wrap it with the  :class:`~sklearn.multiclass.OneVsRestClassifier`. .. warning::  The choice of the algorithm depends on the penalty chosen (`l1_ratio=0`  for L2-penalty, `l1_ratio=1` for L1-penalty and `0 < l1_ratio < 1` for  Elastic-Net) and on (multinomial) multiclass support:  ================= ======================== ======================  solver l1_ratio multinomial multiclass  ================= ======================== ======================  'lbfgs' l1_ratio=0 yes  'liblinear' l1_ratio=1 or l1_ratio=0 no  'newton-cg' l1_ratio=0 yes  'newton-cholesky' l1_ratio=0 yes  'sag' l1_ratio=0 yes  'saga' 0<=l1_ratio<=1 yes  ================= ======================== ====================== .. note::  'sag' and 'saga' fast convergence is only guaranteed on features  with approximately the same scale. You can preprocess the data with  a scaler from :mod:`sklearn.preprocessing`. .. 
seealso::  Refer to the :ref:`User Guide ` for more  information regarding :class:`LogisticRegression` and more specifically the  :ref:`Table `  summarizing solver/penalty supports. .. versionadded:: 0.17  Stochastic Average Gradient (SAG) descent solver. Multinomial support in  version 0.18. .. versionadded:: 0.19  SAGA solver. .. versionchanged:: 0.22  The default solver changed from 'liblinear' to 'lbfgs' in 0.22. .. versionadded:: 1.2  newton-cholesky solver. Multinomial support in version 1.6.",'lbfgs'


In [None]:
# Predict a sentiment label for every row of the test features.
y_pred = model.predict(X_test)


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Fraction of test rows whose predicted label equals the true label.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Per-class precision / recall / F1. Many classes receive no predictions, which
# makes their precision undefined; zero_division=0 reports those scores as 0
# without emitting a warning for each affected metric.
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred, zero_division=0))


Accuracy: 0.11564625850340136

Classification Report:

                        precision    recall  f1-score   support

         Acceptance          0.00      0.00      0.00         2
           Admiration        0.00      0.00      0.00         1
        Admiration           0.00      0.00      0.00         1
         Affection           0.00      0.00      0.00         1
      Ambivalence            0.00      0.00      0.00         1
         Anger               0.00      0.00      0.00         1
        Anticipation         0.00      0.00      0.00         1
        Arousal              0.00      0.00      0.00         3
                  Awe        0.00      0.00      0.00         1
         Awe                 0.00      0.00      0.00         1
                  Bad        0.00      0.00      0.00         1
             Betrayal        0.00      0.00      0.00         2
        Betrayal             0.00      0.00      0.00         1
         Bitter              0.00      0.00     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


“Explain your NLP project.”

You can say:

“I built a sentiment classification system using a classical NLP pipeline. I performed preprocessing with tokenization, stopword removal, and lemmatization, then converted text into TF-IDF vectors and trained a Logistic Regression model to classify sentiments.”