<a href="https://colab.research.google.com/github/neenz16/MachineLearningProjects/blob/main/neenu_sentiment_analysis_scikit_selfstudyprroject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd #data manipulation library
from sklearn.model_selection import train_test_split # Imports a utility to split your dataset into training and testing sets
from sklearn.feature_extraction.text import TfidfVectorizer #Imports TF-IDF Vectorizer, which turns text into numerical values based on how important a word is.
from sklearn.linear_model import LogisticRegression # LR moodel simple and effective classifier for binary tasks like positive/negative sentiment.
from sklearn.metrics import accuracy_score # Imports function to calculate accuracy of the model.how well the model performed on the test set.

# Toy corpus: every sentence sits next to its label (1 = Positive, 0 = Negative)
# so the text and label columns cannot drift out of alignment.
samples = [
    ("I love this movie!", 1),
    ("This was a great experience", 1),
    ("Absolutely fantastic!", 1),
    ("I hated this", 0),
    ("It was so boring", 0),
    ("Not my type", 0),
    ("Amazing and inspiring", 1),
    ("Worst film ever", 0),
    ("Very enjoyable", 1),
    ("Terrible acting", 0),
    ("Lovely movie!", 1),
    ("Horrible", 0),
    ("Love it!", 1),
    ("Average", 0),
    ("Hate it", 0),
]
data = {
    'text': [sentence for sentence, _ in samples],
    'label': [sentiment_id for _, sentiment_id in samples],
}

# Tabular view of the corpus: one row per sentence.
df = pd.DataFrame(data)

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# TF-IDF features: the vocabulary is learned from the training rows only and
# then reused unchanged to encode the test rows.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression().fit(X_train_tfidf, y_train)

# Score the held-out rows.
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)

print("Test Accuracy:", accuracy)

# Classify two unseen sentences with the same fitted vectorizer and model.
new_text = ["That movie was awesome!", "I didn’t like the plot."]
new_text_vec = vectorizer.transform(new_text)
predictions = model.predict(new_text_vec)

for text, pred in zip(new_text, predictions):
    if pred == 1:
        sentiment = "Positive"
    else:
        sentiment = "Negative"
    print(f"'{text}' → {sentiment}")


Test Accuracy: 0.3333333333333333
'That movie was awesome!' → Positive
'I didn’t like the plot.' → Positive



**With a real dataset for sentiment analysis from NLTK**


---



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from nltk.corpus import movie_reviews
import nltk


In [None]:
# Fetch the NLTK movie_reviews corpus; when it is already present the
# downloader only reports that the package is up to date.
nltk.download('movie_reviews')
# Last expression of the cell, so the notebook displays the category names.
movie_reviews.categories()


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


['neg', 'pos']

In [None]:
# Load real movie review data.
# First list every (file id, category) pair, then derive the two parallel
# lists from it: the raw review text and a 0/1 label (1 only for 'pos').
review_ids = [
    (fileid, category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]
texts = [movie_reviews.raw(fileid) for fileid, _ in review_ids]
labels = [1 if category == 'pos' else 0 for _, category in review_ids]

In [None]:
# Build a two-column DataFrame from the parallel lists: 'text' holds each
# review's raw text and 'label' its 0/1 class, row for row.
df = pd.DataFrame({'text': texts, 'label': labels})

In [None]:
# Display the frame; the notebook rendering is truncated to the first and last rows.
df

Unnamed: 0,text,label
0,"plot : two teen couples go to a church party ,...",0
1,the happy bastard's quick movie review \ndamn ...,0
2,it is movies like these that make a jaded movi...,0
3,""" quest for camelot "" is warner bros . ' firs...",0
4,synopsis : a mentally unstable man undergoing ...,0
...,...,...
1995,wow ! what a movie . \nit's everything a movie...,1
1996,"richard gere can be a commanding actor , but h...",1
1997,"glory--starring matthew broderick , denzel was...",1
1998,steven spielberg's second epic film on world w...,1


In [None]:
df.shape  # (rows, columns)
# df.shape[0]  # number of rows only

(2000, 2)

In [None]:
# Hold out 20% of the rows as a test set (test_size=0.2); random_state=42
# fixes the shuffle so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

In [None]:
# # Vectorize text using TF-IDF
# vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
# X_train_tfidf = vectorizer.fit_transform(X_train)
# X_test_tfidf = vectorizer.transform(X_test)

In [None]:
# Vectorize text using TF-IDF with n-grams (1-gram and 2-gram).
# English stop words are dropped and the vocabulary is capped at 5000 features.
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1, 2))
# Learn the vocabulary/IDF weights from the training text only ...
X_train_tfidf = vectorizer.fit_transform(X_train)
# ... and encode the test text with that same fitted vocabulary.
X_test_tfidf = vectorizer.transform(X_test)

In [None]:
# Train Logistic Regression model; max_iter is raised to 1000 to give the
# solver more iterations to converge on the TF-IDF features.
model = LogisticRegression(max_iter=1000)
# fit() is the last expression of the cell, so the fitted estimator is displayed.
model.fit(X_train_tfidf, y_train)

In [None]:
# Predict labels for the held-out reviews and report the fraction that
# match the true labels.
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.8225


In [None]:
# Sanity-check the trained model on two sentences it has never seen.
new_text = ["That movie was awesome!", "I didn’t like the plot."]

# Encode with the already-fitted vectorizer, then classify.
new_text_vec = vectorizer.transform(new_text)
predictions = model.predict(new_text_vec)

for text, pred in zip(new_text, predictions):
    if pred == 1:
        sentiment = "Positive"
    else:
        sentiment = "Negative"
    print(f"'{text}' → {sentiment}")

'That movie was awesome!' → Positive
'I didn’t like the plot.' → Negative





---

Added more text preprocessing:

Lowercasing
Punctuation removal
Lemmatization (using WordNetLemmatizer from NLTK)


---



In [None]:
pip install nltk




In [None]:
# import nltk
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')


In [None]:
import pandas as pd
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from nltk.corpus import movie_reviews
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import string


# Download required NLTK data (corpus plus the resources the tokenizer and
# lemmatizer below are used with), in the same order as before.
for resource in ('movie_reviews', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(resource)


# Load real movie review data: list every (file id, category) pair, then
# derive the raw texts and the 0/1 labels (1 only for 'pos') from it.
review_ids = [
    (fileid, category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]
texts = [movie_reviews.raw(fileid) for fileid, _ in review_ids]
labels = [1 if category == 'pos' else 0 for _, category in review_ids]

# One row per review.
df = pd.DataFrame({'text': texts, 'label': labels})

# Shared lemmatizer instance used by preprocess_text.
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """Normalise one document before vectorisation.

    Lowercases ``text``, deletes every character listed in
    ``string.punctuation``, tokenizes the result with ``word_tokenize`` and
    lemmatizes each token.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(strip_punctuation)
    lemmas = (lemmatizer.lemmatize(token) for token in word_tokenize(cleaned))
    return ' '.join(lemmas)

# Replace the raw review text with its preprocessed form.
df['text'] = df['text'].apply(preprocess_text)

# 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

# TF-IDF over unigrams and bigrams, English stop words removed, vocabulary
# capped at 5000 features; fitted on the training text only.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=5000,
    ngram_range=(1, 2),
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# fit() returns the estimator itself, so construction and training are chained.
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Accuracy on the held-out reviews.
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)
print('------------------------------------------------')
print("Test Accuracy with Preprocessing + N-grams:", accuracy)

# New sentences must go through the same preprocessing as the training text
# before they are vectorized.
new_text = ["That movie was awesome!", "I didn’t like the plot."]
new_text_processed = list(map(preprocess_text, new_text))
new_text_vec = vectorizer.transform(new_text_processed)
predictions = model.predict(new_text_vec)

print('------------------------------------------------')
for text, pred in zip(new_text, predictions):
    if pred == 1:
        sentiment = "Positive"
    else:
        sentiment = "Negative"
    print(f"'{text}' → {sentiment}")


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


------------------------------------------------
Test Accuracy with Preprocessing + N-grams: 0.815
------------------------------------------------
'That movie was awesome!' → Positive
'I didn’t like the plot.' → Negative


* Logistic Regression: Good baseline for binary classification.

* Random Forest: Ensemble of decision trees → reduces overfitting, better accuracy.
* XGBoost: Boosted trees built sequentially to correct mistakes of previous ones, super popular in competitions.
* LightGBM: Similar to XGBoost but faster, uses histogram-based learning, better with large data.


---
Integrate Random Forest and XGBoost to baseline sentiment analysis code:

with preprocessing (lowercase, punctuation removal, lemmatization), n-grams (1,2), stopword removal and RandomForest & XGBoost



---



In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from nltk.corpus import movie_reviews, stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
import nltk

# Download every NLTK resource this cell uses so it also runs on its own.
# 'punkt_tab' and 'omw-1.4' are included to match the resource list of the
# earlier preprocessing cell: recent NLTK releases load the tokenizer models
# from 'punkt_tab', so word_tokenize() raises LookupError without it.
for resource in ('movie_reviews', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4', 'stopwords'):
    nltk.download(resource)

# Load data: list every (file id, category) pair, then derive the raw texts
# and the 0/1 labels (1 only for 'pos') from that single listing.
review_ids = [
    (fileid, category)
    for category in movie_reviews.categories()
    for fileid in movie_reviews.fileids(category)
]
texts = [movie_reviews.raw(fileid) for fileid, _ in review_ids]
labels = [1 if category == 'pos' else 0 for _, category in review_ids]

# One row per review.
df = pd.DataFrame({'text': texts, 'label': labels})

# Shared helpers for preprocess_text: one lemmatizer instance and the NLTK
# English stop-word list as a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise one document before vectorisation.

    Lowercases ``text``, deletes every character listed in
    ``string.punctuation``, tokenizes with ``word_tokenize``, drops tokens
    found in ``stop_words`` and lemmatizes the remaining ones.

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = text.lower().translate(strip_punctuation)
    # The stop-word check is applied to the token as tokenized, before lemmatization.
    kept = (token for token in word_tokenize(cleaned) if token not in stop_words)
    return ' '.join(lemmatizer.lemmatize(token) for token in kept)

# Apply preprocessing: overwrite the raw text column with its cleaned form.
df['text'] = df['text'].apply(preprocess_text)

# Split Data: 20% held out for testing, fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=42)

# TF-IDF Vectorizer with n-grams (unigrams + bigrams), capped at 5000 features.
# NOTE: stop_words='english' is applied here in addition to the stop-word
# filtering already done inside preprocess_text.
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_features=5000)
# Fit on the training text only, then encode the test text with the same vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Random Forest Model: 100 trees, seeded for reproducible results.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_tfidf, y_train)
rf_preds = rf_model.predict(X_test_tfidf)
rf_acc = accuracy_score(y_test, rf_preds)
print('------------------------------------------------')
print(f"Random Forest Accuracy: {rf_acc:.4f}")

# XGBoost Model
# `use_label_encoder` is no longer a recognised XGBClassifier parameter: the
# installed version only emits 'Parameters: { "use_label_encoder" } are not
# used.', so it is dropped. The labels are already integer 0/1.
xgb_model = XGBClassifier(eval_metric='logloss')
xgb_model.fit(X_train_tfidf, y_train)
xgb_preds = xgb_model.predict(X_test_tfidf)
xgb_acc = accuracy_score(y_test, xgb_preds)
print('------------------------------------------------')
print(f"XGBoost Accuracy: {xgb_acc:.4f}")

# Classify two unseen sentences with both models. They go through the same
# preprocessing and the same fitted vectorizer as the training text.
new_text = ["That movie was awesome!", "I didn’t like the plot."]
new_text_preprocessed = list(map(preprocess_text, new_text))
new_text_vec = vectorizer.transform(new_text_preprocessed)

# One prediction per sentence from each model.
rf_sentiments = rf_model.predict(new_text_vec)
xgb_sentiments = xgb_model.predict(new_text_vec)
print('------------------------------------------------')

for text, rf_pred, xgb_pred in zip(new_text, rf_sentiments, xgb_sentiments):
    if rf_pred == 1:
        rf_label = "Positive"
    else:
        rf_label = "Negative"
    if xgb_pred == 1:
        xgb_label = "Positive"
    else:
        xgb_label = "Negative"
    print(f"'{text}' → RandomForest: {rf_label}, XGBoost: {xgb_label}")


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


------------------------------------------------
Random Forest Accuracy: 0.8075


Parameters: { "use_label_encoder" } are not used.



------------------------------------------------
XGBoost Accuracy: 0.7950
------------------------------------------------
'That movie was awesome!' → RandomForest: Positive, XGBoost: Positive
'I didn’t like the plot.' → RandomForest: Negative, XGBoost: Negative


---
Implement a professional-grade, pretrained transformer model: BERT using
**HuggingFace Transformers**.

**BERT** (Bidirectional Encoder Representations from Transformers) is a deep learning transformer model pretrained on large text corpora (like Wikipedia) — it captures context better because each token's representation is computed from the words on both its left and its right at the same time, rather than from a single left-to-right pass.

For sentiment analysis, we can fine-tune or directly use a pretrained sentiment classifier model (like distilbert-base-uncased-finetuned-sst-2-english from HuggingFace).


---



In [None]:
pip install transformers torch


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
import nltk
from nltk.corpus import movie_reviews
from transformers import pipeline

# Download the dataset
nltk.download('movie_reviews')

# Load the pretrained sentiment-analysis pipeline (DistilBERT fine-tuned on SST-2).
# The checkpoint and revision are named explicitly: they are the ones the
# library previously fell back to by default, and pinning them removes the
# "No model was supplied" warning and keeps results stable if that default changes.
SENTIMENT_MODEL = "distilbert/distilbert-base-uncased-finetuned-sst-2-english"
SENTIMENT_MODEL_REVISION = "714eb0f"
sentiment_pipeline = pipeline(
    "sentiment-analysis",
    model=SENTIMENT_MODEL,
    revision=SENTIMENT_MODEL_REVISION,
)

MAX_CHARS = 512   # each review is cut to its first 512 characters
DEMO_SIZE = 10    # number of reviews scored in this quick demo; raise for a fuller run

# Prepare texts from movie_reviews. Only the reviews that will actually be
# scored are read, instead of loading the whole corpus and discarding most of it.
# NOTE: the first file ids all belong to the 'neg' category, so this demo
# sample contains negative reviews only.
texts = [
    movie_reviews.raw(fileid)[:MAX_CHARS]
    for fileid in movie_reviews.fileids()[:DEMO_SIZE]
]

# Get predictions: one {'label': ..., 'score': ...} dict per input text.
results = sentiment_pipeline(texts)

# Print results (predicted label, confidence, and the start of each review).
for text, result in zip(texts, results):
    label = result['label']
    score = result['score']
    print(f"Sentiment: {label} (confidence: {score:.2f})\n---\n{text[:300]}...\n")


[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


Sentiment: NEGATIVE (confidence: 0.99)
---
plot : two teen couples go to a church party , drink and then drive . 
they get into an accident . 
one of the guys dies , but his girlfriend continues to see him in her life , and has nightmares . 
what's the deal ? 
watch the movie and " sorta " find out . . . 
critique : a mind-fuck movie for the...

Sentiment: NEGATIVE (confidence: 1.00)
---
the happy bastard's quick movie review 
damn that y2k bug . 
it's got a head start in this movie starring jamie lee curtis and another baldwin brother ( william this time ) in a story regarding a crew of a tugboat that comes across a deserted russian tech ship that has a strangeness to it when they ...

Sentiment: POSITIVE (confidence: 0.92)
---
it is movies like these that make a jaded movie viewer thankful for the invention of the timex indiglo watch . 
based on the late 1960's television show by the same name , the mod squad tells the tale of three reformed criminals under the employ of the police t



---


Hyperparameter Tuning (GridSearchCV)
GridSearchCV tries every combination in a parameter grid, scoring each one with cross-validation, to find the settings that maximize the model's performance.

Here it searches for the optimal values of parameters such as learning rate, number of estimators, max depth, etc.

NOTE:
Comparison to other techniques:

RandomizedSearchCV: Tries random parameter combinations (faster, but less exhaustive)

Bayesian Optimization: Smarter, probabilistic parameter search (like Optuna or Hyperopt)


---



In [None]:
# Hyperparameter tuning of a LightGBM classifier using GridSearchCV from scikit-learn.
# `lgb` was referenced without ever being imported, so the import lives in this cell.
import lightgbm as lgb
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Define parameter grid: 3 x 3 x 3 = 27 candidate settings.
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15]
}

# Initialize GridSearchCV with LightGBM model (5-fold CV, all CPU cores).
grid_search = GridSearchCV(estimator=lgb.LGBMClassifier(), param_grid=param_grid, cv=5, n_jobs=-1)
# Fit on the TF-IDF matrix, not on X_train: X_train holds the text strings,
# which a tree model cannot consume directly.
grid_search.fit(X_train_tfidf, y_train)

# Print best parameters
print("Best Parameters:", grid_search.best_params_)

# Evaluate the best model on the held-out set, encoded with the same vectorizer.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_tfidf)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy with Best Params:", accuracy)


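Modified code with a real text dataset (the 20 Newsgroups dataset, loaded with sklearn's built-in `fetch_20newsgroups`; the two selected newsgroups are topic classes, not sentiment labels).
Added the following to check for differences or improvements in accuracy scores: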
Preprocess the text (lowercase, remove punctuation)
1) Add stop words removal
2) Try with ngram_range=(1,2) for unigrams + bigrams

Show accuracy scores separately:


*   Basic TF-IDF
*   TF-IDF + stopword removal
*   TF-IDF + stopwords + ngram



In [None]:
import pandas as pd
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.datasets import fetch_20newsgroups

# Notes:
# pandas — for working with tabular data (like CSV files or dataframes)
# string — provides string operations (used to remove punctuation)
# train_test_split — splits your data into training and testing subsets
# TfidfVectorizer — converts text into numerical TF-IDF vectors
# LogisticRegression — a simple yet effective model for classification tasks
# accuracy_score — to measure how well your model predicts
# fetch_20newsgroups — built-in text dataset loader from sklearn

In [None]:
# Load real dataset restricted to two newsgroups, giving a binary classification task.
# NOTE: the classes are newsgroup topics (baseball vs. medicine), not
# positive/negative sentiment.
categories = ['rec.sport.baseball', 'sci.med']
# subset='all' requests the whole collection; shuffling is seeded for repeatability.
dataset = fetch_20newsgroups(subset='all', categories=categories, shuffle=True, random_state=42)

In [None]:
# Summarise the dataset rather than printing the whole Bunch, which would
# dump the full text of every post into the cell output.
print("Classes:", dataset.target_names)
print("Documents:", len(dataset.data))
print("First document (first 300 characters):")
print(dataset.data[0][:300])

In [None]:
# Convert to DataFrame: 'text' is the raw post, 'label' the integer class
# index taken from dataset.target.
df = pd.DataFrame({'text': dataset.data, 'label': dataset.target})

In [None]:
# Preprocessing function (lowercase + remove punctuation)
def preprocess(text):
    """Return ``text`` lowercased with every ``string.punctuation`` character removed.

    Characters outside ``string.punctuation`` (letters, digits, whitespace,
    non-ASCII symbols) are kept as they are.
    """
    # Mapping a code point to None in a translation table deletes that character.
    delete_punctuation = dict.fromkeys(map(ord, string.punctuation))
    lowered = text.lower()
    return lowered.translate(delete_punctuation)



In [None]:
# Keep the raw 'text' column and store the cleaned version alongside it.
df['clean_text'] = df['text'].apply(preprocess)

In [None]:
# Split the cleaned text: 20% held out for testing, fixed seed so all the
# vectorizer variants are compared on the same split.
X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], df['label'], test_size=0.2, random_state=42)

In [None]:
# Variant 1 - basic TF-IDF: default vectorizer settings (no stop-word list,
# no explicit n-gram range).
vectorizer_basic = TfidfVectorizer()
X_train_basic = vectorizer_basic.fit_transform(X_train)  # learn vocabulary on train only
X_test_basic = vectorizer_basic.transform(X_test)        # reuse it for test

# fit() returns the estimator itself, so construction and training are chained.
model_basic = LogisticRegression(max_iter=500).fit(X_train_basic, y_train)

pred_basic = model_basic.predict(X_test_basic)
accuracy_basic = accuracy_score(y_test, pred_basic)
print(f"Accuracy with Basic TF-IDF: {accuracy_basic:.4f}")

In [None]:
# Variant 2 - TF-IDF with the built-in English stop-word list removed.
vectorizer_stop = TfidfVectorizer(stop_words='english')
X_train_stop = vectorizer_stop.fit_transform(X_train)  # learn vocabulary on train only
X_test_stop = vectorizer_stop.transform(X_test)        # reuse it for test

# fit() returns the estimator itself, so construction and training are chained.
model_stop = LogisticRegression(max_iter=500).fit(X_train_stop, y_train)

pred_stop = model_stop.predict(X_test_stop)
accuracy_stop = accuracy_score(y_test, pred_stop)
print(f"Accuracy with Stopwords Removed: {accuracy_stop:.4f}")


In [None]:
# Variant 3 - TF-IDF with English stop words removed and both unigrams and
# bigrams as features.
vectorizer_ngram = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1,2),
)
X_train_ngram = vectorizer_ngram.fit_transform(X_train)  # learn vocabulary on train only
X_test_ngram = vectorizer_ngram.transform(X_test)        # reuse it for test

# fit() returns the estimator itself, so construction and training are chained.
model_ngram = LogisticRegression(max_iter=500).fit(X_train_ngram, y_train)

pred_ngram = model_ngram.predict(X_test_ngram)
accuracy_ngram = accuracy_score(y_test, pred_ngram)
print(f"Accuracy with Stopwords + Ngrams: {accuracy_ngram:.4f}")