#Twitter Sentiment Analysis

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.1-py3-none-any.whl (590 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/590.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m583.7/590.6 kB[0m [31m20.4 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m590.6/590.6 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.1


In [2]:
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report

import re

## Load Sentiment140 data

In [3]:
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/twitter_dataset.csv', encoding='latin-1', header=None)
df = df[[0, 5]]
df.columns = ['polarity', 'text']
print(df.head())

   polarity                                               text
0         0  @switchfoot http://twitpic.com/2y1zl - Awww, t...
1         0  is upset that he can't update his Facebook by ...
2         0  @Kenichan I dived many times for the ball. Man...
3         0    my whole body feels itchy and like its on fire 
4         0  @nationwideclass no, it's not behaving at all....


## Data Preprocessing

In [4]:
# Keep only positive (4) and negative (0)
df = df[df.polarity != 2]
df['polarity'] = df['polarity'].map({0: 0, 4: 1})
print(df['polarity'].value_counts())

polarity
0    800000
1    800000
Name: count, dtype: int64


In [5]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [6]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [10]:
import re
import string
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from joblib import Parallel, delayed
import multiprocessing
from emoji import demojize

# --- Setup ---
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Precompile regex (single pattern)
pattern = re.compile(
    r'(@\w+)'                # mentions
    r'|(#\w+)'               # hashtags
    r'|(http\S+|www\S+)'     # URLs
    r'|(\b(?:not|n\'t|never|no)\s+\w+)'  # negations
)

# Translation table for fast punctuation removal
punct_table = str.maketrans('', '', string.punctuation)

def replace_tokens(match):
    if match.group(1):  # mention
        return ' <USER> '
    elif match.group(2):  # hashtag
        return ' <HASHTAG> '
    elif match.group(3):  # URL
        return ' <URL> '
    elif match.group(4):  # negation
        return match.group(4).replace(' ', '_')
    return ''

def clean_text(text):
    text = text.lower()
    text = demojize(text)  # optional: 🙂 -> :slightly_smiling_face:
    text = pattern.sub(replace_tokens, text)
    text = text.translate(punct_table)  # remove punctuation efficiently

    tokens = word_tokenize(text)
    tokens = [
        stemmer.stem(lemmatizer.lemmatize(word))
        for word in tokens
        if word not in stop_words
    ]
    return ' '.join(tokens)

# Apply in parallel
num_cores = multiprocessing.cpu_count()
df['clean_text'] = Parallel(n_jobs=num_cores)(
    delayed(clean_text)(text) for text in df['text']
)

print(df[['text', 'clean_text']].head())

                                                text  \
0  @switchfoot http://twitpic.com/2y1zl - Awww, t...   
1  is upset that he can't update his Facebook by ...   
2  @Kenichan I dived many times for the ball. Man...   
3    my whole body feels itchy and like its on fire    
4  @nationwideclass no, it's not behaving at all....   

                                          clean_text  
0  user url url that bummer shoulda got david car...  
1  upset cant updat facebook text might cri resul...  
2  user dive mani time ball manag save 50 rest go...  
3                    whole bodi feel itchi like fire  
4                      user notbehav im mad cant see  


In [11]:
df.to_csv("/content/drive/MyDrive/Colab Notebooks/twitter_cleaned_dataset.csv", index = False)

In [12]:
df = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/twitter_cleaned_dataset.csv", encoding='latin-1')
df.head()

Unnamed: 0,polarity,text,clean_text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",user url url that bummer shoulda got david car...
1,0,is upset that he can't update his Facebook by ...,upset cant updat facebook text might cri resul...
2,0,@Kenichan I dived many times for the ball. Man...,user dive mani time ball manag save 50 rest go...
3,0,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,"@nationwideclass no, it's not behaving at all....",user notbehav im mad cant see


In [13]:
df = df.dropna(subset=['clean_text'])
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 1599701 entries, 0 to 1599999
Data columns (total 3 columns):
 #   Column      Non-Null Count    Dtype 
---  ------      --------------    ----- 
 0   polarity    1599701 non-null  int64 
 1   text        1599701 non-null  object
 2   clean_text  1599701 non-null  object
dtypes: int64(1), object(2)
memory usage: 48.8+ MB
None


In [14]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(df['clean_text'], df['polarity'], test_size=0.2, random_state=42)

print("Train size:", len(X_train))
print("Test size:", len(X_test))

Train size: 1279760
Test size: 319941


In [15]:
# Vectorize text using TF-IDF
vectorizer = TfidfVectorizer(max_features=50000, ngram_range=(1,2), sublinear_tf=True)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)
print("TF-IDF shape (train):", X_train_tfidf.shape)
print("TF-IDF shape (test):", X_test_tfidf.shape)

TF-IDF shape (train): (1279760, 50000)
TF-IDF shape (test): (319941, 50000)


## Model Training and Evaluation

In [16]:
# Bernoulli Naive Bayes
bnb = BernoulliNB()
bnb.fit(X_train_tfidf, y_train)
bnb_pred = bnb.predict(X_test_tfidf)
print("Bernoulli Naive Bayes Accuracy:", accuracy_score(y_test, bnb_pred))

Bernoulli Naive Bayes Accuracy: 0.7835132102481395


In [17]:
# Support Vector Machine (LinearSVC)
svm = LinearSVC(max_iter=5000, class_weight='balanced') # Increased max_iter and added class_weight
svm.fit(X_train_tfidf, y_train)
svm_pred = svm.predict(X_test_tfidf)
print("SVM Accuracy:", accuracy_score(y_test, svm_pred))

SVM Accuracy: 0.7998130905385681


In [18]:
# Logistic Regression
logreg = LogisticRegression(max_iter=5000, penalty = 'l2', C = 2, class_weight='balanced') # Increased max_iter and added class_weight
logreg.fit(X_train_tfidf, y_train)
logreg_pred = logreg.predict(X_test_tfidf)
print("Logistic Regression Accuracy:", accuracy_score(y_test, logreg_pred))

Logistic Regression Accuracy: 0.8022729190694534


In [19]:
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

# Random Forest Classifier
# Keeping similar parameters as before but adding class_weight
rf = RandomForestClassifier(n_estimators=100, random_state=42, max_depth = 15 , n_jobs=-1, class_weight='balanced')
rf.fit(X_train_tfidf, y_train)
rf_pred = rf.predict(X_test_tfidf)
print("Random Forest Accuracy:", accuracy_score(y_test, rf_pred))

Random Forest Accuracy: 0.7299439584173332


In [22]:
# XGBoost Classifier
# Using the scikit-learn API for ease of use with existing data
xg = xgb.XGBClassifier(objective='binary:logistic', n_estimators=100, learning_rate=1, eval_metric='logloss', random_state=42, n_jobs=-1)

xg.fit(X_train_tfidf, y_train)
xg_pred = xg.predict(X_test_tfidf)
print("XGBoost Accuracy:", accuracy_score(y_test, xg_pred))

XGBoost Accuracy: 0.765778690446051


In [None]:
# Optional: Detailed classification report for each model
print("\nBernoulliNB Classification Report:\n", classification_report(y_test, bnb_pred))
print("\nSVM Classification Report:\n", classification_report(y_test, svm_pred))
print("\nLogistic Regression Classification Report:\n", classification_report(y_test, logreg_pred))
print("\nXGBoost Classification Report:\n", classification_report(y_test, xg_pred))
print("\nRandom Forest Classification Report:\n", classification_report(y_test, rf_pred))

## Making Predictions

In [None]:
# Test on custom tweets
sample_tweets = ["I love this!", "I hate that!", "It was okay, not great."]
sample_vec = vectorizer.transform(sample_tweets)

print("\nSample Predictions:")
print("BernoulliNB:", bnb.predict(sample_vec))
print("SVM:", svm.predict(sample_vec))
print("Logistic Regression:", logreg.predict(sample_vec))
print("Sample Predictions (Random Forest):", rf.predict(sample_vec))
print("Sample Predictions (XGBoost):", xg.predict(sample_vec))