In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
import zipfile

In [4]:
import pandas as pd
import os

# Verify if file exists and check size
file_path = 'training.1600000.processed.noemoticon.csv'
if os.path.exists(file_path):
    file_size = os.path.getsize(file_path)
    print(f"File exists. Size: {file_size/1024/1024:.2f} MB")
    # Expected size should be around 80MB for the full dataset
else:
    print("File not found. Downloading...")
    !wget http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
    !unzip trainingandtestdata.zip

File not found. Downloading...
--2025-07-11 16:37:48--  http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
Resolving cs.stanford.edu (cs.stanford.edu)... 171.64.64.64
Connecting to cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip [following]
--2025-07-11 16:37:48--  https://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
Connecting to cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 81363704 (78M) [application/zip]
Saving to: ‘trainingandtestdata.zip’


2025-07-11 16:37:54 (15.7 MB/s) - ‘trainingandtestdata.zip’ saved [81363704/81363704]

Archive:  trainingandtestdata.zip
  inflating: testdata.manual.2009.06.14.csv  
  inflating: training.1600000.processed.noemoticon.csv  


In [5]:
import pandas as pd
import os

# Verify if file exists and check size
file_path = 'training.1600000.processed.noemoticon.csv'
if os.path.exists(file_path):
    file_size = os.path.getsize(file_path)
    print(f"File exists. Size: {file_size/1024/1024:.2f} MB")
    # Expected size should be around 80MB for the full dataset
else:
    print("File not found. Downloading...")
    !wget http://cs.stanford.edu/people/alecmgo/trainingandtestdata.zip
    !unzip trainingandtestdata.zip

File exists. Size: 227.74 MB


In [6]:
import csv

data = []
with open('training.1600000.processed.noemoticon.csv', 'r', encoding='latin-1') as f:
    reader = csv.reader(f)
    for i, row in enumerate(reader):
        try:
            if len(row) == 6:  # Verify correct number of columns
                data.append(row)
            if i % 100000 == 0:  # Progress tracking
                print(f"Processed {i} rows...")
        except csv.Error:
            continue  # Skip malformed rows

# Convert to DataFrame
columns = ['target', 'ids', 'date', 'flag', 'user', 'text']
df = pd.DataFrame(data, columns=columns)
print(f"\nSuccessfully loaded {len(df)} tweets")

Processed 0 rows...
Processed 100000 rows...
Processed 200000 rows...
Processed 300000 rows...
Processed 400000 rows...
Processed 500000 rows...
Processed 600000 rows...
Processed 700000 rows...
Processed 800000 rows...
Processed 900000 rows...
Processed 1000000 rows...
Processed 1100000 rows...
Processed 1200000 rows...
Processed 1300000 rows...
Processed 1400000 rows...
Processed 1500000 rows...

Successfully loaded 1600000 tweets


In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import re


In [8]:
def safe_convert(x):
    try:
        return int(float(x))  # Handle both string numbers and floats
    except (ValueError, TypeError):
        return None  # Or use 1 for neutral as default

df['label'] = df['target'].apply(safe_convert)

# Remove any remaining nulls
df = df.dropna(subset=['label'])
df['label'] = df['label'].astype(int)

print("Final label distribution:")
print(df['label'].value_counts())

Final label distribution:
label
0    800000
4    800000
Name: count, dtype: int64


In [9]:
def preprocess(text):
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|@\w+|#\w+', '', text)  # Remove URLs/mentions
    text = re.sub(r'[^a-z\s]', ' ', text)  # Keep only letters and spaces
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

df['clean_text'] = df['text'].apply(preprocess)


In [10]:
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label']
)

In [11]:
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
    ngram_range=(1,1)  # Fixed: Added missing parenthesis
)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [12]:
model = MultinomialNB(alpha=0.5)
model.fit(X_train_tfidf, y_train)

In [13]:
y_pred = model.predict(X_test_tfidf)
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['negative', 'positive']))


Classification Report:
              precision    recall  f1-score   support

    negative       0.75      0.76      0.75    160000
    positive       0.75      0.75      0.75    160000

    accuracy                           0.75    320000
   macro avg       0.75      0.75      0.75    320000
weighted avg       0.75      0.75      0.75    320000



In [14]:
def predict_sentiment(text):
    try:
        cleaned = preprocess(text)
        if not cleaned.strip():  # Handle empty text after preprocessing
            return {"text": text, "error": "Text too short after preprocessing"}

        vectorized = tfidf.transform([cleaned])

        # Get prediction with safety checks
        if hasattr(model, 'predict'):
            prediction = model.predict(vectorized)[0]
            proba = model.predict_proba(vectorized)[0] if hasattr(model, 'predict_proba') else [0, 0]
        else:
            return {"text": text, "error": "Model not properly trained"}

        # Handle unexpected predictions
        if prediction not in [0, 1]:
            return {
                "text": text,
                "warning": f"Unexpected prediction value: {prediction}",
                "probabilities": {
                    "negative": f"{proba[0]*100:.1f}%",
                    "positive": f"{proba[1]*100:.1f}%"
                }
            }

        return {
            "text": text,
            "prediction": ['negative', 'positive'][prediction],
            "confidence": {
                "negative": f"{proba[0]*100:.1f}%",
                "positive": f"{proba[1]*100:.1f}%"
            }
        }

    except Exception as e:
        return {"text": text, "error": str(e)}

# Test cases
test_texts = [
    "I love this product! It's amazing.",  # Should be positive
    "This is terrible and awful",          # Should be negative
    "",                                    # Empty string
    "123 456",                             # Numbers only
    "Very good"                            # Simple positive
]

print("=== Sentiment Analysis Tests ===")
for text in test_texts:
    result = predict_sentiment(text)
    print(f"\nText: '{text}'")
    print(result)

=== Sentiment Analysis Tests ===

Text: 'I love this product! It's amazing.'

Text: 'This is terrible and awful'
{'text': 'This is terrible and awful', 'prediction': 'negative', 'confidence': {'negative': '94.3%', 'positive': '5.7%'}}

Text: ''
{'text': '', 'error': 'Text too short after preprocessing'}

Text: '123 456'
{'text': '123 456', 'error': 'Text too short after preprocessing'}

Text: 'Very good'
