In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import re
import string

In [15]:
df = pd.read_csv(r'C:\Users\Pakistan\Downloads\all_tweets.csv', encoding='latin-1', header=None)
df.columns = ['sentiment', 'id', 'date', 'query', 'text']

print(f"Shape:{df.shape}")
print(f"\nAll Sentiment values:")
print(df['sentiment'].value_counts().sort_index())
print(df.head(10))

Shape:(6033, 5)

All Sentiment values:
sentiment
0.0     65
1.0     65
2.0     65
3.0     65
4.0     65
        ..
95.0    32
96.0    23
97.0    15
98.0    12
99.0     5
Name: count, Length: 100, dtype: int64
   sentiment         id                                               date  \
0        NaN  sentiment                                               text   
1        0.0   positive  RT @SchudioTv: Want to know more about #autism...   
2        1.0   negative  We blame ourselves and feel worse. Start with ...   
3        2.0   positive  RT @PsychiatristCNS: 130,000 patient years and...   
4        3.0    neutral  RT @SkypeTherapist: See a therapist online ove...   
5        4.0   positive  RT @PsychiatristCNS: 130,000 patient years and...   
6        5.0    neutral  The onset of the #pandemic &amp; #WFH has led ...   
7        6.0    neutral  #Climate change is concerning. \n\nThese can c...   
8        7.0   positive  130,000 patient years and the diagnostic stabi...   
9        8.

In [17]:
def preprocess_text(text):
    if pd.isna(text) or not isinstance(text, str):
        return ""

    text =text.lower()
    text = re.sub(r'https\S+|https\S+\www\.\S+', '', text)
    text = re.sub(r'^@\w+|#\w+', ' ', text)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = ''.join(text.split())
    return text

In [22]:
df['cleaned_text'] = df['text'].apply(preprocess_text)

df['word_count'] = df['cleaned_text'].str.split().str.len()
df = df[df['word_count'] >= 3]

df = df[df['cleaned_text'].str.strip() != ""]

print(f"Shape: {df.shape}")
print(f"Sentiment distribution:\n{df['sentiment'].value_counts().sort_index()}")

min_sentiment = df['sentiment'].min()
max_sentiment = df['sentiment'].max()

print(f"\nMin sentiment: {min_sentiment}")
print(f"Max sentiment: {max_sentiment}")
df['sentiment'] = df['sentiment'].map({min_sentiment: 0, max_sentiment: 1})
df['sentiment'] = df['sentiment'].map({min_sentiment: 0, max_sentiment: 1})


print(df.shape)
print(f"Negative: {(df['sentiment'] == 0).sum()}")
print(f"Positive: {(df['sentiment'] == 1).sum()}")


print("\n5 Negative examples:")
for text in df[df['sentiment'] == 0]['cleaned_text'].head(5):
    print(f"  - {text}")

print("\n5 Positive examples:")
for text in df[df['sentiment'] == 1]['cleaned_text'].head(5):
    print(f"  - {text}")


if len(df) < 100:
    print("\nWARNING: Dataset too small! Need at least 50 samples.")
    print("Please check your CSV file or use a larger dataset.")
    print("\nData looks good! Proceeding with training...")


X_train, X_test, y_train, y_test = train_test_split(
        df['cleaned_text'],
        df['sentiment'],
        test_size=0.2,
        random_state=42,
        stratify=df['sentiment']
    )

print(f'\nTraining size: {len(X_train)}')
print(f'Test size: {len(X_test)}')

Shape: (0, 7)
Sentiment distribution:
Series([], Name: count, dtype: int64)

Min sentiment: nan
Max sentiment: nan
(0, 7)
Negative: 0
Positive: 0

5 Negative examples:

5 Positive examples:

Please check your CSV file or use a larger dataset.

Data looks good! Proceeding with training...


ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

In [8]:
Vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))
X_train_tfidf = Vectorizer.fit_transform(X_train)
X_test_tfidf = Vectorizer.transform(X_test)

print("TF-IDF shape (train):", X_train_tfidf.shape)
print("TF-IDF shape (test):", X_test_tfidf.shape)

TF-IDF shape (train): (208, 4)
TF-IDF shape (test): (52, 4)


In [9]:
bnb = BernoulliNB()
bnb.fit(X_train_tfidf, y_train)


bnb_pred = bnb.predict(X_test_tfidf)

print("\nPrediction distribution:")
print(pd.Series(bnb_pred).value_counts())

print("Accuracy:", accuracy_score(y_test, bnb_pred))
print("\nConfusion matrix:")
print(confusion_matrix(y_test, bnb_pred, labels=[0, 1]))

print('\nClassification Report:\n', classification_report(y_test, bnb_pred, target_names=['negative', 'positive'], zero_division=0, labels=[0, 1]))


Prediction distribution:
1    45
0     7
Name: count, dtype: int64
Accuracy: 0.4423076923076923

Confusion matrix:
[[ 2 24]
 [ 5 21]]

Classification Report:
               precision    recall  f1-score   support

    negative       0.29      0.08      0.12        26
    positive       0.47      0.81      0.59        26

    accuracy                           0.44        52
   macro avg       0.38      0.44      0.36        52
weighted avg       0.38      0.44      0.36        52



In [10]:
lr = LogisticRegression(max_iter=1000, random_state=42)
lr.fit(X_train_tfidf, y_train)

lr_pred = lr.predict(X_test_tfidf)

print(f"\nAccuracy: {accuracy_score(y_test, lr_pred):.4f}")

print('\nClassification Report:')
print(classification_report(
    y_test,
    lr_pred,
    target_names=['Negative', 'Positive']
))


svm = LinearSVC(max_iter=1000, random_state=42)

svm.fit(X_train_tfidf, y_train)

svm_pred = svm.predict(X_test_tfidf)

print(f"\nAccuracy: {accuracy_score(y_test, svm_pred):.4f}")

print('\nClassification Report:')
print(classification_report(
    y_test,
    svm_pred,
    labels=[0, 1],
    target_names=['Negative', 'Positive']
))


results = {
    'Bernoulli Naive Bayes': accuracy_score(y_test, bnb_pred),
    'Logistic Regression': accuracy_score(y_test, lr_pred),
    'Linear SVM': accuracy_score(y_test, svm_pred)
}

for model, acc in sorted(results.items(), key=lambda x: x[1], reverse=True):
    print(f"{model}: {acc:.4f}")



Accuracy: 0.4423

Classification Report:
              precision    recall  f1-score   support

    Negative       0.29      0.08      0.12        26
    Positive       0.47      0.81      0.59        26

    accuracy                           0.44        52
   macro avg       0.38      0.44      0.36        52
weighted avg       0.38      0.44      0.36        52


Accuracy: 0.4423

Classification Report:
              precision    recall  f1-score   support

    Negative       0.29      0.08      0.12        26
    Positive       0.47      0.81      0.59        26

    accuracy                           0.44        52
   macro avg       0.38      0.44      0.36        52
weighted avg       0.38      0.44      0.36        52

Bernoulli Naive Bayes: 0.4423
Logistic Regression: 0.4423
Linear SVM: 0.4423
