In [2]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

In [4]:
# Load dataset
file_path = '/mnt/data/twitter.csv'
df = pd.read_csv("twitter.csv")
df

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation
...,...,...,...
31957,31958,0,ate @user isz that youuu?ðððððð...
31958,31959,0,to see nina turner on the airwaves trying to...
31959,31960,0,listening to sad songs on a monday morning otw...
31960,31961,1,"@user #sikh #temple vandalised in in #calgary,..."


In [6]:
# Rename columns for convenience
df = df[['label', 'tweet']]

In [7]:
custom_stopwords = set([
    'a', 'an', 'and', 'are', 'as', 'at', 'be', 'by', 'for', 'from',
    'has', 'he', 'in', 'is', 'it', 'its', 'of', 'on', 'that', 'the',
    'to', 'was', 'were', 'will', 'with', 'this', 'i', 'you', 'your', 'we', 'they'
])


In [8]:
def tokenize(text):
    return re.findall(r'\b[a-z]{2,}\b', text.lower())

# Text preprocessing
def preprocess(text):
    text = re.sub(r"http\S+", "", text)         # Remove URLs
    text = re.sub(r"@\w+", "", text)            # Remove mentions
    text = re.sub(r"[^a-zA-Z ]", "", text)       # Remove special chars & numbers
    tokens = tokenize(text)
    tokens = [w for w in tokens if w not in custom_stopwords]
    return ' '.join(tokens)


In [9]:
df['cleaned'] = df['tweet'].apply(preprocess)

# Vectorization
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['cleaned'])
y = df['label']


In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train model
model = MultinomialNB()
model.fit(X_train, y_train)
# Predict and evaluate
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9432191459408729
              precision    recall  f1-score   support

           0       0.97      0.97      0.97      5937
           1       0.61      0.57      0.59       456

    accuracy                           0.94      6393
   macro avg       0.79      0.77      0.78      6393
weighted avg       0.94      0.94      0.94      6393



In [12]:
# Function to predict sentiment
def predict_sentiment(text):
    cleaned = preprocess(text)
    vector = vectorizer.transform([cleaned])
    return model.predict(vector)[0]

# Example prediction
example = "I love how easy this app is to use!"
print("Predicted Sentiment:", predict_sentiment(example))

Predicted Sentiment: 0
