In [1]:
#import libraries
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

#Load dataset
file_path = '/content/twitter_sentiment_data.csv'
data = pd.read_csv(file_path)
#data = pd.read_excel(file_path)

In [2]:
#Download NLTK stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english')) #set of common stop words in english

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
#pre-process text
def preprocess_text(text):
  # lower case text
  text = str(text).lower()
  # remove non-alphabet or punctuation
  text = ''.join([char if char.isalpha() or char.isspace() else '' for char in text])
  # remove stop words -- filler words
  text = ''.join([word for word in text.split() if word not in stop_words])
  return text

In [4]:
data['cleaned_message'] = data['message'].apply(preprocess_text)
print(data['cleaned_message'])

0        tiniebeanyclimatechangeinterestinghustleglobal...
1        rtnatgeochannelwatchbeforethefloodrightleodica...
2        fabulousleonardodicapriosfilmclimatechangebril...
3        rtmickfanningwatchedamazingdocumentaryleonardo...
4        rtcnalivepranitabiswasilutheranodishagivestest...
                               ...                        
43938    dearrealdonaldtrumpyeahrighthumanmediatedclima...
43939    respectivepartiespreventclimatechangegloballyz...
43940    rtmikkilunpollshowsclimatechangelowestglobalco...
43941    rttaehbeingextrastillcanqtbelievegiftaehyungsa...
43942    likeabatzachhallerwealthyfossilfuelindustrykno...
Name: cleaned_message, Length: 43943, dtype: object


In [5]:
# convert text into numerical data
vectorizer = CountVectorizer(max_features = 5000)
X = vectorizer.fit_transform(data['cleaned_message']).toarray() # transform cleaned txt to feature vectors
y = data['sentiment'] # target variable (sentiment labels)

In [6]:
# train our model
# split testing and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [7]:
# Build our model -- Logistic Regression
model = LogisticRegression(max_iter = 1000)
model.fit(X_train, y_train)

In [9]:
# Evaluate our model
y_pred = model.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report: \n", classification_report(y_test, y_pred))

Accuracy: 0.530208214813972
Classification Report: 
               precision    recall  f1-score   support

          -1       1.00      0.04      0.08       784
           0       0.98      0.03      0.06      1582
           1       0.52      1.00      0.69      4514
           2       1.00      0.03      0.07      1909

    accuracy                           0.53      8789
   macro avg       0.88      0.28      0.22      8789
weighted avg       0.75      0.53      0.38      8789



In [10]:

# Testing with a custom tweet
custom_tweet = "The product is amazing and I absolutely love it!"  # Example custom input
custom_cleaned = preprocess_text(custom_tweet)  # Preprocess the custom tweet
custom_vectorized = vectorizer.transform([custom_cleaned]).toarray()  # Transform the text into feature vectors
prediction = model.predict(custom_vectorized)  # Predict sentiment for the custom tweet
print("Sentiment:", "Positive" if prediction[0] > 0 else "Negative")  # Output the sentiment


Sentiment: Positive
