In [1]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# DATASET
df = pd.read_csv(r'C:\Users\nh013\Desktop\500k ChatGPT-related Tweets Jan-Mar 2023\Twitter Jan Mar.csv')

# CONVERT TO LOWERCASE
df['content'] = df['content'].str.lower()

# REMOVE URLS
df['content'] = df['content'].apply(lambda x: re.sub(r'http\S+', '', str(x)))

# REMOVE SPECIAL CHARACTERS
df['content'] = df['content'].apply(lambda x: re.sub(r'[^\w\s]', '', str(x)))

# TOKENIZATION
df['content'] = df['content'].apply(lambda x: word_tokenize(x))

# STOPWORD REMOVAL
stop_words = set(stopwords.words('english'))
df['content'] = df['content'].apply(lambda x: [word for word in x if word not in stop_words])

# LEMMATIZATION
lemmatizer = WordNetLemmatizer()
df['content'] = df['content'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

# SANTIMENT ANALYSIS USING VADER
sia = SentimentIntensityAnalyzer()
df['sentiment_polarity'] = df['content'].apply(lambda x: sia.polarity_scores(' '.join(x))['compound'])

# CONVERT SENTIMENT SCORE TO CATEGORICAL LEBEL
df['sentiment_category'] = df['sentiment_polarity'].apply(lambda x: 'Positive' if x > 0 else 'Negative' if x < 0 else 'Neutral')

#SPLIT DATA
X = df['content'].apply(lambda x: ' '.join(x))
y = df['sentiment_category']

# COUNTVECTORIZER
vectorizer = CountVectorizer()
X_vectorized = vectorizer.fit_transform(X)

# MODEL
nb_classifier = MultinomialNB()

# TRAIN MODEL
nb_classifier.fit(X_vectorized, y)

# PREDICT SENTIMENT POLARITY
X_test = ['This is a positive tweet', 'I am not happy with the product']
X_test_vectorized = vectorizer.transform(X_test)
y_pred = nb_classifier.predict(X_test_vectorized)


for tweet, sentiment in zip(X_test, y_pred):
    print(f"Tweet: {tweet}")
    print(f"Sentiment: {sentiment}")
    print()


y_train_pred = nb_classifier.predict(X_vectorized)
accuracy = accuracy_score(y, y_train_pred)
print(f"Training Accuracy: {accuracy}")


Tweet: This is a positive tweet
Sentiment: Positive

Tweet: I am not happy with the product
Sentiment: Positive

Training Accuracy: 0.8305741986576967
