In [29]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split


import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch every NLTK resource the notebook relies on:
#   - 'stopwords' and 'wordnet' back the stopword filter and the lemmatizer;
#   - 'punkt' / 'punkt_tab' back nltk.word_tokenize (the resource name depends on
#     the installed NLTK release, so both are requested).
# quiet=True keeps the cell output short; nltk.download returns False instead of
# raising when a resource name is unknown to the installed NLTK index.
for nltk_resource in ('stopwords', 'wordnet', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource, quiet=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Mohamed\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Mohamed\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [22]:

# Lazily populated cache so the lemmatizer and the stopword set are built once,
# not once per tweet.
_TWEET_RESOURCES = {}


def _tweet_resources():
    """Return the shared (lemmatizer, stopword set) pair, building it on first use."""
    if not _TWEET_RESOURCES:
        _TWEET_RESOURCES['lemmatizer'] = WordNetLemmatizer()
        _TWEET_RESOURCES['stopwords'] = frozenset(stopwords.words('english'))
    return _TWEET_RESOURCES['lemmatizer'], _TWEET_RESOURCES['stopwords']


def clean_tweet(tweet):
    """Normalize a raw tweet into a space-separated string of lemmatized tokens.

    Steps, in order: strip URLs, replace @mentions, #hashtags (including the
    tagged word) and every non-word character with a space, delete digits,
    lowercase, collapse whitespace, tokenize, drop English stopwords and
    lemmatize what remains.

    Requires the NLTK 'stopwords', 'wordnet' and Punkt tokenizer data.

    Args:
        tweet: Raw tweet text. Non-string values (e.g. NaN for a missing cell)
            are treated as empty text.

    Returns:
        The cleaned tweet as a single string; empty if nothing survives.
    """
    # Missing values coming from pandas are floats (NaN); re.sub would raise on them.
    if not isinstance(tweet, str):
        return ''

    # Remove URLs
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)

    # Remove mentions, hashtags, and special characters
    tweet = re.sub(r'\@\w+|\#\w+|\W', ' ', tweet)

    # Remove digits and convert to lowercase
    tweet = re.sub(r'\d+', '', tweet).lower()

    # Remove extra spaces
    tweet = re.sub(r'\s+', ' ', tweet).strip()

    # Tokenize the tweet
    tokens = nltk.word_tokenize(tweet)

    # Remove stopwords and lemmatize the tokens
    lemmatizer, stopwords_set = _tweet_resources()
    tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stopwords_set]

    # Join the cleaned tokens back into a string
    return ' '.join(tokens)


In [33]:
def accuracy(y, y_hat):
    """Return the fraction of predictions that match the true labels.

    Both inputs are converted to NumPy arrays first, so lists, tuples, arrays
    and pandas Series are all compared element by element, by position.

    Args:
        y: True labels.
        y_hat: Predicted labels, same shape as ``y``.

    Returns:
        Accuracy as a float in [0, 1].

    Raises:
        ValueError: If the inputs have different shapes or are empty.
    """
    y_true = np.asarray(y)
    y_pred = np.asarray(y_hat)

    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y and y_hat must have the same shape, got {y_true.shape} and {y_pred.shape}"
        )
    if y_true.size == 0:
        raise ValueError("accuracy is undefined for empty input")

    # Mean of the boolean match mask == (number of matches) / (number of samples).
    return float(np.mean(y_true == y_pred))

In [23]:
class MultinomialNaiveBayes:
    """Multinomial Naive Bayes classifier for count features.

    After ``fit`` the instance exposes:
      - ``classes``: sorted unique labels seen in training;
      - ``class_probs``: prior probability of each class (relative frequency);
      - ``feature_probs``: per-class feature probabilities with additive
        smoothing, shape (n_classes, n_features).
    """

    def __init__(self, alpha=1.0):
        """Store the additive smoothing strength ``alpha`` (1.0 = Laplace smoothing)."""
        self.alpha = alpha

    def fit(self, X, y):
        """Estimate class priors and smoothed per-class feature probabilities.

        Args:
            X: Count matrix of shape (n_samples, n_features).
            y: Labels, one per row of ``X`` (any array-like, including a
                pandas Series; only the values are used, not the index).
        """
        y = np.asarray(y)
        n_features = X.shape[1]

        self.classes, counts = np.unique(y, return_counts=True)
        self.class_probs = counts / len(y)

        self.feature_probs = np.zeros((len(self.classes), n_features))

        for i, c in enumerate(self.classes):
            # Total count of each feature over the samples of class c.
            feature_counts = np.asarray(X[y == c].sum(axis=0), dtype=float).ravel()
            # Smoothed estimate: (count + alpha) / (total count + alpha * n_features).
            self.feature_probs[i, :] = (feature_counts + self.alpha) / (
                feature_counts.sum() + self.alpha * n_features
            )

    def predict(self, X):
        """Return the most probable class for each row of ``X``.

        Args:
            X: Count matrix of shape (n_samples, n_features), with the same
                feature order as the matrix used in ``fit``.

        Returns:
            Array of shape (n_samples,) holding elements of ``classes``; the
            label dtype is preserved (labels are not coerced to float).
        """
        log_priors = np.log(self.class_probs)
        log_likelihoods = np.log(self.feature_probs)

        # One matrix product scores every sample against every class:
        # scores[s, k] = log P(k) + sum_f X[s, f] * log P(f | k)
        scores = np.asarray(X @ log_likelihoods.T) + log_priors

        return self.classes[np.argmax(scores, axis=1)]


In [24]:
# Read only the two columns the model uses.
raw_tweets = pd.read_csv("Tweets.csv", usecols=["text", "airline_sentiment"])

# Define the mapping for sentiment values
sentiment_mapping = {'negative': -1, 'neutral': 0, 'positive': 1}

# Series.map yields NaN for any label that is not a key of the mapping, so an
# unexpected label is reported here instead of silently reaching the model.
sentiment_codes = raw_tweets['airline_sentiment'].map(sentiment_mapping)
unknown_labels = raw_tweets.loc[sentiment_codes.isna(), 'airline_sentiment'].unique()
if len(unknown_labels) > 0:
    raise ValueError(f"Unexpected airline_sentiment values: {unknown_labels.tolist()}")

# Build the working frame in one step rather than assigning columns onto a
# column-subset of another frame.
df = pd.DataFrame({
    'text': raw_tweets['text'].apply(clean_tweet),
    'airline_sentiment': sentiment_codes.astype(int),
})
df.head()

Unnamed: 0,text,airline_sentiment
0,said,0
1,plus added commercial experience tacky,1
2,today must mean need take another trip,0
3,really aggressive blast obnoxious entertainmen...,-1
4,really big bad thing,-1


In [30]:
# Stratify on the label column itself so both splits keep the class proportions.
# (A bare `y` is not defined at this point in a top-to-bottom run.)
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['airline_sentiment'],
    stratify=df['airline_sentiment'],
    test_size=0.3,
    random_state=31,
)

In [31]:
# Training labels, under the short name the following cells use.
y = y_train

# Learn the vocabulary from the training tweets and encode them as a dense
# matrix of token counts.
vectorizer = CountVectorizer()
X_counts = vectorizer.fit_transform(X_train)
X = X_counts.toarray()

In [32]:
# Train the classifier on the training counts; alpha=1.0 is the class default.
model = MultinomialNaiveBayes(alpha=1.0)
model.fit(X, y)

In [40]:
# Accuracy on the data the model was fitted on.
y_hat = model.predict(X)
accuracy(y=y, y_hat=y_hat)

0.8389929742388759

In [35]:
# Encode the held-out tweets with the vocabulary learned from the training set.
# The counts get their own name so `X_test` keeps the raw text and this cell
# can be re-run without failing.
X_test_counts = vectorizer.transform(X_test).toarray()
y_hat = model.predict(X_test_counts)
accuracy(y_test, y_hat)

0.7622950819672131

In [39]:
test_corpus = [
    'This company is very bad and staff rude',
    'The best company that I know, I love it',
]

# Apply the same cleaning the training tweets went through, so the sentences
# are tokenized and lemmatized consistently with the fitted vocabulary.
cleaned_corpus = [clean_tweet(sentence) for sentence in test_corpus]

# Separate name: the held-out split stored in `X_test` is left untouched.
X_corpus = vectorizer.transform(cleaned_corpus).toarray()
y_pred = model.predict(X_corpus)

print(y_pred)

[-1.  1.]
