In [1]:
import pandas as pd

<h2>Data Collection</h2>

In [2]:
def load_dataset(file_path):
    """Read a CSV file into a DataFrame, decoding its bytes as Latin-1.

    Parameters:
    file_path: path (or any source accepted by ``pd.read_csv``) of the CSV file

    Returns:
    pandas.DataFrame with the parsed contents
    """
    frame = pd.read_csv(file_path, encoding='latin1')
    return frame


In [3]:
# Load the tweet CSV (the path is relative to the notebook's working directory) and preview it
df = load_dataset('../dataset/TwitterDataset.csv')
df.head()

Unnamed: 0,File Name,Caption,LABEL
0,1.txt,How I feel today #legday #jelly #aching #gym,negative
1,10.txt,@ArrivaTW absolute disgrace two carriages from...,negative
2,100.txt,This is my Valentine's from 1 of my nephews. I...,positive
3,1000.txt,betterfeelingfilms: RT via Instagram: First da...,neutral
4,1001.txt,Zoe's first love #Rattled @JohnnyHarper15,positive


In [4]:
# Remove the unneeded 'File Name' column.
# It is dropped by name (not by position) and with errors='ignore', so re-running this
# cell is a no-op instead of removing whichever column happens to be first at that time.
df = df.drop(columns=['File Name'], errors='ignore')
df

Unnamed: 0,Caption,LABEL
0,How I feel today #legday #jelly #aching #gym,negative
1,@ArrivaTW absolute disgrace two carriages from...,negative
2,This is my Valentine's from 1 of my nephews. I...,positive
3,betterfeelingfilms: RT via Instagram: First da...,neutral
4,Zoe's first love #Rattled @JohnnyHarper15,positive
...,...,...
4864,OMG. Well done #Eskom! 'Man dies during #LoadS...,positive
4865,Feelin' the love in here! #ValentinesDay #caring,positive
4866,#blue #eyes can't be #beaten,neutral
4867,LA CHUCHA LOUUU TE CHUPO LOS OJOS..!,neutral


In [5]:
# Rename 'Caption' -> 'tweet' and 'LABEL' -> 'sentiment' in a single pass
df = df.rename(columns={'Caption': 'tweet', 'LABEL': 'sentiment'})

# Integer codes used for the three sentiment classes
label_mapping = {'negative': 0, 'neutral': 2, 'positive': 4}

# Series.map turns every value that is missing from the mapping into NaN, which would
# silently corrupt the target column (e.g. on unexpected labels, or when this cell is
# executed a second time on already-encoded values). Fail loudly instead.
unexpected_labels = set(df['sentiment'].unique()) - set(label_mapping)
if unexpected_labels:
    raise ValueError(
        f"Unmapped sentiment labels: {sorted(unexpected_labels, key=str)}"
    )

df['sentiment'] = df['sentiment'].map(label_mapping)
df.head()

Unnamed: 0,tweet,sentiment
0,How I feel today #legday #jelly #aching #gym,0
1,@ArrivaTW absolute disgrace two carriages from...,0
2,This is my Valentine's from 1 of my nephews. I...,4
3,betterfeelingfilms: RT via Instagram: First da...,2
4,Zoe's first love #Rattled @JohnnyHarper15,4


<h2>Pre Processing Data</h2>

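* Cleaning data: removing @usernames, '#' symbols, URLs, punctuation and digits, then lower-casing, tokenising, removing stop words and stemming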

In [6]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

# Fetch the NLTK resources used by the preprocessing functions below.
nltk.download('stopwords')
nltk.download('punkt')
# Recent NLTK releases make word_tokenize load the 'punkt_tab' resource instead of the
# pickled 'punkt' models, so fetch it as well; older releases simply do not use it.
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
# Strip @mentions from the tweet
def remove_usernames(text):
    """Return ``text`` with every '@' that is followed by word characters removed,
    together with those word characters. A lone '@' is left untouched."""
    mention_pattern = re.compile(r'@\w+')
    return mention_pattern.sub('', text)

def remove_hashtags(text):
    """Delete every '#' character while keeping the word that followed it."""
    hash_pattern = re.compile(r'#')
    return hash_pattern.sub('', text)

def clean_text(text):
    """Remove URLs, non-word characters and digits, then normalise whitespace.

    The substitutions run in a fixed order:
      1. URL-like tokens starting with http / www / https are deleted;
      2. every non-word character becomes a space;
      3. every digit is deleted;
      4. runs of whitespace collapse to a single space.
    Leading and trailing whitespace is stripped from the result.
    """
    substitutions = (
        # Removing URLs
        (re.compile(r'http\S+|www\S+|https\S+', flags=re.MULTILINE), ''),
        # Removing special characters and numbers
        (re.compile(r'\W'), ' '),
        (re.compile(r'\d'), ''),
        # Removing extra spaces
        (re.compile(r'\s+'), ' '),
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)
    return text.strip()

def tokenize_text(text):
    """Split ``text`` into a list of tokens using NLTK's ``word_tokenize``."""
    tokens = word_tokenize(text)
    return tokens

def case_folding(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

def remove_stopwords(tokens):
    """Return the tokens that are not in NLTK's English stop-word list.

    Membership is an exact string comparison, so the result depends on the
    casing of the incoming tokens. Order of the surviving tokens is preserved.
    """
    english_stops = set(stopwords.words('english'))
    kept = []
    for token in tokens:
        if token in english_stops:
            continue
        kept.append(token)
    return kept

def stem_tokens(tokens):
    """Apply the Porter stemmer to each token and return the stems as a list."""
    porter = PorterStemmer()
    return list(map(porter.stem, tokens))

def preprocess_text(text):
    """Turn one raw tweet into a space-joined string of stemmed tokens.

    Steps, in order: drop @usernames, drop '#' symbols, clean URLs/punctuation/digits,
    lower-case, tokenise, remove stop words, stem.
    """
    # Character-level cleanup happens on the raw string first
    without_noise = clean_text(remove_hashtags(remove_usernames(text)))
    # Lower-case before tokenising so the stop-word filter sees lower-case tokens
    words = tokenize_text(case_folding(without_noise))
    stems = stem_tokens(remove_stopwords(words))
    return ' '.join(stems)

In [8]:
# Run preprocess_text on every raw tweet and store the result in a new column
df['cleaned_tweets'] = df['tweet'].apply(preprocess_text)

In [9]:
# Preview the frame, now including the 'cleaned_tweets' column
df.head()

Unnamed: 0,tweet,sentiment,cleaned_tweets
0,How I feel today #legday #jelly #aching #gym,0,feel today legday jelli ach gym
1,@ArrivaTW absolute disgrace two carriages from...,0,absolut disgrac two carriag bangor half way st...
2,This is my Valentine's from 1 of my nephews. I...,4,valentin nephew elat sometim littl thing bigge...
3,betterfeelingfilms: RT via Instagram: First da...,2,betterfeelingfilm rt via instagram first day f...
4,Zoe's first love #Rattled @JohnnyHarper15,4,zoe first love rattl


In [10]:
# Show only the rows whose sentiment code is 2 (neutral), restricted to the
# 'cleaned_tweets' and 'sentiment' columns
print(df[df['sentiment'] == 2][['cleaned_tweets', 'sentiment']])

                                         cleaned_tweets  sentiment
3     betterfeelingfilm rt via instagram first day f...          2
8     animalabus toronto puppi tortur offer k reward...          2
15    today stepbackintim betterthedevilyouknow what...          2
16    photo photograph got rumbl jungl beaten stone ...          2
20                                      sat plane shook          2
...                                                 ...        ...
4849  get friday night look sort newin lbd littlebla...          2
4859  scare veryscar hold owt hot come nearer david ...          2
4860  complet uniqu petrifi palm ear set sterl silve...          2
4866                                    blue eye beaten          2
4867                    la chucha louuu te chupo lo ojo          2

[1771 rows x 2 columns]


<h2>Feature Extraction</h2>

In [11]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.pipeline import Pipeline
from sklearn import metrics



In [12]:
# Features are the preprocessed tweet strings; targets are the integer sentiment codes
X = df['cleaned_tweets']
y = df['sentiment']

# Hold out 20% of the rows as a test set; random_state fixes the shuffle so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [29]:
# test_clf = Pipeline([
#     ('vect', CountVectorizer()),
#     ('tfidf', TfidfTransformer()),
#     ('clf', MultinomialNB()),
# ])

# test_clf.fit(X_train, y_train)

# # Predict on the test Data
# y_pred = test_clf.predict(X_test)

# #  Evaluate the model
# print(metrics.classification_report(y_test, y_pred))
# print(metrics.confusion_matrix(y_test, y_pred))
# print( "Accuracy" ,metrics.accuracy_score(y_test, y_pred))

In [13]:
# Fit the TF-IDF vectorizer on the training split only, then reuse the fitted
# vectorizer to transform the test split
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf  = vectorizer.transform(X_test)

In [14]:
# Train a multinomial Naive Bayes classifier on the TF-IDF training features
model = MultinomialNB()
model.fit(X_train_tfidf, y_train)

In [15]:
# Predict labels for the held-out test features and score them against the true labels
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)

In [16]:
# Print the overall accuracy followed by the per-class classification report
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Accuracy: 0.6724845995893224
              precision    recall  f1-score   support

           0       0.74      0.62      0.67       284
           2       0.63      0.60      0.61       367
           4       0.67      0.80      0.73       323

    accuracy                           0.67       974
   macro avg       0.68      0.67      0.67       974
weighted avg       0.68      0.67      0.67       974



In [21]:
# Make a prediction on new, unseen text
# Batch variant kept for reference (the list literal needs its opening '['):
# new_tweets = ["I love this!", "This is awful.", "It's okay, not great.", "I hate this.", "Okay, Not Bad Not Good", "Just doing.", "this "]
# new_tweets_cleaned = [preprocess_text(tweet) for tweet in new_tweets]

# Single-tweet trial: preprocess_text takes one string, so its result is wrapped in a list
new_tweets = "I hate you but i love you"
new_tweets_cleaned = [preprocess_text(new_tweets)]

# Vectorise with the already-fitted vectorizer (transform only, no refit)
new_tweets_tfidf = vectorizer.transform(new_tweets_cleaned)
print(new_tweets_tfidf)
predictions = model.predict(new_tweets_tfidf)
# Class probabilities for the same input
predictions_proba = model.predict_proba(new_tweets_tfidf)

print(predictions_proba)
print(predictions)

  (0, 4691)	0.5096364514306784
  (0, 3541)	0.8603898461587898
[[0.4021164 0.1836156 0.414268 ]]
[4]


<h2>Save the Model</h2>

In [35]:
import joblib
# Persist the trained Naive Bayes model; the reload cell further down reads 'model.pkl',
# so the file has to be written here for a fresh Restart & Run All to succeed.
joblib.dump(model, 'model.pkl')

In [36]:
# Persist the fitted TF-IDF vectorizer so new text can be transformed later without refitting
joblib.dump(vectorizer, 'vectorizer.pkl')

['vectorizer.pkl']

In [37]:
# Make a prediction on new data using the artifacts reloaded from disk
# Batch variant kept for reference (the list literal needs its opening '['):
# new_tweets = ["I love this!", "This is awful.", "It's okay, not great.", "I hate this.", "Okay, Not Bad Not Good", "Just doing.", "this "]
# new_tweets_cleaned = [preprocess_text(tweet) for tweet in new_tweets]

# Single-tweet trial

# Both files must exist in the working directory. joblib.load unpickles its input,
# so only load files that come from a trusted source.
model1 = joblib.load('model.pkl')
vectorizer1 = joblib.load('vectorizer.pkl')

new_tweets = "I normal This"


def predict(new_tweets):
    """Preprocess one raw tweet, vectorise it with ``vectorizer1`` and print the
    label predicted by ``model1``.

    The cleaned text, the sparse feature vector and the prediction are printed;
    nothing is returned.
    """
    cleaned_batch = [preprocess_text(new_tweets)]
    print("tweet cleaned",cleaned_batch)

    features = vectorizer1.transform(cleaned_batch)
    print(features)
    print(model1.predict(features))
    
# Run the reloaded model and vectorizer on the sample tweet
predict(new_tweets)
    

tweet cleaned ['normal']
  (0, 5493)	1.0
[2]


# FOR SVM

In [44]:
from sklearn.svm import SVC

In [45]:
# Fit a fresh TF-IDF vectorizer on the training split; this rebinds vectorizer,
# X_train_tfidf and X_test_tfidf
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf  = vectorizer.transform(X_test)

In [46]:
# probability=True makes SVC fit an extra cross-validated calibration step so that
# predict_proba is available. That step shuffles the data, so random_state is pinned
# to keep the probabilities (and the pickled model) reproducible between runs.
model = SVC(probability=True, random_state=42)
model.fit(X_train_tfidf, y_train)

In [47]:
# Predict labels for the held-out test features with the SVM and score them
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)

In [48]:
# Print the overall accuracy followed by the per-class classification report
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Accuracy: 0.7032854209445585
              precision    recall  f1-score   support

           0       0.73      0.67      0.70       284
           2       0.63      0.74      0.68       367
           4       0.80      0.69      0.74       323

    accuracy                           0.70       974
   macro avg       0.72      0.70      0.71       974
weighted avg       0.71      0.70      0.70       974



In [53]:
# Make a prediction on new, unseen text with the current model
# Batch variant kept for reference (the list literal needs its opening '['):
# new_tweets = ["I love this!", "This is awful.", "It's okay, not great.", "I hate this.", "Okay, Not Bad Not Good", "Just doing.", "this "]
# new_tweets_cleaned = [preprocess_text(tweet) for tweet in new_tweets]

# Single-tweet trial: preprocess_text takes one string, so its result is wrapped in a list
new_tweets = "normal"
new_tweets_cleaned = [preprocess_text(new_tweets)]

# Vectorise with the already-fitted vectorizer (transform only, no refit)
new_tweets_tfidf = vectorizer.transform(new_tweets_cleaned)
print(new_tweets_tfidf)
predictions = model.predict(new_tweets_tfidf)
predictions_proba = model.predict_proba(new_tweets_tfidf)
print(predictions_proba)
print(predictions)

  (0, 5493)	1.0
[[0.18124779 0.52127656 0.29747566]]
[2]


# Saving the SVM Model

In [32]:
import joblib
# Persist whatever estimator is currently bound to `model` as 'svm.pkl'
joblib.dump(model, 'svm.pkl')

['svm.pkl']

### Hard Coded Naive Bayes

In [33]:
# %run CustomNaiveBayes.py

In [34]:
import numpy as np
from scipy.sparse import csr_matrix

class CustomNaiveBayes:
    """Multinomial Naive Bayes classifier with add-one (Laplace) smoothing.

    Attributes (populated by ``fit``):
      class_probs   -- dict: class label -> prior probability of that class
      feature_probs -- dict: class label -> 1-D array of smoothed P(feature | class)
      classes       -- sorted array of the distinct labels seen in ``y``
      vocab         -- array of feature (column) indices
    """

    def __init__(self):
        self.class_probs = {}
        self.feature_probs = {}
        self.classes = None
        self.vocab = None

    def fit(self, X, y):
        """
        Fit the Naive Bayes classifier according to the training data.

        Parameters:
        X: sparse matrix or dense 2-D array, shape (n_samples, n_features)
           Training data (non-negative feature weights or counts)
        y: array-like, shape (n_samples,)
           Target labels
        """
        # Work on a plain ndarray so the boolean row masks below are positional,
        # whatever index a pandas Series may carry.
        y = np.asarray(y)
        n_features = X.shape[1]

        self.classes, class_counts = np.unique(y, return_counts=True)
        self.vocab = np.arange(n_features)  # Feature indices

        # Class priors: fraction of training samples in each class
        self.class_probs = {c: count / len(y) for c, count in zip(self.classes, class_counts)}

        # Smoothed per-class feature likelihoods
        self.feature_probs = {}
        for c in self.classes:
            class_rows = X[y == c]
            # Sparse sums come back as a 1 x n matrix; flatten to a 1-D array
            word_counts = np.asarray(class_rows.sum(axis=0)).ravel()
            # Add-one smoothing keeps unseen features from getting zero probability
            self.feature_probs[c] = (word_counts + 1) / (word_counts.sum() + n_features)

    def predict(self, X):
        """
        Perform classification on an array of test vectors X.

        Parameters:
        X: sparse matrix or dense 2-D array, shape (n_samples, n_features)
           Test data

        Returns:
        array, shape (n_samples,)
            Predicted target values

        Raises:
        ValueError: if called before ``fit``.
        """
        if self.classes is None:
            raise ValueError("CustomNaiveBayes must be fitted before calling predict")

        # Log-probabilities are computed once, not once per document.
        log_prior = np.log([self.class_probs[c] for c in self.classes])
        log_likelihood = np.log(
            np.vstack([self.feature_probs[c] for c in self.classes])
        )  # shape (n_classes, n_features)

        # One matrix product gives every sample's score for every class and
        # works for both sparse and dense 2-D input.
        scores = np.asarray(X @ log_likelihood.T) + log_prior

        # argmax returns the first maximum, i.e. ties go to the smallest class label
        return self.classes[np.argmax(scores, axis=1)]


In [35]:
# Train the hand-written classifier on the TF-IDF training matrix (this rebinds `model`)
model =  CustomNaiveBayes()
model.fit(X_train_tfidf, y_train)


In [36]:
# Predict labels for the held-out test features with the custom classifier and score them
y_pred = model.predict(X_test_tfidf)
accuracy = accuracy_score(y_test, y_pred)

In [37]:
# Print the overall accuracy followed by the per-class classification report
print(f'Accuracy: {accuracy}')
print(classification_report(y_test, y_pred))

Accuracy: 0.6724845995893224
              precision    recall  f1-score   support

           0       0.74      0.62      0.67       284
           2       0.63      0.60      0.61       367
           4       0.67      0.80      0.73       323

    accuracy                           0.67       974
   macro avg       0.68      0.67      0.67       974
weighted avg       0.68      0.67      0.67       974



### Save the model

In [38]:
import joblib
# Persist the object currently bound to `model` as 'custom_naive_bayes.pkl'
joblib.dump(model, 'custom_naive_bayes.pkl')

['custom_naive_bayes.pkl']

In [39]:
# Reload the classifier from disk. joblib.load unpickles its input, so only load trusted files.
model = joblib.load('custom_naive_bayes.pkl')


In [40]:
# Make a prediction on new, unseen text with the reloaded classifier
# Batch variant kept for reference (the list literal needs its opening '['):
# new_tweets = ["I love this!", "This is awful.", "It's okay, not great.", "I hate this.", "Okay, Not Bad Not Good", "Just doing.", "this "]
# new_tweets_cleaned = [preprocess_text(tweet) for tweet in new_tweets]

# Single-string trial: the whole string is preprocessed as one document
new_tweets = "I Love This, I hate this, normal this"
new_tweets_cleaned = [preprocess_text(new_tweets)]

# Vectorise with the already-fitted vectorizer (transform only, no refit)
new_tweets_tfidf = vectorizer.transform(new_tweets_cleaned)
# print(new_tweets_tfidf)
predictions = model.predict(new_tweets_tfidf)
print(predictions)

[4]


In [41]:
# Save the object currently bound to `model` under a second file name
joblib.dump(model, "custom.pkl")

['custom.pkl']