In [2]:
# Importing Essential Libraries

import numpy as np
import pandas as pd
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
import re, string, unicodedata
from string import punctuation

In [25]:
# Import data

data = pd.read_csv('/content/tweet_emotions.csv')

In [26]:
# Counting the Frequency of Sentiment Categories in the Dataset

data["sentiment"].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
neutral,8638
worry,8459
happiness,5209
sadness,5165
love,3842
surprise,2187
fun,1776
relief,1526
hate,1323
empty,827


In [27]:
#Identifying and Analyzing Missing Data in the Training Dataset

total = data.isnull().sum().sort_values(ascending=False)
percent = (data.isnull().sum()/data.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)


Unnamed: 0,Total,Percent
tweet_id,0,0.0
sentiment,0,0.0
content,0,0.0


In [28]:
# Converting Sentiment Labels to Categorical Data

data.sentiment = pd.Categorical(pd.factorize(data.sentiment)[0])

In [29]:
# Displaying the First Few Rows of the Training Dataset

data.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,0,@tiffanylue i know i was listenin to bad habi...
1,1956967666,1,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,1,Funeral ceremony...gloomy friday...
3,1956967789,2,wants to hang out with friends SOON!
4,1956968416,3,@dannycastillo We want to trade with someone w...


In [10]:
#Text Preprocessing Functions for URL Removal(Remove URLs from the text.),
# Special Character Cleaning(optionally retaining digits),
#Stemming (reduce words to their root form), and Final Text Cleaning (by removing stopwords and non-alphabetic words)

def hapus_url(text):
    return re.sub(r'http\S+','', text)
def remove_special_characters(text, remove_digits=True):
    pattern=r'[^a-zA-Z0-9\s]'
    text=re.sub(pattern,'',text)
    return text
def stemmer(text):
    ps=nltk.porter.PorterStemmer()
    text=' '.join([ps.stem(word) for word in text.split()])
    return text

def final_clean(text):
    final_text= []
    for i in text.split():
        if i.strip().lower() not in sw and i.strip().lower().isalpha():
            final_text.append(i.strip().lower())
    return " ".join(final_text)

In [11]:
# Comprehensive Text Cleaning Function

def clean(text):
    text = hapus_url(text)
    text = remove_special_characters(text, remove_digits=True)
    text = stemmer(text)
    text = final_clean(text)
    return text

In [30]:
from nltk.corpus import stopwords

# Ensure you have the NLTK stopwords corpus downloaded
import nltk
nltk.download('stopwords')

# Define 'sw' as a set of English stopwords
sw = set(stopwords.words('english'))

# Your functions can now use 'sw'
def final_clean(text):
    final_text = []
    for i in text.split():
        if i.strip().lower() not in sw and i.strip().lower().isalpha():
            final_text.append(i.strip().lower())
    return " ".join(final_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [31]:
# Applying Text Cleaning Function to Dataset Content Column

data['content'] = data['content'].apply(clean)

In [32]:
# Displaying DataFrame Columns

%matplotlib inline
data.columns

Index(['tweet_id', 'sentiment', 'content'], dtype='object')

In [33]:
# Removing the 'tweet_id' Column from the DataFrame

data.drop(['tweet_id'], axis=1, inplace=True)

In [17]:
# Importing TfidfVectorizer from scikit-learn for Text Feature Extraction

from sklearn.feature_extraction.text import TfidfVectorizer

In [34]:
# Initializing and Applying TfidfVectorizer to Transform Text Data

tfidf_vect = TfidfVectorizer()

text1 = tfidf_vect.fit_transform(data["content"])

In [35]:
# Extracting the 'sentiment' Column from the DataFrame

y = data["sentiment"]

In [20]:
# Splitting Data into Training and Test Sets Using train_test_split
 #(dividing the dataset into training and testing subsets)

from sklearn.model_selection import train_test_split

X_train, X_test,y_train,y_test = train_test_split(text1, y, test_size=0.3, random_state=123)

In [21]:
#Importing SVC and Classification Report Modules from scikit-learn
# the code is importing tools for classification and evaluating model performance

from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [22]:
# Displaying the Shape of the Training Data

print(X_train.shape)

(28000, 41297)


In [23]:
# Training an SVM Model with the Entire Dataset

model = SVC()
model.fit(text1,y)

In [24]:
# Making Predictions with the SVM Model and Evaluating Performance

pred_svm = model.predict(X_test)
print(classification_report(pred_svm, y_test))


              precision    recall  f1-score   support

           0       0.06      0.88      0.10        16
           1       0.81      0.89      0.85      1403
           2       0.01      1.00      0.03         3
           3       0.93      0.65      0.76      3740
           4       0.94      0.74      0.82      3204
           5       0.38      0.98      0.55       254
           6       0.77      0.82      0.79      1087
           7       0.41      0.98      0.58       221
           8       0.46      0.89      0.61       192
           9       0.85      0.78      0.82      1736
          10       0.09      1.00      0.16         4
          11       0.29      0.96      0.44       140
          12       0.00      0.00      0.00         0

    accuracy                           0.76     12000
   macro avg       0.46      0.81      0.50     12000
weighted avg       0.85      0.76      0.78     12000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


The classification report for the SVM model reveals the following key insights:

Overall Accuracy: The model achieves an overall accuracy of 76%, indicating a reasonably good performance in classifying the data.

Class-Specific Performance:

The model performs particularly well with classes 3, 4, and 9, showing high precision, recall, and f1-scores.
Classes 1, 6, and 7 also demonstrate strong performance, with high recall, which suggests that these classes are well-identified by the model.
However, there are notable weaknesses in other classes. For example, classes 0, 2, 10, and 12 show poor precision, recall, and f1-scores, indicating that these classes are either misclassified or underrepresented.
Macro and Weighted Averages:

The macro average f1-score is 0.50, reflecting the average performance across all classes without considering their support.
The weighted average f1-score of 0.78 takes into account the support of each class, showing better overall performance when considering the class distribution.
Recommendations:

To improve the model's performance, consider addressing the classes with lower precision and recall by exploring additional feature engineering, balancing class distributions, or tuning model parameters.
Further investigation into the misclassified samples may provide insights into why certain classes are not well-predicted and help refine the model.
Overall, while the model performs well on some classes, there is room for improvement, particularly for the classes with lower scores.