In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer

import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score, recall_score, roc_auc_score

In [2]:
df = pd.read_csv('https://raw.githubusercontent.com/campusx-official/jupyter-masterclass/main/tweet_emotions.csv')
df.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [3]:
df.drop(columns=['tweet_id'], inplace=True)

In [4]:
final_df= df[df['sentiment'].isin(['happiness', 'sadness'])]

In [6]:
final_df.sample(5)

Unnamed: 0,sentiment,content
20546,happiness,morning! just got my coffee
36786,happiness,"@stevelensink thanks stephen, appreciate it"
37367,happiness,"Ps Brian just announced his message title, &qu..."
28080,happiness,Listening to Miley Cyrus Breakout CD ! love it!
33544,happiness,just watched Star Trek.. I liked it I'm makin...


In [7]:
final_df.shape

(10374, 2)

In [8]:
final_df.head()

Unnamed: 0,sentiment,content
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
6,sadness,"I should be sleep, but im not! thinking about ..."
8,sadness,@charviray Charlene my love. I miss you
9,sadness,@kelcouch I'm sorry at least it's Friday?


In [9]:
train_data, test_data = train_test_split(final_df, test_size=0.2, random_state=42)

In [10]:
nltk.download('wordnet')
nltk.download('stopwords')

def lemmatization(text):
  lemmatizer = WordNetLemmatizer()

  text = text.split()

  text = [lemmatizer.lemmatize(y) for y in text]

  return " ".join(text)

def remove_stopwords(text):
  stop_words = set(stopwords.words('english'))
  Text = [i for i in str(text).split() if i not in stop_words]
  return " ".join(Text)

def removing_numbers(text):
  text = ''.join([i for i in text if not i.isdigit()])
  return text

def lower_case(text):
  text = text.split()
  text = [y.lower() for y in text]

  return " ".join(text)

def removing_punctuations(text):
  ##Remove punctuations
  text = re.sub('[%s]' % re.escape("""!"#$%&'()*+,،-./:;<=>؟?@[\]^_`{|}~"""), ' ', text)
  text = text.replace('؛',"", )

  ##remove extra white space
  text = re.sub('\s+', ' ',text)
  text = " ".join(text.split())

  return text.strip()

def remove_urls(text):
  url_pattern = re.compile(r'http?://\S+|www\.\S+')
  return url_pattern.sub(r'', text)


def remove_small_stentences(df):
  for i in range(len(df)):
    if len(df.text.iloc[i].split()) <3:
      df.text.iloc[i] = np.nan

def normalize_text(df):
  df.content = df.content.apply(lambda content: lower_case(content))
  df.content = df.content.apply(lambda content: remove_stopwords(content))
  df.content = df.content.apply(lambda content: removing_numbers(content))
  df.content = df.content.apply(lambda content: removing_punctuations(content))
  df.content = df.content.apply(lambda content: remove_urls(content))
  df.content = df.content.apply(lambda content: lemmatization(content))
  return df


def normalize_sentence(sentence):
  sentence = lower_case(sentence)
  sentence = remove_stopwords(sentence)
  sentence = removing_numbers(sentence)
  sentence = removing_punctuations(sentence)
  sentence = remove_urls(sentence)
  sentence = lemmatization(sentence)
  return sentence




[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [11]:
normalize_sentence("That's it? It's done already! This is one")

'that s it done already one'

In [12]:
train_data = normalize_text(train_data)
test_data = normalize_text(test_data)

In [13]:
train_data

Unnamed: 0,sentiment,content
23531,sadness,quot my problem miss you cause don t quot
8051,sadness,that s it done already one proof there s nothi...
11499,sadness,hungry food steal
31288,happiness,foot hurt finally bed will forget crunch over ...
18561,sadness,really ill atm
...,...,...
21697,happiness,chocolatesuze yes yes should especially wine m...
19445,sadness,kickzfadayz boy better get tonight
20216,happiness,tafe actually quite good
3258,sadness,minute boarding hour home window seat


In [14]:
X_train = train_data['content'].values
y_train = train_data['sentiment'].values

X_test = test_data['content'].values
y_test = test_data['sentiment'].values

In [16]:
##Apply Bag of words (CountVectorizer)
vectorizer = CountVectorizer()

#Fit the vectorizer on the training data and transform it
X_train_bow = vectorizer.fit_transform(X_train)

#Transform the test data using the same vectorizer
x_test_bow = vectorizer.transform(X_test)

In [18]:
train_df = pd.DataFrame(X_train_bow.toarray())

train_df['label'] = y_train
train_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14223,14224,14225,14226,14227,14228,14229,14230,14231,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,sadness
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,sadness
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,sadness
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,happiness
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,sadness


In [19]:
!pip install classification_report

Collecting classification_report
  Downloading classification_report-1.0.0.tar.gz (24 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sklearn (from classification_report)
  Downloading sklearn-0.0.post12.tar.gz (2.6 kB)
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py egg_info[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m See above for output.
  
  [1;35mnote[0m: This error originates from a subprocess, and is likely not a problem with pip.
  Preparing metadata (setup.py) ... [?25l[?25herror
[1;31merror[0m: [1mmetadata-generation-failed[0m

[31m×[0m Encountered error while generating package metadata.
[31m╰─>[0m See above for output.

[1;35mnote[0m: This is an issue with the package mentioned above, not pip.
[1;36mhint[0m: See above for details.


In [21]:
from sklearn.preprocessing import LabelEncoder

# Encode the string labels to numerical labels
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

#Define and train the XGBoost Model
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train_bow, y_train_encoded) # Use encoded labels for training

y_pred_encoded = xgb_model.predict(x_test_bow) # Predict encoded labels

# Decode the predicted labels back to string labels for evaluation
y_pred = label_encoder.inverse_transform(y_pred_encoded)

accuracy = accuracy_score(y_test, y_pred)
# The classification_report library seems to have an issue with installation,
# so I will use the classification_report from sklearn.metrics instead.
from sklearn.metrics import classification_report
classification_rep = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("Classification Report:\n", classification_rep)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.771566265060241
Classification Report:
               precision    recall  f1-score   support

   happiness       0.80      0.71      0.75      1015
     sadness       0.75      0.83      0.79      1060

    accuracy                           0.77      2075
   macro avg       0.77      0.77      0.77      2075
weighted avg       0.77      0.77      0.77      2075



In [26]:
# Make predictions
y_pred_encoded = xgb_model.predict(x_test_bow) # Predict encoded labels

# Decode the predicted labels back to string labels for evaluation
y_pred = label_encoder.inverse_transform(y_pred_encoded)

y_pred_proba = xgb_model.predict_proba(x_test_bow)[:, 1]

# Calculate evaluation metrics
precision = precision_score(y_test, y_pred, pos_label='happiness')
recall = recall_score(y_test, y_pred, pos_label='happiness')
auc = roc_auc_score(y_test_encoded, y_pred_proba) # Use encoded labels for AUC

print("Precision:", precision)
print("Recall:", recall)
print("AUC:", auc)

Precision: 0.7988950276243094
Recall: 0.7123152709359606
AUC: 0.8595775629705362
