In [43]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import re
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
import xgboost as xgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

### Data Ingestion

In [5]:
df = pd.read_csv('https://github.com/entbappy/Branching-tutorial/raw/refs/heads/master/tweet_emotions.csv')

In [6]:
df.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [7]:
df.drop(['tweet_id'],axis=1)

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...
...,...,...
39995,neutral,@JohnLloydTaylor
39996,love,Happy Mothers Day All my love
39997,love,Happy Mother's Day to all the mommies out ther...
39998,happiness,@niariley WASSUP BEAUTIFUL!!! FOLLOW ME!! PEE...


In [8]:
final_df = df[df['sentiment'].isin(['happiness','sadness'])] 

In [9]:
final_df.sample(5)

Unnamed: 0,tweet_id,sentiment,content
5876,1961081604,sadness,I miss my Jeep Now I have to bum a ride to th...
30984,1751785509,happiness,"A slice of pizza, a movie (Star Trek), and a s..."
5377,1960807693,happiness,"@bookaliciouspam Hey welcome to VA, LOL. It's..."
24196,1694790154,happiness,@paulina1 Good Morning and Have a Great Day at...
11775,1963347629,sadness,I'm at work....bored out of my mind.


In [10]:
final_df['sentiment'].value_counts()

sentiment
happiness    5209
sadness      5165
Name: count, dtype: int64

In [11]:
final_df['sentiment'] = final_df['sentiment'].replace({'happiness':1,'sadness':0})

  final_df['sentiment'] = final_df['sentiment'].replace({'happiness':1,'sadness':0})
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['sentiment'] = final_df['sentiment'].replace({'happiness':1,'sadness':0})


In [12]:
final_df


Unnamed: 0,tweet_id,sentiment,content
1,1956967666,0,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,0,Funeral ceremony...gloomy friday...
6,1956968487,0,"I should be sleep, but im not! thinking about ..."
8,1956969035,0,@charviray Charlene my love. I miss you
9,1956969172,0,@kelcouch I'm sorry at least it's Friday?
...,...,...,...
39986,1753905153,1,going to watch boy in the striped pj's hope i ...
39987,1753918809,1,"gave the bikes a thorough wash, degrease it an..."
39988,1753918818,1,"had SUCH and AMAZING time last night, McFly we..."
39994,1753918900,1,Succesfully following Tayla!!


### Data Preprocessing

In [13]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sahun\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sahun\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
def lemmatization(text):
    lemmatizer = WordNetLemmatizer()
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

In [15]:
def remove_stop_words(text):
    return ' '.join([word for word in text.split() if word not in stopwords.words('english')])


In [16]:
def removing_numbers(text):
    return''.join([word for word in text if not word.isdigit()])

In [17]:
def lower_case(text):
    return ' '.join([word.lower() for word in text.split()])

In [18]:
def remove_punctuation(text):
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

In [19]:
def remove_urls(text):
    return re.sub(r'http\S+', '', text)

In [20]:
def remove_small_sentences(text):
    for i in range(len(df)):
        if len(df['contnet'][i].split()) < 3:
            df.drop(i, inplace=True)

In [21]:
def normalize_text(df):
    df.content = df.content.apply(lower_case)
    df.content = df.content.apply(remove_stop_words)
    df.content = df.content.apply(removing_numbers)
    df.content = df.content.apply(remove_punctuation)
    df.content = df.content.apply(remove_urls)
    df.content = df.content.apply(lemmatization)
    return df


In [22]:
def normalize_sentence(sentence):
    sentence = lower_case(sentence)
    sentence = remove_stop_words(sentence)
    sentence = removing_numbers(sentence)
    sentence = remove_punctuation(sentence)
    sentence = remove_urls(sentence)
    sentence = lemmatization(sentence)
    return sentence

In [23]:
normalize_sentence("that's it? what the fuck are you guys foing?")

'thats it fuck guy foing'

In [25]:
final_df = normalize_text(final_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.content = df.content.apply(lower_case)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.content = df.content.apply(remove_stop_words)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.content = df.content.apply(removing_numbers)
A value is trying to be set on a copy of a slice from a DataFrame.

In [34]:
final_df.drop(['tweet_id'],axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.drop(['tweet_id'],axis=1, inplace=True)


In [35]:
final_df

Unnamed: 0,sentiment,content
1,0,layin n bed headache ughhhhwaitin call
2,0,funeral ceremonygloomy friday
6,0,sleep im not thinking old friend want married ...
8,0,charviray charlene love miss
9,0,kelcouch sorry least friday
...,...,...
39986,1,going watch boy striped pjs hope cry
39987,1,gave bike thorough wash degrease grease it thi...
39988,1,amazing time last night mcfly incredible
39994,1,succesfully following tayla


### Feature Engineering

In [36]:

y = final_df.iloc[:,:-1]
X = final_df.iloc[:,-1]


In [37]:
X

1                   layin n bed headache ughhhhwaitin call
2                            funeral ceremonygloomy friday
6        sleep im not thinking old friend want married ...
8                             charviray charlene love miss
9                              kelcouch sorry least friday
                               ...                        
39986                 going watch boy striped pjs hope cry
39987    gave bike thorough wash degrease grease it thi...
39988             amazing time last night mcfly incredible
39994                          succesfully following tayla
39998    niariley wassup beautiful follow me peep new h...
Name: content, Length: 10374, dtype: object

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)

In [39]:
vectorizer = CountVectorizer()

X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [40]:
X_train_vectorized.shape, X_test_vectorized.shape

((8299, 15055), (2075, 15055))

In [41]:
train_df = pd.DataFrame(X_train_vectorized.toarray()) 
train_df  

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,15045,15046,15047,15048,15049,15050,15051,15052,15053,15054
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8294,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8295,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8296,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8297,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Model Building

In [44]:
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_model.fit(X_train_vectorized, y_train)

y_pred = xgb_model.predict(X_test_vectorized)

accuracy = accuracy_score(y_test, y_pred)
classification_report = classification_report(y_test, y_pred)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [47]:
print(f'Accuracy: {accuracy}')

print('Classification Report:\n', classification_report)

Accuracy: 0.7619277108433735
Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.69      0.75      1060
           1       0.72      0.84      0.78      1015

    accuracy                           0.76      2075
   macro avg       0.77      0.76      0.76      2075
weighted avg       0.77      0.76      0.76      2075

