In [16]:

# Essentials
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
%matplotlib inline
import os

# Import functions for data preprocessing & data preparation
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer, LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import string
from string import punctuation
import nltk
import re

In [32]:
data = pd.read_csv('Downloads/Ariana Grande - we cant be friends (wait for your love) (official music video) 3-10-2024.csv')
data.columns
data1=data.drop(['isReply','isHearted','isPinned','isPaid','paidAmount','isSponsor','sponsorshipMonths'],axis=1)
data1

Unnamed: 0,publishedTime,SimpleText,votes,author
0,10-03-2024,We are grateful to Ari and her crew for select...,465,@HalisRioe
1,10-03-2024,Here for this! X,1,@naomisegal5115
2,10-03-2024,HERE FOR EVAN!!,4,@Ireallydintknow
3,10-03-2024,Who is Evan? Thanks.,0,@SofiNme365
4,10-03-2024,Ari inspires me.. My parents said if i get 3K ...,1,@PoorWorldwithoutgod
...,...,...,...,...
508,10-03-2024,Robyn Dancing on my own 🤗,0,@user-jv8il3br9e
509,10-03-2024,Tu música es lo mejor que me ha pasado,0,@irammorales1354
510,10-03-2024,Evan Peters 😭😭😭,0,@adamautumn2329
511,10-03-2024,He is evan Peters 😭😭😭😭😭😭,0,@biruth30


In [33]:
print(data.columns)

Index(['publishedTime', 'SimpleText', 'votes', 'author', 'isReply',
       'isHearted', 'isPinned', 'isPaid', 'paidAmount', 'isSponsor',
       'sponsorshipMonths'],
      dtype='object')


In [34]:

nltk.download('vader_lexicon')
sentiments = SentimentIntensityAnalyzer()
data1["Positive"] = [sentiments.polarity_scores(i)["pos"] for i in data1["SimpleText"]]
data1["Negative"] = [sentiments.polarity_scores(i)["neg"] for i in data1["SimpleText"]]
data1["Neutral"] = [sentiments.polarity_scores(i)["neu"] for i in data1["SimpleText"]]
data1['Compound'] = [sentiments.polarity_scores(i)["compound"] for i in data1["SimpleText"]]
score = data1["Compound"].values
sentiment = []
for i in score:
    if i >= 0.05 :
        sentiment.append('Positive')
    elif i <= -0.05 :
        sentiment.append('Negative')
    else:
        sentiment.append('Neutral')
data1["Sentiment"] = sentiment
data1.head()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,publishedTime,SimpleText,votes,author,Positive,Negative,Neutral,Compound,Sentiment
0,10-03-2024,We are grateful to Ari and her crew for select...,465,@HalisRioe,0.244,0.0,0.756,0.6705,Positive
1,10-03-2024,Here for this! X,1,@naomisegal5115,0.0,0.0,1.0,0.0,Neutral
2,10-03-2024,HERE FOR EVAN!!,4,@Ireallydintknow,0.0,0.0,1.0,0.0,Neutral
3,10-03-2024,Who is Evan? Thanks.,0,@SofiNme365,0.492,0.0,0.508,0.4404,Positive
4,10-03-2024,Ari inspires me.. My parents said if i get 3K ...,1,@PoorWorldwithoutgod,0.206,0.0,0.794,0.6908,Positive


In [35]:
data2=data1.drop(['Positive','Negative','Neutral','Compound'],axis=1)
data2.head()

Unnamed: 0,publishedTime,SimpleText,votes,author,Sentiment
0,10-03-2024,We are grateful to Ari and her crew for select...,465,@HalisRioe,Positive
1,10-03-2024,Here for this! X,1,@naomisegal5115,Neutral
2,10-03-2024,HERE FOR EVAN!!,4,@Ireallydintknow,Neutral
3,10-03-2024,Who is Evan? Thanks.,0,@SofiNme365,Positive
4,10-03-2024,Ari inspires me.. My parents said if i get 3K ...,1,@PoorWorldwithoutgod,Positive


In [37]:
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [38]:
stop_words = stopwords.words('english')
porter_stemmer = PorterStemmer()
lancaster_stemmer = LancasterStemmer() 
snowball_stemer = SnowballStemmer(language="english")
lzr = WordNetLemmatizer()

In [39]:


def text_processing(text):   
    # convert text into lowercase
    text = text.lower()

    # remove new line characters in text
    text = re.sub(r'\n',' ', text)
    
    # remove punctuations from text
    text = re.sub('[%s]' % re.escape(punctuation), "", text)
    
    # remove references and hashtags from text
    text = re.sub("^a-zA-Z0-9$,.", "", text)
    
    # remove multiple spaces from text
    text = re.sub(r'\s+', ' ', text, flags=re.I)
    
    # remove special characters from text
    text = re.sub(r'\W', ' ', text)

    text = ' '.join([word for word in word_tokenize(text) if word not in stop_words])
    
    # stemming using porter stemmer from nltk package - msh a7sn 7aga - momken: lancaster, snowball
    # text=' '.join([porter_stemmer.stem(word) for word in word_tokenize(text)])
    # text=' '.join([lancaster_stemmer.stem(word) for word in word_tokenize(text)])
    # text=' '.join([snowball_stemer.stem(word) for word in word_tokenize(text)])
    
    # lemmatizer using WordNetLemmatizer from nltk package
    text=' '.join([lzr.lemmatize(word) for word in word_tokenize(text)])

    return text

In [51]:
import nltk
nltk.download('punkt')
import nltk
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...


True

In [52]:
data_copy = data2.copy()
data_copy.SimpleText = data_copy.SimpleText.apply(lambda text: text_processing(text))
     

le = LabelEncoder()
data_copy['Sentiment'] = le.fit_transform(data_copy['Sentiment'])
     

processed_data = {
    'Sentence':data_copy.SimpleText,
    'Sentiment':data_copy['Sentiment']
}

processed_data = pd.DataFrame(processed_data)
processed_data.head()

Unnamed: 0,Sentence,Sentiment
0,grateful ari crew selecting evan enabling foll...,2
1,x,1
2,evan,1
3,evan thanks,2
4,ari inspires parent said get 3k like vid theyd...,2


In [53]:

processed_data['Sentiment'].value_counts()

Sentiment
1    246
2    195
0     72
Name: count, dtype: int64

In [54]:

df_neutral = processed_data[(processed_data['Sentiment']==1)] 
df_negative = processed_data[(processed_data['Sentiment']==0)]
df_positive = processed_data[(processed_data['Sentiment']==2)]

# upsample minority classes
df_negative_upsampled = resample(df_negative, 
                                 replace=True,    
                                 n_samples= 205, 
                                 random_state=42)  

df_neutral_upsampled = resample(df_neutral, 
                                 replace=True,    
                                 n_samples= 205, 
                                 random_state=42)  


# Concatenate the upsampled dataframes with the neutral dataframe
final_data = pd.concat([df_negative_upsampled,df_neutral_upsampled,df_positive])
     

final_data['Sentiment'].value_counts()
     

Sentiment
0    205
1    205
2    195
Name: count, dtype: int64

In [55]:

corpus = []
for sentence in final_data['Sentence']:
    corpus.append(sentence)
corpus[0:5]

['hurt',
 'recordó eterno resplandor de una mente sin recuerdos',
 'first sentence made cry',
 'ari absolutely broke heart one dont know would ever recover incredibly sad beautiful time',
 'el resumen perfecto de eterno resplandor de una mente sin recuerdos con música de ariana grande']

In [56]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=1500)
X = cv.fit_transform(corpus).toarray()
y = final_data.iloc[:, -1].values
     

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
     

from sklearn.naive_bayes import GaussianNB
     

classifier = GaussianNB()
classifier.fit(X_train, y_train)

In [57]:


from sklearn.metrics import confusion_matrix, accuracy_score
y_pred = classifier.predict(X_test)

In [58]:


cm = confusion_matrix(y_test, y_pred)
cm

array([[55,  1,  0],
       [ 5, 52,  7],
       [16,  6, 40]], dtype=int64)

In [60]:


nb_score = accuracy_score(y_test, y_pred)
nb_score
     

0.8076923076923077