In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


from keras_preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.utils import to_categorical


from keras.models import Sequential
from keras.layers import Dense, Embedding,LSTM, SpatialDropout1D

import nltk
nltk.download("vader_lexicon")

from nltk.sentiment.vader import SentimentIntensityAnalyzer

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\znaya\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [2]:
tweets = pd.read_csv("Tweets2.csv")
tweets.head(3)

Unnamed: 0,id,local,sentiment,text
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...


In [3]:
tweets.sentiment.value_counts()

Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: sentiment, dtype: int64

In [4]:
tweets.loc[tweets['sentiment'] == 'Irrelevant', 'sentiment'] = 'Neutral'

In [5]:
tweets.sentiment.value_counts()

Neutral     31308
Negative    22542
Positive    20832
Name: sentiment, dtype: int64

In [6]:
tweets['text'].isnull().sum()

686

In [7]:
tweets.dropna(subset=['text'],inplace=True)

In [8]:
tweets['text'].isnull().sum()

0

### LTSM

In [9]:
tokenizer = Tokenizer(num_words=500)
tokenizer.fit_on_texts(tweets['text'])

X = tokenizer.texts_to_sequences(tweets['text'])

In [10]:
X = pad_sequences(X,maxlen=500,padding='post')

In [11]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(tweets['sentiment'])

y = to_categorical(y,num_classes=3)

In [12]:
model = Sequential()

model.add(Embedding(input_dim=len(tokenizer.word_index),output_dim=128,input_shape=(500,))) 
model.add(SpatialDropout1D(0.2))
model.add(LSTM(units=196,dropout=0.2,recurrent_dropout=0,activation='tanh',recurrent_activation='sigmoid',unroll=False,use_bias=True))
model.add(Dense(units=3,activation="softmax"))

  super().__init__(**kwargs)


In [13]:
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])
print(model.summary())

None


In [14]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [15]:
model.fit(x=X_train,
    y=y_train,
    batch_size=300,
    epochs=1,
    verbose=True)

[1m185/185[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m450s[0m 2s/step - accuracy: 0.4141 - loss: 1.0853


<keras.src.callbacks.history.History at 0x19569699eb0>

### Vader

In [26]:
vader = SentimentIntensityAnalyzer()

In [17]:
tweets['vader_sentiment'] = ''
tweets.head(3)

Unnamed: 0,id,local,sentiment,text,vader_sentiment
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,


In [104]:
for i in range(len(tweets)):
    sentiment = vader.polarity_scores(tweets.iloc[i,3])
    
    del sentiment['compound']
    
    tweets.iloc[i,4] = max(sentiment,key=sentiment.get)

In [110]:
tweets.vader_sentiment.value_counts()

neu    65581
pos     4755
neg     3660
Name: vader_sentiment, dtype: int64

In [111]:
#tweets.loc[tweets['vader_sentiment'] == 'neu','vader_sentiment'] = 'Neutral'
#tweets.loc[tweets['vader_sentiment'] == 'pos','vader_sentiment'] = 'Positive'
#tweets.loc[tweets['vader_sentiment'] == 'neg','vader_sentiment'] = 'Negative'

tweets['vader_sentiment'].replace('neu','Neutral',inplace=True)
tweets['vader_sentiment'].replace('pos','Positive',inplace=True)
tweets['vader_sentiment'].replace('neg','Negative',inplace=True)

#for i,z in zip(tweets.vader_sentiment.value_counts().items(),tweets.sentiment.value_counts().items()):
    #tweets['vader_sentiment'].replace(i[0],z[0],inplace=True)

In [113]:
tweets.vader_sentiment.value_counts()

Neutral     65581
Positive     4755
Negative     3660
Name: vader_sentiment, dtype: int64

In [114]:
tweets.head(3)

Unnamed: 0,id,local,sentiment,text,vader_sentiment
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...,Neutral
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...,Neutral
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...,Neutral


### Validate

In [117]:
from sklearn.metrics import confusion_matrix, accuracy_score

<b>Vader

In [118]:
cm1 = confusion_matrix(tweets.sentiment,tweets.vader_sentiment)
cm1

array([[ 2004, 19902,   452],
       [ 1122, 28384,  1477],
       [  534, 17295,  2826]], dtype=int64)

In [120]:
acc1 = accuracy_score(tweets.sentiment,tweets.vader_sentiment)
acc1

0.44886210065408944

<b>LTSM

In [16]:
_, acc = model.evaluate(X_test,y_test)
print(acc)

[1m579/579[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 128ms/step - accuracy: 0.4219 - loss: 1.0816
0.42261743545532227
