In [7]:
import numpy as np 
import pandas as pd 
import seaborn as sns 
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from sklearn.metrics import roc_curve, auc, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from mlxtend.plotting import plot_confusion_matrix
import matplotlib.cm as cm
from matplotlib import rcParams
from collections import Counter
from nltk.tokenize import RegexpTokenizer
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Dense, Dropout, Input, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Dense, Dropout, Input, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.layers import MaxPooling1D

# Download NLTK resources
nltk.download('stopwords')
nltk.download('wordnet', "/kaggle/working/nltk_data/")
nltk.download('omw-1.4', "/kaggle/working/nltk_data/")
! unzip /kaggle/working/nltk_data/corpora/wordnet.zip -d /kaggle/working/nltk_data/corpora
! unzip /kaggle/working/nltk_data/corpora/omw-1.4.zip -d /kaggle/working/nltk_data/corpora
nltk.data.path.append("/kaggle/working/nltk_data/")

# Load dataset
dataa = pd.read_csv("/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv", encoding="ISO-8859-1", engine="python",
                   names=["label", "time", "date", "query", "username", "text"])


[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /kaggle/working/nltk_data/...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /kaggle/working/nltk_data/...
Archive:  /kaggle/working/nltk_data/corpora/wordnet.zip
   creating: /kaggle/working/nltk_data/corpora/wordnet/
  inflating: /kaggle/working/nltk_data/corpora/wordnet/lexnames  
  inflating: /kaggle/working/nltk_data/corpora/wordnet/data.verb  
  inflating: /kaggle/working/nltk_data/corpora/wordnet/index.adv  
  inflating: /kaggle/working/nltk_data/corpora/wordnet/adv.exc  
  inflating: /kaggle/working/nltk_data/corpora/wordnet/index.verb  
  inflating: /kaggle/working/nltk_data/corpora/wordnet/cntlist.rev  
  inflating: /kaggle/working/nltk_data/corpora/wordnet/data.adj  
  inflating: /kaggle/working/nltk_data/corpora/wordnet/index.adj  
  inflating: /kaggle/working/nltk_data/corpora/word

In [10]:
#ASSIGNING 1 TO POSITIVE SENTIMENT 4
data=dataa
data.loc[data['label'] == 4, 'label'] = 1
# Filter the original dataset to separate positive and negative tweets
data_pos = data[data['label'] == 1]
data_neg = data[data['label'] == 0]

# Take only the first 10,000 tweets
data_pos = data_pos.head(10000)
data_neg = data_neg.head(10000)

# Concatenate positive and negative tweets
data = pd.concat([data_pos, data_neg])

In [13]:
data.head

<bound method NDFrame.head of         label        time                          date     query  \
800000      1  1467822272  Mon Apr 06 22:22:45 PDT 2009  NO_QUERY   
800001      1  1467822273  Mon Apr 06 22:22:45 PDT 2009  NO_QUERY   
800002      1  1467822283  Mon Apr 06 22:22:46 PDT 2009  NO_QUERY   
800003      1  1467822287  Mon Apr 06 22:22:46 PDT 2009  NO_QUERY   
800004      1  1467822293  Mon Apr 06 22:22:46 PDT 2009  NO_QUERY   
...       ...         ...                           ...       ...   
9995        0  1550729779  Sat Apr 18 07:05:12 PDT 2009  NO_QUERY   
9996        0  1550730633  Sat Apr 18 07:05:23 PDT 2009  NO_QUERY   
9997        0  1550731192  Sat Apr 18 07:05:29 PDT 2009  NO_QUERY   
9998        0  1550731281  Sat Apr 18 07:05:30 PDT 2009  NO_QUERY   
9999        0  1550731500  Sat Apr 18 07:05:32 PDT 2009  NO_QUERY   

             username                                               text  
800000          ersle       I LOVE @Health4UandPets u guys r the b

In [14]:

# Preprocessing
def preprocess_text(text):
    text = text.lower()  # Convert text to lowercase
    text = re.sub('@[^\s]+', ' ', text)  # Remove emails
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))', ' ', text)  # Remove URLs
    text = re.sub('[0-9]+', '', text)  # Remove numbers
    text = re.sub(r'(.)\1+', r'\1', text)  # Remove repeating characters
    translator = str.maketrans('', '', string.punctuation)  # Remove punctuations
    text = text.translate(translator)
    return text

stopwords_list = set(stopwords.words('english'))
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stopwords_list])

stemmer = PorterStemmer()
def stem_text(text):
    return [stemmer.stem(word) for word in text]

lemmatizer = WordNetLemmatizer()
def lemmatize_text(text):
    return [lemmatizer.lemmatize(word) for word in text]

tokenizer = RegexpTokenizer(r'\w+')

data['text'] = data['text'].apply(preprocess_text)
data['text'] = data['text'].apply(remove_stopwords)
data['text'] = data['text'].apply(tokenizer.tokenize)
data['text'] = data['text'].apply(stem_text)
data['text'] = data['text'].apply(lemmatize_text)

# Prepare data for model input
X = data['text']
y = data['label']

max_len = 500
tok = Tokenizer(num_words=2000)
tok.fit_on_texts(X)
sequences = tok.texts_to_sequences(X)
sequences_matrix = pad_sequences(sequences, maxlen=max_len)


In [16]:

# Splitting data
X_train, X_test, Y_train, Y_test = train_test_split(sequences_matrix, y, test_size=0.3, random_state=2)
# Define CNN Model 1
def cnn_model_1(max_len=500): 
    inputs = Input(name='inputs', shape=[max_len])
    layer = Embedding(2000, 50, input_length=max_len)(inputs)
    layer = Conv1D(64, 3, activation='relu')(layer)
    layer = GlobalMaxPooling1D()(layer)
    layer = Dense(128, activation='relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1, activation='sigmoid')(layer)
    model = Model(inputs=inputs, outputs=layer)
    model.compile(loss='binary_crossentropy', optimizer=RMSprop(), metrics=['accuracy'])
    return model

model_1 = cnn_model_1()
history_1 = model_1.fit(X_train, Y_train, batch_size=80, epochs=6, validation_split=0.1)

accr_1 = model_1.evaluate(X_test, Y_test)
print('CNN Model 1 - Test set\n  Accuracy: {:0.2f}'.format(accr_1[1]))


Epoch 1/6
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m54s[0m 309ms/step - accuracy: 0.5668 - loss: 0.6779 - val_accuracy: 0.6936 - val_loss: 0.5797
Epoch 2/6
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 268ms/step - accuracy: 0.7534 - loss: 0.5183 - val_accuracy: 0.7336 - val_loss: 0.5436
Epoch 3/6
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 263ms/step - accuracy: 0.7903 - loss: 0.4630 - val_accuracy: 0.7286 - val_loss: 0.5489
Epoch 4/6
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 257ms/step - accuracy: 0.8169 - loss: 0.4210 - val_accuracy: 0.7179 - val_loss: 0.5580
Epoch 5/6
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 250ms/step - accuracy: 0.8413 - loss: 0.3817 - val_accuracy: 0.7143 - val_loss: 0.5725
Epoch 6/6
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 228ms/step - accuracy: 0.8750 - loss: 0.3255 - val_accuracy: 0.7107 - val_loss: 0.5999
[1m188/188[0m 

In [18]:

# CNN Model 2
def cnn_model_2(max_len=500): 
    inputs = Input(name='inputs', shape=[max_len])
    layer = Embedding(2000, 50, input_length=max_len)(inputs)
    layer = Conv1D(128, 5, activation='relu')(layer)
    layer = MaxPooling1D(2)(layer)
    layer = Conv1D(64, 5, activation='relu')(layer)
    layer = GlobalMaxPooling1D()(layer)
    layer = Dense(256, activation='relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1, activation='sigmoid')(layer)
    model = Model(inputs=inputs, outputs=layer)
    model.compile(loss='binary_crossentropy', optimizer=RMSprop(), metrics=['accuracy'])
    return model


model_2 = cnn_model_2()
history_2 = model_2.fit(X_train, Y_train, batch_size=80, epochs=6, validation_split=0.1)
accr_2 = model_2.evaluate(X_test, Y_test)
print('CNN Model 2 - Test set\n  Accuracy: {:0.2f}'.format(accr_2[1]))


Epoch 1/6
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 290ms/step - accuracy: 0.5199 - loss: 0.6912 - val_accuracy: 0.6500 - val_loss: 0.6186
Epoch 2/6
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 140ms/step - accuracy: 0.7257 - loss: 0.5516 - val_accuracy: 0.7371 - val_loss: 0.5387
Epoch 3/6
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 192ms/step - accuracy: 0.7886 - loss: 0.4623 - val_accuracy: 0.7093 - val_loss: 0.5605
Epoch 4/6
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 134ms/step - accuracy: 0.8170 - loss: 0.4171 - val_accuracy: 0.7036 - val_loss: 0.6196
Epoch 5/6
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 95ms/step - accuracy: 0.8504 - loss: 0.3537 - val_accuracy: 0.7029 - val_loss: 0.6088
Epoch 6/6
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 71ms/step - accuracy: 0.8957 - loss: 0.2715 - val_accuracy: 0.6979 - val_loss: 0.7040
[1m188/188[0m [

In [19]:
# CNN Model 3
def cnn_model_3(max_len=500): 
    inputs = Input(name='inputs', shape=[max_len])
    layer = Embedding(2000, 50, input_length=max_len)(inputs)
    layer = Conv1D(64, 3, activation='relu')(layer)
    layer = MaxPooling1D(2)(layer)
    layer = Conv1D(64, 3, activation='relu')(layer)
    layer = GlobalMaxPooling1D()(layer)
    layer = Dense(128, activation='relu')(layer)
    layer = Dropout(0.5)(layer)
    layer = Dense(1, activation='sigmoid')(layer)
    model = Model(inputs=inputs, outputs=layer)
    model.compile(loss='binary_crossentropy', optimizer=RMSprop(), metrics=['accuracy'])
    return model


model_3 = cnn_model_3()
history_3 = model_3.fit(X_train, Y_train, batch_size=80, epochs=6, validation_split=0.1)
accr_3 = model_3.evaluate(X_test, Y_test)
print('CNN Model 3 - Test set\n  Accuracy: {:0.2f}'.format(accr_3[1]))


Epoch 1/6
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 321ms/step - accuracy: 0.5409 - loss: 0.6863 - val_accuracy: 0.7050 - val_loss: 0.5850
Epoch 2/6
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 259ms/step - accuracy: 0.7371 - loss: 0.5393 - val_accuracy: 0.7286 - val_loss: 0.5402
Epoch 3/6
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 247ms/step - accuracy: 0.7789 - loss: 0.4759 - val_accuracy: 0.7300 - val_loss: 0.5419
Epoch 4/6
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 225ms/step - accuracy: 0.8125 - loss: 0.4241 - val_accuracy: 0.7100 - val_loss: 0.5964
Epoch 5/6
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 209ms/step - accuracy: 0.8452 - loss: 0.3760 - val_accuracy: 0.7071 - val_loss: 0.5966
Epoch 6/6
[1m158/158[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 185ms/step - accuracy: 0.8635 - loss: 0.3293 - val_accuracy: 0.7071 - val_loss: 0.6632
[1m188/188[0m 