In [1]:
import pandas as pd
import numpy as np

In [2]:
d2 = pd.read_csv("Airline-Sentiment-2-w-AA.csv", encoding='ISO-8859–1')
tweets = d2['text']
sentiment = d2['airline_sentiment']

In [3]:
import re
special_char = ['!', '#', '%', '"', '&', '*', '(', ')', ';', 'ûï', '\x89ûï:', '\x89û\x9d']

def process_tweet(tweet, special_char):
    for char in special_char:
        tweet = tweet.replace(char, "")
        
    urls = re.findall('http://\S+|https://\S+', tweet)
    airline_tags = re.findall('@\w+', tweet)
    
    for tag in airline_tags:
        tweet = tweet.replace(tag, "")
        
    for url in urls:
        tweet = tweet.replace(url, "")

        
    return tweet.lower()

In [4]:
procesed_tweets = []

for tweet in tweets:
    _tweet = process_tweet(tweet, special_char=special_char)
    procesed_tweets.append(_tweet)

In [6]:
processed_sentiment = []

for sent in sentiment:
    if sent == 'neutral':
        processed_sentiment.append('Pos')
    elif sent == 'positive':
        processed_sentiment.append('Pos')
    else:
        processed_sentiment.append('Neg')

In [7]:
df_2 = pd.DataFrame(columns = ["Review", "Sentiment"])

for i in range(len(procesed_tweets)):
    row = pd.Series({'Review' : procesed_tweets[i], 'Sentiment' : processed_sentiment[i]})
    df_2 = pd.concat([df_2, row.to_frame().T], ignore_index = True)

In [8]:
vocab = df_2['Review']

In [10]:
#import glove embeddings 
from tqdm import tqdm
embedding_vector = {}
f = open('glove.6B.200d.txt')
for line in tqdm(f):
    value = line.split(' ')
    word = value[0]
    coef = np.array(value[1:],dtype = 'float32')
    embedding_vector[word] = coef

400000it [00:07, 53140.96it/s]


In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [12]:
token = Tokenizer()
token.fit_on_texts(vocab)

In [13]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df_2, test_size=0.1, random_state=42)

In [14]:
x_train = list(train['Review'])
y_train = train['Sentiment']

In [15]:
vocab_size = len(token.word_index)+1
print(vocab_size)

13777


In [16]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

seq = token.texts_to_sequences(x_train)
pad_seq = pad_sequences(seq, maxlen=1100, padding = 'post')

In [17]:
from tqdm import tqdm
embedding_matrix = np.zeros((vocab_size,200))
for word,i in tqdm(token.word_index.items()):
    embedding_value = embedding_vector.get(word)
    if embedding_value is not None:
        embedding_matrix[i] = embedding_value

100%|████████████████████████████████████| 13776/13776 [00:00<00:00, 151943.40it/s]


In [18]:
#convert positive-negative to 1-0
sentiment = {
    "Pos" : 0,
    "Neg" : 1
}

y_filtered_converted = []
for sent in y_train:
    y_filtered_converted.append(sentiment[sent])

In [19]:
y_filtered_converted = np.int64(y_filtered_converted)

In [22]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense,Dropout,Embedding,Bidirectional, Conv1D, Flatten, MaxPooling1D

In [23]:
from tensorflow import keras

In [25]:
model_1 = Sequential()
model_1.add(Embedding(vocab_size, 200, weights = [embedding_matrix], input_length = 1100, trainable = False))

model_1.add(Conv1D(256, 3, activation='tanh', padding = 'same'))
model_1.add(Conv1D(128, 3, activation='tanh', padding = 'same'))

model_1.add(MaxPooling1D(pool_size = 3, strides = 2, padding = 'same'))

model_1.add(Dropout(0.2))

model_1.add(Bidirectional(LSTM(256, return_sequences = True)))
model_1.add(Bidirectional(LSTM(256, return_sequences = True, kernel_regularizer = keras.regularizers.L2(1e-4))))
model_1.add(Bidirectional(LSTM(128)))

model_1.add(Dense(10, activation = 'relu'))

model_1.add(Dense(1,activation = 'sigmoid'))

model_1.compile(optimizer='adam',loss='binary_crossentropy',metrics = ['accuracy'])

In [26]:
model_1.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 1100, 200)         2755400   
                                                                 
 conv1d_2 (Conv1D)           (None, 1100, 256)         153856    
                                                                 
 conv1d_3 (Conv1D)           (None, 1100, 128)         98432     
                                                                 
 max_pooling1d (MaxPooling1D  (None, 550, 128)         0         
 )                                                               
                                                                 
 dropout (Dropout)           (None, 550, 128)          0         
                                                                 
 bidirectional (Bidirectiona  (None, 550, 512)         788480    
 l)                                                   

In [27]:
history_1 = model_1.fit(pad_seq, 
                    y_filtered_converted,
                    batch_size=128, 
                    verbose=1, 
                    epochs=8,
                    validation_split=0.2)

2023-02-13 12:28:21.759245: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [30]:
model_2 = Sequential()
model_2.add(Embedding(vocab_size, 200, weights = [embedding_matrix], input_length = 1100, trainable = False))

model_2.add(Conv1D(256, 3, activation='tanh', padding = 'same'))
model_2.add(Conv1D(128, 3, activation='tanh', padding = 'same'))

model_2.add(MaxPooling1D(pool_size = 3, strides = 2, padding = 'same'))

model_2.add(Dropout(0.5))

model_2.add(Bidirectional(LSTM(256, return_sequences = True, kernel_regularizer = keras.regularizers.L2(1e-4))))
model_2.add(Bidirectional(LSTM(256, return_sequences = True, kernel_regularizer = keras.regularizers.L2(1e-4))))
model_2.add(Bidirectional(LSTM(128)))

model_2.add(Dense(10, activation = 'relu'))
            
model_2.add(Dense(1,activation = 'sigmoid'))

model_2.compile(optimizer='adam',loss='binary_crossentropy',metrics = ['accuracy'])

In [31]:
history_2 = model_2.fit(pad_seq, 
                    y_filtered_converted,
                    batch_size=128, 
                    verbose=1, 
                    epochs=8,
                    validation_split=0.2)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
