<a href="https://colab.research.google.com/github/nicoleolivetto/MMD_Final_Project/blob/main/Different_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

LSTM

In [None]:
import re

import nltk
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from keras.callbacks import EarlyStopping
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.models import Sequential
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.utils import set_random_seed
from nltk.corpus import stopwords
from nltk.tokenize.toktok import ToktokTokenizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

column_names = ['sentiment', 'review']

# `names=` labels the two columns while the file is read, so the frame already
# carries 'sentiment' and 'review' and no follow-up rename is required.
financial_data = pd.read_csv('all-data.csv', names=column_names, encoding='latin1')

# Drop every row labelled 'neutral' (applied once; repeating the same filter
# on the already-filtered frame changes nothing).
financial_data = financial_data[financial_data['sentiment'] != 'neutral']

unique_sentiments = financial_data['sentiment'].unique()

print(unique_sentiments)

financial_data = financial_data.sort_index()

# Put the text column first, the label second.
financial_data = financial_data[['review', 'sentiment']]

print(financial_data.columns)


# Class balance of the remaining two labels.
positive_count = (financial_data['sentiment'] == 'positive').sum()
negative_count = (financial_data['sentiment'] == 'negative').sum()

print("Number of positive reviews:", positive_count)
print("Number of negative reviews:", negative_count)


# Character count of each raw review (vectorised string accessor).
financial_data['review_length'] = financial_data['review'].str.len()

average_length = financial_data['review_length'].mean()

print("Average length of reviews:", average_length)


nltk.download('stopwords')

# Data preprocessing
# The word-level tokenizer gets its own name: the notebook later binds
# `tokenizer` to a Keras Tokenizer, which has no `.tokenize` method, so sharing
# the name would break preprocess_text if it were called after that point.
word_tokenizer = ToktokTokenizer()
stopwords_list = set(stopwords.words('english'))

def preprocess_text(text):
    """Return a lower-cased, stopword-free, letters-only version of `text`.

    Steps, in order: strip HTML markup with BeautifulSoup, replace every
    character outside a-z/A-Z with a space, lower-case and tokenize, drop
    tokens found in `stopwords_list`, and join the rest with single spaces.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        Space-separated cleaned tokens (may be empty).
    """
    # Remove HTML tags
    # NOTE(review): the recorded run shows bs4 emitting a warning on this call;
    # presumably some inputs look like a file name or URL -- confirm.
    text = BeautifulSoup(text, "html.parser").get_text()
    # Remove non-alphabetic characters (digits and punctuation become spaces)
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Tokenize
    tokens = word_tokenizer.tokenize(text.lower())
    # Remove stopwords
    # NOTE(review): the NLTK English list is believed to include negations such
    # as "not"/"no"; confirm that dropping them is intended for sentiment data.
    tokens = [token for token in tokens if token not in stopwords_list]
    return ' '.join(tokens)

financial_data['clean_review'] = financial_data['review'].apply(preprocess_text)

# Tokenization and padding
max_length = 100   # every sequence is padded/truncated to this many tokens
vocab_size = 8000  # Tokenizer keeps only the most frequent words up to this cap

# Binary target: positive -> 1, negative -> 0.
labels = financial_data['sentiment'].map({'positive': 1, 'negative': 0})
if labels.isna().any():
    # .map() yields NaN for any label missing from the dict; fail loudly
    # instead of training on NaN targets.
    raise ValueError("Unexpected sentiment label(s): "
                     f"{sorted(financial_data.loc[labels.isna(), 'sentiment'].unique())}")
y = np.array(labels.astype(int))

# Train-test split
# Split row positions first so the vocabulary can be learned from training text
# only; `stratify=y` keeps the positive/negative ratio equal in both parts.
row_positions = np.arange(len(financial_data))
train_rows, test_rows = train_test_split(
    row_positions, test_size=0.3, random_state=42, stratify=y
)

# Fit the vocabulary on the training reviews only, so no test-set text
# influences the word index.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(financial_data['clean_review'].iloc[train_rows])

# Encode and pad the whole corpus with the train-fitted vocabulary.
sequences = tokenizer.texts_to_sequences(financial_data['clean_review'])
X = pad_sequences(sequences, maxlen=max_length)

X_train, X_test = X[train_rows], X[test_rows]
y_train, y_test = y[train_rows], y[test_rows]

# Model definition
embedding_dim = 100

# Seed the Python, NumPy and backend random generators before any weights are
# created, so that weight initialisation and batch shuffling start from the
# same state on every fresh run.
set_random_seed(42)

# Embedding -> single LSTM -> dense head with dropout -> sigmoid probability.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(LSTM(units=200))
model.add(Dense(units=64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(units=1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model training
# Stop once validation loss has not improved for 2 epochs and roll back to the
# best weights seen. This callback object is also passed to the later
# model_cnn / model_lstm_cnn fits, so it must stay defined under this name.
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

# 20% of the training rows are held out by Keras as the validation set.
history = model.fit(X_train, y_train, epochs=5, batch_size=8, validation_split=0.2, callbacks=[early_stopping])

# Model evaluation
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')
y_pred_prob = model.predict(X_test)
# Threshold the sigmoid output at 0.5 to obtain hard 0/1 labels.
y_pred = (y_pred_prob > 0.5).astype(int)

# Evaluation metrics
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


['negative' 'positive']
Index(['review', 'sentiment'], dtype='object')
Number of positive reviews: 1363
Number of negative reviews: 604
Average length of reviews: 132.61057447890187


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  text = BeautifulSoup(text, "html.parser").get_text()


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss: 0.45653823018074036, Test Accuracy: 0.7851099967956543
Accuracy: 0.7851099830795262
              precision    recall  f1-score   support

           0       0.67      0.65      0.66       191
           1       0.84      0.85      0.84       400

    accuracy                           0.79       591
   macro avg       0.75      0.75      0.75       591
weighted avg       0.78      0.79      0.78       591

[[124  67]
 [ 60 340]]


CNN

In [None]:
# CNN Model
# Every name the original cell imported is still imported here, once each.
from keras.callbacks import EarlyStopping
from keras.layers import (
    Conv1D,
    Dense,
    Dropout,
    Embedding,
    Flatten,
    GlobalMaxPooling1D,
    MaxPooling1D,
    Reshape,
)
from keras.models import Sequential

# Define the CNN model
model_cnn = Sequential()

# Embedding layer
# Uses the shared `embedding_dim` / `max_length` settings (instead of literal
# 100s) so this model stays in step with the LSTM and LSTM-CNN models and with
# the padding length of X_train / X_test.
model_cnn.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))

# First convolutional layer
model_cnn.add(Conv1D(filters=128, kernel_size=3, activation='relu'))
model_cnn.add(MaxPooling1D(pool_size=2))

# Second convolutional layer
model_cnn.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model_cnn.add(MaxPooling1D(pool_size=2))

# Flatten the output of the convolutional layers
model_cnn.add(Flatten())

# Fully connected layer
model_cnn.add(Dense(units=64, activation='relu'))

# Dropout layer
model_cnn.add(Dropout(0.5))

# Output layer
model_cnn.add(Dense(units=1, activation='sigmoid'))

# Compile the model
model_cnn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print model summary
model_cnn.summary()


# Training CNN model
# Reuses the `early_stopping` callback defined in the LSTM cell; 20% of the
# training rows are held out by Keras for validation.
history_cnn = model_cnn.fit(X_train, y_train, epochs=8, batch_size=128, validation_split=0.2, callbacks=[early_stopping])

# Evaluation CNN model
loss_cnn, accuracy_cnn = model_cnn.evaluate(X_test, y_test)
print(f'CNN Test Loss: {loss_cnn}, CNN Test Accuracy: {accuracy_cnn}')
y_pred_cnn_prob = model_cnn.predict(X_test)
# Threshold the sigmoid output at 0.5 to obtain hard 0/1 labels.
y_pred_cnn = (y_pred_cnn_prob > 0.5).astype(int)

# Evaluation metrics for CNN
print("CNN Model Accuracy:", accuracy_score(y_test, y_pred_cnn))
print("CNN Model Classification Report:\n", classification_report(y_test, y_pred_cnn))
print("CNN Model Confusion Matrix:\n", confusion_matrix(y_test, y_pred_cnn))

Model: "sequential_25"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_25 (Embedding)    (None, 100, 100)          800000    
                                                                 
 conv1d_28 (Conv1D)          (None, 98, 128)           38528     
                                                                 
 max_pooling1d_28 (MaxPooli  (None, 49, 128)           0         
 ng1D)                                                           
                                                                 
 conv1d_29 (Conv1D)          (None, 47, 64)            24640     
                                                                 
 max_pooling1d_29 (MaxPooli  (None, 23, 64)            0         
 ng1D)                                                           
                                                                 
 flatten_14 (Flatten)        (None, 1472)            

LSTM-CNN

In [None]:
from keras.layers import LSTM, Bidirectional

# LSTM-CNN hybrid, declared as one ordered layer list:
# embedding -> bidirectional LSTM (full sequence output) -> two conv/pool
# stages -> flatten -> dense head with dropout -> sigmoid probability.
model_lstm_cnn = Sequential([
    # Token ids -> dense vectors
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length),
    # return_sequences=True keeps one output per time step for the conv layers
    Bidirectional(LSTM(units=200, return_sequences=True)),
    # Convolution + pooling, stage 1
    Conv1D(filters=128, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    # Convolution + pooling, stage 2
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    # Collapse to a single feature vector per sample
    Flatten(),
    # Classifier head
    Dense(units=64, activation='relu'),
    Dropout(0.5),
    Dense(units=1, activation='sigmoid'),
])

# Binary classification setup: Adam optimiser, cross-entropy loss, accuracy metric.
model_lstm_cnn.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Show the layer table
model_lstm_cnn.summary()

# Fit on the training split; Keras holds out 20% of it for validation and the
# shared `early_stopping` callback watches the validation loss.
history_lstm_cnn = model_lstm_cnn.fit(
    X_train,
    y_train,
    epochs=6,
    batch_size=128,
    validation_split=0.2,
    callbacks=[early_stopping],
)

# Held-out test loss and accuracy
loss_lstm_cnn, accuracy_lstm_cnn = model_lstm_cnn.evaluate(X_test, y_test)
print(f'LSTM-CNN Test Loss: {loss_lstm_cnn}, LSTM-CNN Test Accuracy: {accuracy_lstm_cnn}')

# Predicted probabilities, then hard 0/1 labels at a 0.5 threshold
y_pred_lstm_cnn_prob = model_lstm_cnn.predict(X_test)
y_pred_lstm_cnn = (y_pred_lstm_cnn_prob > 0.5).astype(int)

# scikit-learn metrics on the hard labels
print("LSTM-CNN Model Accuracy:", accuracy_score(y_test, y_pred_lstm_cnn))
print("LSTM-CNN Model Classification Report:\n", classification_report(y_test, y_pred_lstm_cnn))
print("LSTM-CNN Model Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lstm_cnn))


Model: "sequential_26"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_26 (Embedding)    (None, 100, 100)          800000    
                                                                 
 bidirectional_3 (Bidirecti  (None, 100, 400)          481600    
 onal)                                                           
                                                                 
 conv1d_30 (Conv1D)          (None, 98, 128)           153728    
                                                                 
 max_pooling1d_30 (MaxPooli  (None, 49, 128)           0         
 ng1D)                                                           
                                                                 
 conv1d_31 (Conv1D)          (None, 47, 64)            24640     
                                                                 
 max_pooling1d_31 (MaxPooli  (None, 23, 64)          