In [32]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.decomposition import TruncatedSVD
from gensim.models import Word2Vec
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Load the dataset
df = pd.read_csv("amazon_reviews_features.txt", sep="\t")

# Preprocessing
# Combine REVIEW_TITLE and REVIEW_TEXT into a single column
df['REVIEW'] = df['REVIEW_TITLE'] + ' ' + df['REVIEW_TEXT']

# Function for text preprocessing
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Tokenize text
    tokens = nltk.word_tokenize(text)
    # Remove special characters and punctuation
    tokens = [re.sub(r'[^a-zA-Z0-9\s]', '', token) for token in tokens]
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    return ' '.join(tokens)

# Apply preprocessing to REVIEW_TEXT column
df['REVIEW_TEXT'] = df['REVIEW_TEXT'].apply(preprocess_text)

# Feature extraction
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=10000)
tfidf_matrix = tfidf_vectorizer.fit_transform(df['REVIEW'])
tfidf_weights = tfidf_matrix.toarray()

# Select top 2000 n-grams based on their tf-idf scores
total_tfidf_scores = np.sum(tfidf_weights, axis=0)
top_indices = total_tfidf_scores.argsort()[-2000:][::-1]
top_ngrams = [tfidf_vectorizer.get_feature_names_out()[i] for i in top_indices]

# Dimensionality reduction for n-grams
X_ngrams_selected = SelectKBest(score_func=f_classif, k=2000).fit_transform(tfidf_weights, df['TARGET'])
X_ngrams_lsa = TruncatedSVD(n_components=1500, random_state=42).fit_transform(X_ngrams_selected)



# Word Embeddings
content_review_tokens = [text.split() for text in df['REVIEW']]
skipgram_model = Word2Vec(sentences=content_review_tokens, vector_size=100, window=5, min_count=1, sg=1)

# Average Word Embedding
avg_embedding_reviews = []
for tokens in content_review_tokens:
    embeddings = [skipgram_model.wv[word] for word in tokens if word in skipgram_model.wv]
    avg_embedding = np.mean(embeddings, axis=0) if embeddings else np.zeros(100)
    avg_embedding_reviews.append(avg_embedding)

# Emotion Features
emotion_X = df[['VERIFIED_PURCHASE', 'OPI_FIN_POS', 'OPI_FIN_NEG', 'BL_POS', 'BL_NEG', 'AFINN_POS',
                'AFINN_NEG', 'S140_POS', 'S140_NEG', 'SWN_POS', 'SWN_NEG',
                'NRC_HASH_POS', 'NRC_HASH_NEG', 'EMOTICON_POS', 'EMOTICON_NEG',
                'NRC_ANGER', 'NRC_ANTICIPATION', 'NRC_DISGUST', 'NRC_FEAR', 'NRC_JOY',
                'NRC_SADNESS', 'NRC_SURPRISE', 'NRC_TRUST', 'NRC_EXP_ANGER',
                'NRC_EXP_ANTICIPATION', 'NRC_EXP_DISGUST', 'NRC_EXP_FEAR',
                'NRC_EXP_JOY', 'NRC_EXP_SADNESS', 'NRC_EXP_SURPRISE', 'NRC_EXP_TRUST']].values.tolist()

# Combine features
X = np.concatenate((X_ngrams_lsa, avg_embedding_reviews, emotion_X), axis=1)

# Train-test split
# X_train = np.concatenate((X[:8400], X[10500:18900]))
# X_test = np.concatenate((X[8400:10500], X[18900:]))
# y_train = np.concatenate((df['TARGET'].values[:8400], df['TARGET'].values[10500:18900]))
# y_test = np.concatenate((df['TARGET'].values[8400:10500], df['TARGET'].values[18900:]))

X_train, X_test, y_train, y_test = train_test_split(X, df['TARGET'].values, test_size=0.23, random_state=42)


# Define the DFFNN model
model = Sequential()
model.add(Dense(200, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.20))
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(50, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=15, batch_size=64, validation_data=(X_test, y_test))
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

Epoch 1/15


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.5290 - loss: 0.8066 - val_accuracy: 0.6828 - val_loss: 0.6343
Epoch 2/15
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7122 - loss: 0.5877 - val_accuracy: 0.7950 - val_loss: 0.4881
Epoch 3/15
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7898 - loss: 0.4979 - val_accuracy: 0.8039 - val_loss: 0.4604
Epoch 4/15
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8008 - loss: 0.4742 - val_accuracy: 0.7706 - val_loss: 0.4951
Epoch 5/15
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8118 - loss: 0.4565 - val_accuracy: 0.7905 - val_loss: 0.4686
Epoch 6/15
[1m253/253[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.8084 - loss: 0.4584 - val_accuracy: 0.8106 - val_loss: 0.4474
Epoch 7/15
[1m253/253[0m [32m━━━━━━━