In [1]:
import pandas as pd
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from unidecode import unidecode
from nltk import ngrams
import numpy as np
import torch

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# The reviews are labelled as fake or real (in the dataset they’re mapped fake (label1) or real (label2)).
# https://medium.com/@lievgarcia/deception-on-amazon-c1e30d977cfd

df = pd.read_csv("amazon_reviews_features.txt", sep = "\t")   
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21000 entries, 0 to 20999
Data columns (total 39 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   DOC_ID                21000 non-null  int64  
 1   RATING                21000 non-null  int64  
 2   VERIFIED_PURCHASE     21000 non-null  int64  
 3   PRODUCT_CATEGORY      21000 non-null  object 
 4   PRODUCT_ID            21000 non-null  object 
 5   PRODUCT_TITLE         21000 non-null  object 
 6   REVIEW_TITLE          21000 non-null  object 
 7   REVIEW_TEXT           21000 non-null  object 
 8   OPI_FIN_POS           21000 non-null  int64  
 9   OPI_FIN_NEG           21000 non-null  int64  
 10  BL_POS                21000 non-null  int64  
 11  BL_NEG                21000 non-null  int64  
 12  AFINN_POS             21000 non-null  float64
 13  AFINN_NEG             21000 non-null  float64
 14  S140_POS              21000 non-null  float64
 15  S140_NEG           

In [3]:
num_fake = len(df[df['TARGET'] == 0])
num_real = len(df[df['TARGET'] == 1])

print(num_real, num_fake)

10500 10500


In [4]:
tokenizer = RegexpTokenizer(r'\w+')

# converting to lowercase and tokenizing
review_tokens = [tokenizer.tokenize(review.lower()) for review in df['REVIEW_TEXT']]

#removing special characters
review_tokens = [[unidecode(token) for token in review if token.isalnum()] for review in review_tokens]
" ".join(review_tokens[0])

'when least you think so this product will save the day just keep it around just in case you need it for something'

In [5]:
stop_words = set(stopwords.words("english"))
content_review_tokens = [[token for token in review if token not in stop_words and token.isalnum()] for review in review_tokens]

print("Before stop word removal: ", review_tokens[6914])
print()
print("After stop word removal: ", content_review_tokens[6914])

Before stop word removal:  ['love', 'the', 'bottle', 'very', 'much', 'br', 'br', 'iVm', 'a', 'tea', 'lover', 'when', 'i', 'saw', 'this', 'bottle', 'i', 'knew', 'that', 'it', 'was', 'what', 'i', 'wanted', 'the', 'shape', 'is', 'fantastic', 'feels', 'nice', 'in', 'your', 'hand', 'perfect', 'size', 'to', 'have', 'in', 'my', 'car', 'i', 'took', 'it', 'all', 'around', 'so', 'i', 'can', 'enjoy', 'my', 'tea', 'everywhere', 'love', 'it', 'very', 'much', 'the', 'one', 'with', 'infuser', 'also', 'looks', 'good']

After stop word removal:  ['love', 'bottle', 'much', 'br', 'br', 'iVm', 'tea', 'lover', 'saw', 'bottle', 'knew', 'wanted', 'shape', 'fantastic', 'feels', 'nice', 'hand', 'perfect', 'size', 'car', 'took', 'around', 'enjoy', 'tea', 'everywhere', 'love', 'much', 'one', 'infuser', 'also', 'looks', 'good']


In [6]:
#feature 1 of the dffnn model, top 2000 n grams according to their tfidf weights

from sklearn.feature_extraction.text import TfidfVectorizer

# Convert tokenized reviews back to strings
cleaned_reviews = [" ".join(review) for review in content_review_tokens]

# TF-IDF Vectorization
# Step 1: Tokenize into unigrams, bigrams, and trigrams
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=10000)
tfidf_matrix = tfidf_vectorizer.fit_transform(cleaned_reviews)


# Step 2: Calculate tf-idf weights for each n-gram
tfidf_weights = tfidf_matrix.toarray()  # Convert the sparse matrix to dense array
feature_names = tfidf_vectorizer.get_feature_names_out()  # Get the feature names (n-grams)

# Now you can access the TF-IDF weights for each n-gram
# for i, review in enumerate(cleaned_reviews):
#     print(f"TF-IDF weights for review {i+1}:")
#     for j, feature in enumerate(feature_names):
#         weight = tfidf_weights[i][j]
#         if weight > 0:  # To print only non-zero weights
#             print(f"{feature}: {weight}")
#     print("\n")


# Step 3: Select top 2000 n-grams based on their tf-idf scores
total_tfidf_scores = np.sum(tfidf_weights, axis=0)  # Sum TF-IDF scores across all documents
top_indices = total_tfidf_scores.argsort()[-2000:][::-1]  # Get indices of top 2000 scores in descending order

# Get the top 2000 n-grams and their corresponding TF-IDF scores
top_ngrams = [feature_names[i] for i in top_indices]
top_tfidf_scores = [total_tfidf_scores[i] for i in top_indices]

# Print the top 2000 n-grams and their TF-IDF scores
for i, (ngram, score) in enumerate(zip(top_ngrams, top_tfidf_scores), start=1):
    print(f"{i}. {ngram}: {score}")


1. br: 683.5375098455908
2. great: 485.67385779350764
3. one: 405.3027978587271
4. good: 404.02605479916946
5. like: 379.3174868725596
6. product: 339.8697354279675
7. love: 328.7302710917243
8. really: 328.21438431499655
9. use: 319.57330483986317
10. well: 316.9574173948692
11. would: 311.9556270334749
12. quality: 282.7200529018454
13. br br: 264.85096957259157
14. get: 263.62894814916916
15. time: 258.1832341503777
16. price: 256.3732679193606
17. easy: 240.8427188899182
18. bought: 239.4415912565312
19. nice: 230.6021592528332
20. much: 226.78623763812828
21. little: 220.82492781588758
22. 34: 213.82089672639142
23. also: 212.64020052500717
24. works: 207.00762473970457
25. recommend: 204.99685522715518
26. got: 204.78086642991764
27. work: 200.53023332237706
28. buy: 198.64007161997776
29. perfect: 189.91220569507732
30. used: 186.71283986113448
31. even: 183.0775828762537
32. made: 181.12588697644114
33. watch: 173.43014255462052
34. looks: 170.94993808906568
35. better: 168.570

In [7]:
# Matrix creation for 21000 reviews for top 2000 ngrams

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

def review_to_vector(review, top_ngrams, top_tfidf_scores):
    review_vector = np.zeros(len(top_ngrams))  # Initialize a vector for the review

    for ngram, weight in zip(top_ngrams, top_tfidf_scores):
        if ngram in review:  # Check if the n-gram is present in the review
            index = top_ngrams.index(ngram)  # Get the index of the n-gram in the top_ngrams list
            review_vector[index] = weight  # Assign the TF-IDF weight to the corresponding index in the review vector

    return review_vector

# Example: Convert each review to a vector representation
review_vectors = []
for review in cleaned_reviews:
    review_vector = review_to_vector(review, top_ngrams, top_tfidf_scores)
    review_vectors.append(review_vector)

# Convert the list of review vectors to a numpy array
X = np.array(review_vectors)


In [8]:
# Reducing Dimensionality from 2000 to 200 per review

y = df['TARGET']

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.decomposition import TruncatedSVD

# Step 1: Correlation-based Feature Selection (CBFS)
def correlation_based_feature_selection(X, y, k):
    selector = SelectKBest(score_func=f_classif, k=k)
    X_selected = selector.fit_transform(X, y)
    return X_selected

# Step 2: Latent Semantic Analysis (LSA)
def latent_semantic_analysis(X, n_components):
    lsa = TruncatedSVD(n_components=n_components, random_state=42)
    X_lsa = lsa.fit_transform(X)
    return X_lsa

# Feature selection and dimensionality reduction for ngrams
X_ngrams_selected = correlation_based_feature_selection(X, y, 500)
X_ngrams_lsa = latent_semantic_analysis(X_ngrams_selected, 200)



In [9]:
# Assuming the Skip-Gram model is trained and saved as 'skipgram_word_embeddings.model'
from gensim.models import Word2Vec

# Assuming your preprocessed tokens are in 'content_review_tokens'
skipgram_model = Word2Vec(sentences=content_review_tokens,
                          vector_size=100,
                          window=5,
                          min_count=1,
                          sg=1,
                          hs=1)  # Set hs=1 for hierarchical softmax (optional)
#skipgram_model.save('skipgram_word_embeddings.model')

In [10]:
# Function to calculate average word embedding for a review
def average_word_embedding(review, skipgram_model):
  num_words = 0  # Count of valid words in the review

  review_vec = np.zeros(skipgram_model.vector_size)  # Zero vector to accumulate word embeddings

  for token in review:
    try:
      # Get the word embedding vector from the model
      word_vec = skipgram_model.wv[token]
      review_vec += word_vec  # Add the word vector to the accumulator
      num_words += 1
    except KeyError:
      # Skip words not found in the vocabulary (might be rare or out-of-vocabulary words)
      pass

# If no valid words were found, return zero vector
  if num_words == 0:
    return review_vec
  else:
    # Calculate the average word embedding
    return review_vec / num_words

# Example usage: Calculate average embedding for a sample review
sample_review = content_review_tokens[0]  # Assuming content_review_tokens contains tokenized reviews
average_embedding = average_word_embedding(sample_review, skipgram_model)
print(average_embedding)
print(len(average_embedding))

[-2.90523779e-01  1.54650580e-01 -9.76506634e-02 -2.13377456e-01
  1.75063904e-02 -6.72445112e-01  1.95814374e-01  2.38144498e-01
 -2.89663270e-01 -7.90794063e-02 -3.14902101e-02 -1.88296980e-01
  5.81149017e-02  1.24337274e-04  1.01875778e-01 -1.90002713e-02
 -1.12879723e-03 -1.05905152e-01 -2.09001345e-01 -2.52717230e-01
  1.77169455e-01  1.01015895e-01  1.66407630e-01 -3.35626319e-01
 -2.00059181e-01  1.32989754e-01 -1.67204043e-02 -7.58111831e-02
  9.01567725e-02  7.05897044e-03  1.66275623e-01 -1.25502475e-02
 -3.82355637e-02  2.15169001e-01 -1.89317435e-01  1.81748097e-01
 -1.56431706e-01 -2.30430182e-01  2.34109683e-02 -1.65619089e-01
  6.48584090e-02  1.66447975e-02  4.80037928e-02 -1.54126184e-01
  1.60932942e-01 -8.93037766e-04 -1.87151933e-01 -4.33409186e-02
 -5.17969139e-02 -1.32722144e-01  2.47817545e-01 -1.28830378e-01
 -9.97908765e-02 -1.34840461e-01 -3.70233765e-01  5.68971429e-02
  1.60495307e-01  6.22854442e-02  8.78717776e-02  7.45332718e-02
  1.04439750e-01 -2.38184

In [11]:
avg_embedding_reviews = []
for i in range(0,len(content_review_tokens)):
  sample_review = content_review_tokens[i]
  average_embedding = average_word_embedding(sample_review, skipgram_model)
  avg_embedding_reviews.append(average_embedding)
print(len(avg_embedding_reviews))

21000


In [12]:
#df.columns

emotion_df = df[['VERIFIED_PURCHASE', 'OPI_FIN_POS', 'OPI_FIN_NEG', 'BL_POS', 'BL_NEG', 'AFINN_POS',
       'AFINN_NEG', 'S140_POS', 'S140_NEG', 'SWN_POS', 'SWN_NEG',
       'NRC_HASH_POS', 'NRC_HASH_NEG', 'EMOTICON_POS', 'EMOTICON_NEG',
       'NRC_ANGER', 'NRC_ANTICIPATION', 'NRC_DISGUST', 'NRC_FEAR', 'NRC_JOY',
       'NRC_SADNESS', 'NRC_SURPRISE', 'NRC_TRUST', 'NRC_EXP_ANGER',
       'NRC_EXP_ANTICIPATION', 'NRC_EXP_DISGUST', 'NRC_EXP_FEAR',
       'NRC_EXP_JOY', 'NRC_EXP_SADNESS', 'NRC_EXP_SURPRISE', 'NRC_EXP_TRUST']]

emotion_X = emotion_df.values.tolist()

In [13]:
X = []

for i in range(21000):
    X.append(list(X_ngrams_lsa[i]) + avg_embedding_reviews[i].tolist() + emotion_X[i])
    
X = np.array(X)
print(len(X))
print(len(X[0]))


21000
331


In [14]:

print(type(X))
print(X.shape)

<class 'numpy.ndarray'>
(21000, 331)


In [15]:
X_train = np.concatenate((X[:8400], X[10500:18900]))
X_test = np.concatenate((X[8400:10500], X[18900:]))
y_train = np.concatenate((df['TARGET'].values[:8400], df['TARGET'].values[10500:18900]))
y_test = np.concatenate((df['TARGET'].values[8400:10500], df['TARGET'].values[18900:]))

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
print(y_test)

(16800, 331) (4200, 331) (16800,) (4200,)
[0 0 0 ... 1 1 1]


In [22]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Input layer size based on features
input_layer_size = len(X[0])  # Adjust based on the actual number of features

# Hyperparameters
hidden_layer1_neurons = 100
hidden_layer2_neurons = 50
dropout_rate = 0.5  # Adjust as needed

# Define the DFFNN model
model = Sequential()

# Input layer
model.add(Dense(hidden_layer1_neurons, input_dim=input_layer_size, activation='relu'))
model.add(Dropout(0.2))

# First hidden layer
model.add(Dense(hidden_layer2_neurons, activation='relu'))
model.add(Dropout(dropout_rate))

# Second hidden layer
model.add(Dense(hidden_layer2_neurons, activation='relu'))
model.add(Dropout(dropout_rate))

# Output layer (binary classification with sigmoid activation)
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.1), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")


Epoch 1/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 922us/step - accuracy: 0.4993 - loss: 7.8507 - val_accuracy: 0.5000 - val_loss: 0.6941
Epoch 2/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 890us/step - accuracy: 0.4933 - loss: 1.2598 - val_accuracy: 0.5000 - val_loss: 0.6931
Epoch 3/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 802us/step - accuracy: 0.4945 - loss: 0.8641 - val_accuracy: 0.4998 - val_loss: 0.7158
Epoch 4/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 728us/step - accuracy: 0.4992 - loss: 0.6956 - val_accuracy: 0.5000 - val_loss: 0.7149
Epoch 5/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 788us/step - accuracy: 0.4971 - loss: 0.7485 - val_accuracy: 0.5000 - val_loss: 0.6937
Epoch 6/10
[1m525/525[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 712us/step - accuracy: 0.4941 - loss: 0.6958 - val_accuracy: 0.5000 - val_loss: 0.6945
Epoch 7/10
[1m5

In [24]:
y_pred = model.predict(X_test)
#y_pred = (int(y_pred>0.25)

for i in range(len(y_pred)):
    if y_pred[i] > 0.4:
        y_pred[i] = 1
    else:
        y_pred[i] = 0
        
        
print(y_pred)

[1m132/132[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 375us/step
[1.]


In [19]:

device = 'cuda' if torch.cuda.is_available() else 'cpu'