In [1]:
# Data Manipulation
import pandas as pd
import numpy as np

# Text Processing and NLP
import re
import string
import emoji
import contractions
from bs4 import BeautifulSoup
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
from sentence_transformers import SentenceTransformer, util

# Machine Learning
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.semi_supervised import LabelPropagation
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.metrics import classification_report, accuracy_score, silhouette_score

# Visualization
import matplotlib.pyplot as plt

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger_eng')

  from .autonotebook import tqdm as notebook_tqdm





[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ngmin\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ngmin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ngmin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ngmin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\ngmin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [2]:
resources = ['punkt', 'stopwords', 'wordnet', 'omw-1.4', 'averaged_perceptron_tagger_eng']
for res in resources:
    try:
        nltk.data.find(f'tokenizers/{res}')
        print(f"{res} is already downloaded.")
    except LookupError:
        print(f"{res} is missing! Downloading now...")
        nltk.download(res)


punkt is already downloaded.
stopwords is missing! Downloading now...
wordnet is missing! Downloading now...
omw-1.4 is missing! Downloading now...
averaged_perceptron_tagger_eng is missing! Downloading now...


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ngmin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ngmin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\ngmin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\ngmin\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


# Load Data

In [3]:
metadata = pd.read_csv('00_dataset/YelpZip/metadata', sep="\t", header=None, names=["user_id", "prod_id", "rating", "label", "date"])
productIdMapping = pd.read_csv('00_dataset/YelpZip/productIdMapping', sep="\t", header=None)
reviewContent = pd.read_csv('00_dataset/YelpZip/reviewContent', sep="\t", header=None, names=["user_id", "prod_id", "date", "review_text"])
reviewGraph = pd.read_csv('00_dataset/YelpZip/reviewGraph', sep="\t", header=None, names=["user_id", "prod_id", "rating"])
userIdMapping = pd.read_csv('00_dataset/YelpZip/userIdMapping', sep="\t", header=None)

## Data Preview

In [4]:
reviewContent.head()

Unnamed: 0,user_id,prod_id,date,review_text
0,5044,0,2014-11-16,"Drinks were bad, the hot chocolate was watered..."
1,5045,0,2014-09-08,This was the worst experience I've ever had a ...
2,5046,0,2013-10-06,This is located on the site of the old Spruce ...
3,5047,0,2014-11-30,I enjoyed coffee and breakfast twice at Toast ...
4,5048,0,2014-08-28,I love Toast! The food choices are fantastic -...


In [5]:
reviewContent.tail()

Unnamed: 0,user_id,prod_id,date,review_text
608453,119664,5039,2013-01-20,When I first moved to the area I must say I wa...
608454,56277,5039,2012-11-12,Kind of pricey. I guess I expected a ridiculou...
608455,265320,5039,2012-08-22,"Stopped by this restaurant yesterday, we just ..."
608456,161722,5039,2011-05-11,Finally checked out The Best Subs in Claremont...
608457,78454,5039,2010-07-17,"Just got me some ""Best Subs"" and I gotta say, ..."


In [6]:
metadata.head()

Unnamed: 0,user_id,prod_id,rating,label,date
0,5044,0,1.0,-1,2014-11-16
1,5045,0,1.0,-1,2014-09-08
2,5046,0,3.0,-1,2013-10-06
3,5047,0,5.0,-1,2014-11-30
4,5048,0,5.0,-1,2014-08-28


In [7]:
metadata.tail()

Unnamed: 0,user_id,prod_id,rating,label,date
608593,119664,5039,4.0,1,2013-01-20
608594,56277,5039,2.0,1,2012-11-12
608595,265320,5039,1.0,1,2012-08-22
608596,161722,5039,4.0,1,2011-05-11
608597,78454,5039,4.0,1,2010-07-17


In [8]:
# Check the unique values in the last column (label column)
label_counts = metadata["label"].value_counts()

# Calculate the percentage of each label
label_percentages = (label_counts / label_counts.sum()) * 100

# Display results
label_counts, label_percentages

(label
  1    528132
 -1     80466
 Name: count, dtype: int64,
 label
  1    86.778465
 -1    13.221535
 Name: count, dtype: float64)

In [9]:
reviewGraph.head()

Unnamed: 0,user_id,prod_id,rating
0,5044,0,1.0
1,5045,0,1.0
2,5046,0,3.0
3,5047,0,5.0
4,5048,0,5.0


In [10]:
reviewGraph.tail()

Unnamed: 0,user_id,prod_id,rating
608593,119664,5039,4.0
608594,56277,5039,2.0
608595,265320,5039,1.0
608596,161722,5039,4.0
608597,78454,5039,4.0


### Check Null Values

In [11]:
reviewContent.isnull().sum()

user_id        0
prod_id        0
date           0
review_text    0
dtype: int64

In [12]:
metadata.isnull().sum()

user_id    0
prod_id    0
rating     0
label      0
date       0
dtype: int64

In [13]:
reviewGraph.isnull().sum()

user_id    0
prod_id    0
rating     0
dtype: int64

### Check Data Types

In [14]:
# Count unique data types in the column
print(reviewContent["review_text"].apply(type).value_counts())

review_text
<class 'str'>    608458
Name: count, dtype: int64


In [15]:
reviewContent.dtypes

user_id         int64
prod_id         int64
date           object
review_text    object
dtype: object

In [16]:
metadata.dtypes

user_id      int64
prod_id      int64
rating     float64
label        int64
date        object
dtype: object

In [17]:
reviewGraph.dtypes

user_id      int64
prod_id      int64
rating     float64
dtype: object

### Drop columns

In [18]:
# Convert review_text to string
reviewContent["review_text"] = reviewContent["review_text"].astype(str)

# Drop the date column
reviewContent.drop(columns=["date"], inplace=True)

In [19]:
metadata.drop(columns=["date"], inplace=True)

## Merge reviewContent with metadata

In [20]:
print("reviewContent columns:", reviewContent.columns)
print("metadata columns:", metadata.columns)

reviewContent columns: Index(['user_id', 'prod_id', 'review_text'], dtype='object')
metadata columns: Index(['user_id', 'prod_id', 'rating', 'label'], dtype='object')


In [21]:
df = reviewContent.merge(metadata, on=["user_id", "prod_id"], how="inner")

In [22]:
df.head()

Unnamed: 0,user_id,prod_id,review_text,rating,label
0,5044,0,"Drinks were bad, the hot chocolate was watered...",1.0,-1
1,5045,0,This was the worst experience I've ever had a ...,1.0,-1
2,5046,0,This is located on the site of the old Spruce ...,3.0,-1
3,5047,0,I enjoyed coffee and breakfast twice at Toast ...,5.0,-1
4,5048,0,I love Toast! The food choices are fantastic -...,5.0,-1


#### Convert Spam Labels for ML

In [23]:
df["label"] = df["label"].replace({1: 0, -1: 1})

In [24]:
df.head()

Unnamed: 0,user_id,prod_id,review_text,rating,label
0,5044,0,"Drinks were bad, the hot chocolate was watered...",1.0,1
1,5045,0,This was the worst experience I've ever had a ...,1.0,1
2,5046,0,This is located on the site of the old Spruce ...,3.0,1
3,5047,0,I enjoyed coffee and breakfast twice at Toast ...,5.0,1
4,5048,0,I love Toast! The food choices are fantastic -...,5.0,1


In [25]:
df.isnull().sum()

user_id        0
prod_id        0
review_text    0
rating         0
label          0
dtype: int64

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 608458 entries, 0 to 608457
Data columns (total 5 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   user_id      608458 non-null  int64  
 1   prod_id      608458 non-null  int64  
 2   review_text  608458 non-null  object 
 3   rating       608458 non-null  float64
 4   label        608458 non-null  int64  
dtypes: float64(1), int64(3), object(1)
memory usage: 23.2+ MB


In [27]:
df_sw = df.copy()

# Text Preprocessing
[Text Preprocessing](https://pub.aimind.so/a-comprehensive-guide-to-text-preprocessing-for-twitter-data-getting-ready-for-sentiment-analysis-e7f91cd03671)

In the context of spam review detection, it's crucial to retain certain textual elements during preprocessing to preserve features that can aid in distinguishing between genuine and spam content. Based on your current clean_text function, here are some recommendations:

1. Retain Punctuation and Capitalization: As previously discussed, punctuation marks (e.g., exclamation points) and capitalization can be indicative of spammy content. Therefore, avoid removing them during preprocessing.

2. Preserve Numerical Data: Numbers can provide context, such as prices or dates, which might be relevant in identifying spam. Consider not removing numerical data unless you have a specific reason to do so.

3. Handle Contractions Appropriately: Expanding contractions (e.g., "don't" to "do not") can be beneficial, but ensure that the library or method you use handles them accurately. Misinterpretation can introduce noise into your data.

4. Stop Words Consideration: While removing stop words is common, some stop words might carry sentiment or context important for spam detection. Evaluate whether removing all stop words is beneficial for your specific case.

5. Tokenization and Lemmatization: Your current approach to tokenization and lemmatization is sound. Ensure that the lemmatization process retains the original casing if capitalization is deemed important.

In [28]:
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

# POS tag mapping dictionary
wordnet_map = {"N": wordnet.NOUN, "V": wordnet.VERB, "J": wordnet.ADJ, "R": wordnet.ADV}

# Function to perform Lemmatization on a text
def lemmatize_text(tokens):
  pos_tags = nltk.pos_tag(tokens) # Get the POS tags for the words

  lemmatized_words = []

  # Perform Lemmatization
  for word, tag in pos_tags:
    pos = wordnet_map.get(tag[0].upper(), wordnet.NOUN) # Map the POS tag to WordNet POS tag
    lemmatized_word = lemmatizer.lemmatize(word, pos=pos) # Lemmatize the word with the appropriate POS tag
    lemmatized_words.append(lemmatized_word) # Add the lemmatized word to the list
  return lemmatized_words

# with stop words
def clean_text_sw(text):
  text = re.sub(r"http\S+|www.\S+", "", text) # Removal of URLs tags
  text = BeautifulSoup(text, "lxml").text # Removal HTML tags
  text = contractions.fix(text) # Removing contractions

  text = re.sub(r'\s+', ' ', text).strip() # Removal of extra spaces

  tokens = word_tokenize(text) # Tokenization
  tokens = lemmatize_text(tokens) # Apply lemmatization

  return " ".join(tokens)

# without stop words
def clean_text(text):
  text = re.sub(r"http\S+|www.\S+", "", text) # Removal of URLs tags
  text = BeautifulSoup(text, "lxml").text # Removal HTML tags
  text = contractions.fix(text) # Removing contractions

  text = re.sub(r'\s+', ' ', text).strip() # Removal of extra spaces

  tokens = word_tokenize(text) # Tokenization
  tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
  tokens = lemmatize_text(tokens) # Apply lemmatization

  return " ".join(tokens)

In [29]:
df_sw['cleaned_text_sw'] = df_sw['review_text'].apply(clean_text_sw)
df['cleaned_text'] = df['review_text'].apply(clean_text)

  text = BeautifulSoup(text, "lxml").text # Removal HTML tags
  text = BeautifulSoup(text, "lxml").text # Removal HTML tags


In [30]:
df.head()

Unnamed: 0,user_id,prod_id,review_text,rating,label,cleaned_text
0,5044,0,"Drinks were bad, the hot chocolate was watered...",1.0,1,"Drinks bad , hot chocolate water latte burnt t..."
1,5045,0,This was the worst experience I've ever had a ...,1.0,1,This bad experience I ever casual coffee/light...
2,5046,0,This is located on the site of the old Spruce ...,3.0,1,This locate site old Spruce St. Video . The mi...
3,5047,0,I enjoyed coffee and breakfast twice at Toast ...,5.0,1,I enjoy coffee breakfast twice Toast recent vi...
4,5048,0,I love Toast! The food choices are fantastic -...,5.0,1,I love Toast ! The food choice fantastic - I l...


In [31]:
df_sw.head()

Unnamed: 0,user_id,prod_id,review_text,rating,label,cleaned_text_sw
0,5044,0,"Drinks were bad, the hot chocolate was watered...",1.0,1,"Drinks be bad , the hot chocolate be water dow..."
1,5045,0,This was the worst experience I've ever had a ...,1.0,1,This be the bad experience I have ever have a ...
2,5046,0,This is located on the site of the old Spruce ...,3.0,1,This be locate on the site of the old Spruce S...
3,5047,0,I enjoyed coffee and breakfast twice at Toast ...,5.0,1,I enjoy coffee and breakfast twice at Toast du...
4,5048,0,I love Toast! The food choices are fantastic -...,5.0,1,I love Toast ! The food choice be fantastic - ...


In [37]:
# Save to a CSV
df_sw.to_csv("00_dataset/with_stopwords/cleaned_reviews_sw.csv", index=False)
df.to_csv("00_dataset/without_stopwords/cleaned_reviews.csv", index=False)