In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import pipeline
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import warnings
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


# Load Dataset

In [2]:
# Read the cleaned review dataset; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer="final_data/cleaned_dataset.csv")
# Show the first five rows as the cell output.
df.head(5)

Unnamed: 0,user_id,user_name,time,rating,review,gmap_id,business_name,latitude,longitude,business_desc,avg_rating,num_of_reviews,label
0,1.1e+20,Michelle Banks,2018-03-02 14:13:20,5.0,It's a beautiful place to read books and have ...,0x80c8bf81f68a634f:0xe605b4c3043783c9,Barnes & Noble,36.157754,-115.289418,"['Book store', 'Cafe', 'Childrens book store',...",4.6,1719,relevant
1,1.06e+20,Steven DeRyck [Staff],2018-10-20 01:46:40,4.0,"As previous reviews have stated, two small pie...",0x80c8c415f0a42c77:0x55c554fdc4ad8b9c,Carnegie Deli,36.120556,-115.173611,"['Deli', 'Takeout Restaurant', 'Sandwich shop']",4.1,706,relevant
2,1.1e+20,Stevey Markovich,2020-09-13 12:26:40,5.0,Absolutely love this office! Afton is truly am...,0x80c8ce0f7732ee7b:0xea13348742f64327,Center for Cosmetic and Family Dentistry,36.001929,-115.107484,['Dentist'],4.9,318,relevant
3,1.02e+20,William Campbell,2018-10-20 01:46:40,3.0,The food is as good as it usually is,0x80c8dc9da25847c7:0x27b862b824ac757c,Asian Garden,36.168901,-115.060601,"['Restaurant', 'Asian restaurant', 'Chinese re...",3.8,128,relevant
4,1.12e+20,Beverly Thorman,2018-03-02 14:13:20,5.0,We came in without an appointment on a Saturda...,0x80c8c03de37488fd:0xdc3302fd9f8f44a,Great Clips,36.191055,-115.258969,"['Hair salon', 'Beauty salon']",4.3,168,relevant


# Repetitiveness Score on `review` for each business

In [3]:
def safe_review_text(x):
    """Return ``x`` as a string, mapping missing values to an empty string.

    ``pd.isna`` decides what counts as missing; every other value is passed
    through ``str``.
    """
    return "" if pd.isna(x) else str(x)

def review_level_repetitiveness(reviews):
    """Score how similar each review is to the other reviews in the same list.

    Reviews are converted to TF-IDF vectors (English stop words removed) and
    compared pairwise with cosine similarity. A review's score is
    ``0.7 * mean + 0.3 * max`` of its similarities to every *other* review.

    Parameters
    ----------
    reviews : iterable
        Review texts. Missing values are replaced with "" by
        ``safe_review_text``.

    Returns
    -------
    list of float
        One score per input review, in input order. When there are fewer than
        two reviews, or when no review contributes a usable term, every score
        is 0.0.
    """
    texts = [safe_review_text(r) for r in reviews]
    n_reviews = len(texts)

    # Zero or one review: there is nothing to compare against.
    if n_reviews < 2:
        return [0.0] * n_reviews

    try:
        tfidf = TfidfVectorizer(stop_words="english").fit_transform(texts)
    except ValueError:
        # scikit-learn raises ValueError when the fitted vocabulary is empty
        # (e.g. all reviews are blank or contain only stop words). Such
        # reviews share no terms, so treat them as not repetitive.
        return [0.0] * n_reviews

    sim_matrix = cosine_similarity(tfidf)

    # Remove each row's diagonal entry (the review compared with itself) and
    # keep the remaining n-1 similarities per row, in their original order.
    off_diagonal = ~np.eye(n_reviews, dtype=bool)
    others = sim_matrix[off_diagonal].reshape(n_reviews, n_reviews - 1)

    scores = 0.7 * others.mean(axis=1) + 0.3 * others.max(axis=1)
    return scores.tolist()

def compute_repetitiveness(df):
    """Add a ``repetitiveness_score`` column computed per business.

    Rows are grouped by ``gmap_id``; each group's ``review`` values are scored
    together by ``review_level_repetitiveness`` and the scores are written
    back to the rows they came from.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``gmap_id`` and ``review`` columns. Any index is
        accepted (it does not need to be a default 0..n-1 range).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``repetitiveness_score`` added in place.
        Rows that never fall into a group (pandas' groupby drops missing keys
        by default) keep the initial score of 0.0.
    """
    scores = np.zeros(len(df))

    # Use positionally re-indexed views of the two columns so that each
    # group's index values are valid positions into `scores`. Indexing
    # `scores` with the caller's own index labels is only correct when the
    # frame happens to have a default RangeIndex.
    reviews = df["review"].reset_index(drop=True)
    business_ids = df["gmap_id"].reset_index(drop=True)

    for _, group in reviews.groupby(business_ids):
        scores[group.index.to_numpy()] = review_level_repetitiveness(group.tolist())

    df["repetitiveness_score"] = scores
    return df


# Attach the per-review repetitiveness score to the dataset.
df = compute_repetitiveness(df)

# Quick sanity check: first few scored reviews, then the score distribution.
score_preview = df[["review", "repetitiveness_score"]].head()
score_summary = df["repetitiveness_score"].describe()
print(score_preview)
print(score_summary)

                                              review  repetitiveness_score
0  It's a beautiful place to read books and have ...              0.063355
1  As previous reviews have stated, two small pie...              0.011177
2  Absolutely love this office! Afton is truly am...              0.000000
3               The food is as good as it usually is              0.000000
4  We came in without an appointment on a Saturda...              0.000000
count    19490.000000
mean         0.113701
std          0.172934
min          0.000000
25%          0.000000
50%          0.054619
75%          0.156233
max          0.955507
Name: repetitiveness_score, dtype: float64


# Sentiment Analysis on `review`

In [4]:
# Keep only rows that have review text, renumbering the index from 0.
has_review = df['review'].notna()
df = df.loc[has_review].reset_index(drop=True)

# Initialize sentiment analysis pipeline
classifier = pipeline(task="sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english")

# Function to get label and score
def get_sentiment_with_score(text):
    """Classify one review and return its predicted label and confidence.

    Parameters
    ----------
    text : str
        Review text passed to the module-level ``classifier`` pipeline.

    Returns
    -------
    pandas.Series
        Two positional entries: the predicted label string and its score.
    """
    # Truncate at the tokenizer level rather than slicing characters: a
    # 512-character slice throws away most of a long review and still does
    # not guarantee the input fits the model's 512-token window.
    result = classifier(text, truncation=True, max_length=512)[0]
    return pd.Series([result['label'], result['score']])

# Classify every review, then store the label and its score side by side.
sentiment_columns = df['review'].apply(get_sentiment_with_score)
df[['sentiment', 'sentiment_score']] = sentiment_columns

# Assign human-readable labels with Neutral class
def label_with_neutral(row, threshold_low=0.4, threshold_high=0.6):
    """Map a classifier (label, score) pair to Positive / Negative / Neutral.

    The score is interpreted as the confidence of the predicted label and is
    converted to a probability of the review being positive, so that both
    thresholds are applied on the same scale.

    Parameters
    ----------
    row : mapping
        Must provide ``row['sentiment']`` ("POSITIVE" or another label, which
        is treated as negative) and ``row['sentiment_score']``.
    threshold_low : float, default 0.4
        Positive-probability at or below which the result is "Negative".
    threshold_high : float, default 0.6
        Positive-probability at or above which the result is "Positive".

    Returns
    -------
    str
        "Positive", "Negative", or "Neutral" (anything between the thresholds).
    """
    confidence = row['sentiment_score']

    # A non-POSITIVE prediction with confidence c means P(positive) = 1 - c.
    # Comparing the raw confidence against threshold_low mislabels
    # low-confidence predictions (e.g. NEGATIVE at 0.2 is really 80% positive).
    if row['sentiment'] == "POSITIVE":
        positive_prob = confidence
    else:
        positive_prob = 1.0 - confidence

    if positive_prob >= threshold_high:
        return "Positive"
    if positive_prob <= threshold_low:
        return "Negative"
    return "Neutral"

# Derive the three-way label row by row.
df['sentiment_label'] = df.apply(label_with_neutral, axis="columns")

# Optional: check distribution
sentiment_label_counts = df['sentiment_label'].value_counts()
print(sentiment_label_counts)

Device set to use mps:0


sentiment_label
Positive    11532
Negative     7640
Neutral       318
Name: count, dtype: int64


# `Sentiment Analysis` against `rating`

In [5]:
def check_mismatch(row):
    """Flag rows whose star rating contradicts the review's sentiment label.

    Returns "Suspicious" when the rating is 4 or higher and the label is
    "Negative", or when the rating is 2 or lower and the label is "Positive".
    Every other combination returns "Legit".
    """
    stars = row['rating']

    # The label is only looked up when the rating sits at one of the extremes.
    if stars >= 4:
        contradicts = row['sentiment_label'] == "Negative"
    elif stars <= 2:
        contradicts = row['sentiment_label'] == "Positive"
    else:
        contradicts = False

    return "Suspicious" if contradicts else "Legit"

# Evaluate the rating-vs-sentiment check on every row.
df['suspicion'] = df.apply(check_mismatch, axis="columns")

In [6]:
# Tally of Legit vs. Suspicious rows.
suspicion_counts = df['suspicion'].value_counts()
print(suspicion_counts)

suspicion
Legit         15403
Suspicious     4087
Name: count, dtype: int64


# Length of `review`

In [7]:
# Word count per review: number of whitespace-separated tokens in the stringified text.
review_as_text = df['review'].astype(str)
df['review_length_words'] = review_as_text.map(lambda text: len(text.split()))
df[['review', 'review_length_words']].head()

Unnamed: 0,review,review_length_words
0,It's a beautiful place to read books and have ...,17
1,"As previous reviews have stated, two small pie...",39
2,Absolutely love this office! Afton is truly am...,16
3,The food is as good as it usually is,9
4,We came in without an appointment on a Saturda...,48


# Embedding of `review`

In [8]:
# Load SBERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Keep only rows with review text so the embeddings line up one-to-one with
# the renumbered rows.
df = df.loc[df['review'].notna()].reset_index(drop=True)

# Generate embeddings
review_texts = df['review'].astype(str).tolist()
embeddings = model.encode(review_texts, batch_size=32, show_progress_bar=True)

# Store one embedding per row in a new column.
df['review_embedding'] = list(embeddings)

Batches:   0%|          | 0/610 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Batches: 100%|██████████| 610/610 [00:54<00:00, 11.17it/s]


# Embedding of `business_desc`

In [9]:
import ast


def _business_desc_to_text(raw_desc):
    """Turn one ``business_desc`` cell into a single space-joined string.

    String cells are parsed with ``ast.literal_eval`` and their items joined
    with spaces; any non-string cell becomes an empty string.
    """
    categories = ast.literal_eval(raw_desc) if isinstance(raw_desc, str) else []
    return " ".join(categories)


# One description string per row, kept in a local Series so no temporary
# columns have to be added to (and later dropped from) the DataFrame.
business_desc_texts = df['business_desc'].apply(_business_desc_to_text)

# Generate embeddings
desc_embeddings = model.encode(business_desc_texts.tolist(), batch_size=32, show_progress_bar=True)

# Append embeddings as a new column
df['business_desc_embedding'] = list(desc_embeddings)

Batches: 100%|██████████| 610/610 [00:13<00:00, 46.85it/s]


# `Name Validity` boolean column

In [10]:
import spacy

# The small English model must be installed once beforehand:
#   python -m spacy download en_core_web_sm
SPACY_MODEL_NAME = "en_core_web_sm"

# English NLP pipeline; its named-entity output is what the name check reads.
nlp = spacy.load(SPACY_MODEL_NAME)

def get_validity_of_name(name):
    """
    Return True when spaCy finds at least one PERSON entity in ``name``.

    Non-string values (e.g. NaN) and empty or whitespace-only strings return
    False without running the NLP pipeline.
    """
    is_usable_text = isinstance(name, str) and bool(name.strip())
    if not is_usable_text:
        return False

    # Run the module-level spaCy pipeline and inspect the entities it found.
    entities = nlp(str(name)).ents
    return any(ent.label_ == "PERSON" for ent in entities)


# Flag, for every row, whether the reviewer's display name contains a PERSON entity.
df["name_validity"] = df["user_name"].map(get_validity_of_name)
df.head()

Unnamed: 0,user_id,user_name,time,rating,review,gmap_id,business_name,latitude,longitude,business_desc,...,label,repetitiveness_score,sentiment,sentiment_score,sentiment_label,suspicion,review_length_words,review_embedding,business_desc_embedding,name_validity
0,1.1e+20,Michelle Banks,2018-03-02 14:13:20,5.0,It's a beautiful place to read books and have ...,0x80c8bf81f68a634f:0xe605b4c3043783c9,Barnes & Noble,36.157754,-115.289418,"['Book store', 'Cafe', 'Childrens book store',...",...,relevant,0.063355,POSITIVE,0.999885,Positive,Legit,17,"[0.019050553, -0.007937221, -0.021419825, 0.06...","[-0.001835555, 0.00961866, 0.028814249, 0.0438...",True
1,1.06e+20,Steven DeRyck [Staff],2018-10-20 01:46:40,4.0,"As previous reviews have stated, two small pie...",0x80c8c415f0a42c77:0x55c554fdc4ad8b9c,Carnegie Deli,36.120556,-115.173611,"['Deli', 'Takeout Restaurant', 'Sandwich shop']",...,relevant,0.011177,POSITIVE,0.997185,Positive,Legit,39,"[-0.03985757, 0.0456756, 0.03532694, 0.0130935...","[-0.04080368, 0.026135705, 0.031749316, -0.081...",True
2,1.1e+20,Stevey Markovich,2020-09-13 12:26:40,5.0,Absolutely love this office! Afton is truly am...,0x80c8ce0f7732ee7b:0xea13348742f64327,Center for Cosmetic and Family Dentistry,36.001929,-115.107484,['Dentist'],...,relevant,0.0,POSITIVE,0.999889,Positive,Legit,16,"[-0.04052647, -0.030089611, -0.009051651, -0.0...","[-0.08501966, 0.02050874, -0.044369426, -0.003...",True
3,1.02e+20,William Campbell,2018-10-20 01:46:40,3.0,The food is as good as it usually is,0x80c8dc9da25847c7:0x27b862b824ac757c,Asian Garden,36.168901,-115.060601,"['Restaurant', 'Asian restaurant', 'Chinese re...",...,relevant,0.0,POSITIVE,0.999851,Positive,Legit,9,"[0.013494, -0.01855295, 0.03622511, 0.10009426...","[-0.023859808, 0.056261025, 0.040953994, 0.040...",True
4,1.12e+20,Beverly Thorman,2018-03-02 14:13:20,5.0,We came in without an appointment on a Saturda...,0x80c8c03de37488fd:0xdc3302fd9f8f44a,Great Clips,36.191055,-115.258969,"['Hair salon', 'Beauty salon']",...,relevant,0.0,POSITIVE,0.999873,Positive,Legit,48,"[-0.0027537446, 0.010088735, 0.04393, 0.019915...","[0.022430552, -0.01589779, 0.012581493, 0.0249...",True


# Save Dataset

In [11]:
# Write the feature-engineered dataset without the index, UTF-8 with BOM.
# NOTE(review): the two embedding columns hold one array object per row, and
# CSV stores those as their string representation — confirm downstream
# readers can parse that form, or consider a typed format for embeddings.
df.to_csv(
    "final_data/feature_engineered_dataset_reviews.csv",
    encoding='utf-8-sig',
    index=False,
)