# **Computational Tools - Final Project**

In [2]:
import pandas as pd
data = pd.read_csv("../data/fake reviews dataset.csv")
data.head()

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5,CG,Very nice set. Good quality. We have had the s...


In [3]:
data.describe()

Unnamed: 0,rating
count,40432.0
mean,4.256579
std,1.144354
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


In [4]:
data.duplicated().sum()

12

In [5]:
data.drop_duplicates()

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5,CG,Very nice set. Good quality. We have had the s...
...,...,...,...,...
40427,Clothing_Shoes_and_Jewelry_5,4,OR,I had read some reviews saying that this bra r...
40428,Clothing_Shoes_and_Jewelry_5,5,CG,I wasn't sure exactly what it would be. It is ...
40429,Clothing_Shoes_and_Jewelry_5,2,OR,"You can wear the hood by itself, wear it with ..."
40430,Clothing_Shoes_and_Jewelry_5,1,CG,I liked nothing about this dress. The only rea...


In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score, silhouette_score
from sklearn.decomposition import PCA
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk

In [7]:
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/joninamatt/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/joninamatt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
data = pd.read_csv("../data/fake reviews dataset.csv")
texts = data['text_']
true_labels = data['label'].map({'CG': 0, 'OR': 1})  # Convert labels to binary


In [9]:
# Feature extraction
vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features as needed
X = vectorizer.fit_transform(data['text_'])

# Dimensionality reduction (optional)
pca = PCA(n_components=50)
X_reduced = pca.fit_transform(X.toarray())

# k-Means clustering
kmeans = KMeans(n_clusters=2, random_state=42)
clusters = kmeans.fit_predict(X_reduced)

# Evaluate clusters
accuracy = accuracy_score(true_labels, clusters)

print(f'Accuracy: {accuracy}')


Accuracy: 0.5193658488326078


# **Locality Sensitivity Hashing**

In [20]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from datasketch import MinHash, MinHashLSH
import json

# Load the preprocessed dataset
# Load the CSV
df = pd.read_csv('../data/processed_dataFINAL.csv')

# Convert JSON strings back to Python lists
df['processed_text'] = df['processed_text'].apply(json.loads)

# Ensure 'processed_text' is a string
df['processed_text'] = df['processed_text'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))

# Do label_binary based on label
df['label_binary'] = df['label'].apply(lambda x: 1 if x == 'CG' else 0)

# Vectorize text using TF-IDF
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_tfidf = tfidf_vectorizer.fit_transform(df['processed_text'])

# Create MinHash for each review
def create_minhash(text):
    m = MinHash(num_perm=128)
    for word in text.split():
        m.update(word.encode('utf8'))
    return m

df['minhash'] = df['processed_text'].apply(create_minhash)

# Create LSH index
lsh = MinHashLSH(threshold=0.5, num_perm=128)
for i, m in enumerate(df['minhash']):
    lsh.insert(str(i), m)

# Assign predicted label based on nearest neighbors in LSH index
def predict_label_using_lsh(index):
    similar_items = lsh.query(df['minhash'][index])
    
    # Count the number of 'CG' vs 'OR' labels in similar reviews
    similar_labels = df.iloc[[int(i) for i in similar_items]]['label_binary']
    predicted_label = similar_labels.mode()[0]  # Take the most common label in the similar items
    return predicted_label

# Apply LSH-based prediction to the entire dataset
df['predicted_label'] = df.index.map(predict_label_using_lsh)

# Evaluate performance
accuracy = accuracy_score(df['label_binary'], df['predicted_label'])
classification_rep = classification_report(df['label_binary'], df['predicted_label'])

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{classification_rep}")


Accuracy: 0.9510288880094975
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.94      0.95     20216
           1       0.94      0.96      0.95     20216

    accuracy                           0.95     40432
   macro avg       0.95      0.95      0.95     40432
weighted avg       0.95      0.95      0.95     40432

