# Computational Final Project

## Setup and Imports

In [54]:
# Import required libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import pairwise_distances_argmin_min
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from scipy.stats import mode

from sklearn.feature_extraction.text import TfidfVectorizer


## Loading and Preprocessing the Data

In [55]:
# Read the review dataset from the CSV file in the sibling data directory.
data = pd.read_csv(filepath_or_buffer="../data/fake reviews dataset.csv")

# Preview the first rows to check which columns were parsed.
data.head()

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5,CG,Very nice set. Good quality. We have had the s...


In [56]:
from textblob import TextBlob


def get_sentiment_score(text):
    """Return the TextBlob sentiment polarity of ``text``.

    Args:
        text: The review text to analyse.

    Returns:
        The ``polarity`` attribute of ``TextBlob(text).sentiment``.
    """
    blob = TextBlob(text)
    return blob.sentiment.polarity


# Character count of each review.
data['text_length'] = data['text_'].map(len)

# Sentiment polarity of each review.
data['sentiment_score'] = data['text_'].map(get_sentiment_score)


In [57]:
def _unique_word_ratio(text):
    """Return the share of whitespace-separated tokens in ``text`` that are distinct.

    Args:
        text: The review text to analyse.

    Returns:
        ``len(set(tokens)) / len(tokens)``, or ``0.0`` when ``text`` contains no
        tokens (empty or whitespace-only), where the plain ratio would divide
        by zero.
    """
    tokens = text.split()
    if not tokens:
        return 0.0
    return len(set(tokens)) / len(tokens)


# Lexical diversity of each review: distinct tokens divided by total tokens.
data['unique_word_ratio'] = data['text_'].apply(_unique_word_ratio)


In [58]:
# Fit a TF-IDF vocabulary capped at 100 terms and convert the sparse result
# to a dense array with one row per review.
vectorizer = TfidfVectorizer(max_features=100)
tfidf_features = vectorizer.fit_transform(data['text_']).toarray()

# Reuse data's index for the TF-IDF frame. The axis=1 concat below aligns rows by
# index label, so a fresh 0..n-1 index would misalign rows (and introduce NaNs)
# whenever `data` does not carry the default RangeIndex.
tfidf_df = pd.DataFrame(
    tfidf_features,
    index=data.index,
    columns=[f'tfidf_{i}' for i in range(tfidf_features.shape[1])],
)

# Combined frame: the original columns followed by the TF-IDF columns.
df = pd.concat([data, tfidf_df], axis=1)

In [59]:
# One-hot encode the category column; drop_first=True omits the first level.
# The guard keeps this cell re-runnable: after the first run the 'category'
# column no longer exists, and an unguarded second call would raise a KeyError.
# NOTE(review): `df` is built from `data` in the TF-IDF cell, before this encoding
# runs, and the feature set is selected from `df` — so as written these dummy
# columns never reach the model. Confirm whether that is intended.
if 'category' in data.columns:
    data = pd.get_dummies(data, columns=['category'], drop_first=True)


### Final Feature Set

In [60]:
# Build the modelling matrix: the four hand-built columns followed by every
# TF-IDF column.
features = df.loc[
    :,
    ['text_length', 'rating', 'sentiment_score', 'unique_word_ratio']
    + tfidf_df.columns.tolist(),
]



In [61]:
# Learn per-column scaling statistics from the feature matrix, then apply
# them to produce the standardized matrix used downstream.
scaler = StandardScaler()
scaler.fit(features)
features_scaled = scaler.transform(features)


In [62]:
# Project the standardized features onto their first five principal components.
# random_state is pinned because PCA can select a randomized SVD solver depending
# on the input shape; without a seed the components, and everything derived from
# them, may differ between runs.
pca = PCA(n_components=5, random_state=42)
reduced_features = pca.fit_transform(features_scaled)

In [63]:
# Split the reduced feature space into two clusters with a fixed seed, and
# keep the per-row cluster assignments.
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(reduced_features)
clusters = kmeans.labels_

In [64]:
# Attach each row's cluster assignment to the combined frame.
df['cluster'] = clusters

# Name every cluster after the most frequent ground-truth label among its members.
# Iterating over the cluster ids actually present keeps the mapping in step with
# the clustering's cluster count and never queries an id with no members, for
# which idxmax() on the empty counts would raise.
# NOTE(review): nothing here prevents two clusters from receiving the same label,
# in which case the other label is never predicted.
cluster_mapping = {}
for cluster_id in sorted(df['cluster'].unique()):
    label_counts = df.loc[df['cluster'] == cluster_id, 'label'].value_counts()
    predominant_label = label_counts.idxmax()
    # Plain int keys, independent of the integer dtype of the cluster array.
    cluster_mapping[int(cluster_id)] = predominant_label

df['predicted_label'] = df['cluster'].map(cluster_mapping)

# Confusion matrix: rows are true labels, columns are predicted labels,
# both in the order CG, OR.
conf_matrix = confusion_matrix(df['label'], df['predicted_label'], labels=['CG', 'OR'])

# Display results
print("Confusion Matrix:")
print(conf_matrix)

Confusion Matrix:
[[ 5082 15134]
 [ 4466 15750]]
