In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from textblob import TextBlob

# Load your dataset
df = pd.read_csv("../data/fake reviews dataset.csv")

# Step 1: Feature Engineering
df['text_length'] = df['text_'].apply(len)
df['sentiment_score'] = df['text_'].apply(lambda text: TextBlob(text).sentiment.polarity)
df['unique_word_ratio'] = df['text_'].apply(lambda x: len(set(x.split())) / len(x.split()))

# TF-IDF Vectorization for text features
vectorizer = TfidfVectorizer(max_features=100)
tfidf_features = vectorizer.fit_transform(df['text_']).toarray()
tfidf_df = pd.DataFrame(tfidf_features, columns=[f'tfidf_{i}' for i in range(tfidf_features.shape[1])])
df = pd.concat([df, tfidf_df], axis=1)

# Select features for clustering
features = df[['text_length', 'rating', 'sentiment_score', 'unique_word_ratio'] + list(tfidf_df.columns)]

# Step 2: Standardize the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Optional Step: Dimensionality Reduction with PCA
pca = PCA(n_components=5)
reduced_features = pca.fit_transform(scaled_features)
print('fit')
# Step 3: Agglomerative Clustering
# Configure the model to find 2 clusters (CG and OR)
agglomerative = AgglomerativeClustering(n_clusters=2)
clusters = agglomerative.fit_predict(reduced_features)

# Step 4: Map clusters to predominant labels
df['cluster'] = clusters
cluster_mapping = {}
for i in set(df['cluster']):
    label_counts = df[df['cluster'] == i]['label'].value_counts()
    predominant_label = label_counts.idxmax()
    cluster_mapping[i] = predominant_label

df['predicted_label'] = df['cluster'].map(cluster_mapping)

# Step 5: Confusion Matrix
conf_matrix = confusion_matrix(df['label'], df['predicted_label'], labels=['CG', 'OR'])

# Display the Confusion Matrix
print("Confusion Matrix:")
print(conf_matrix)


fit


In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from textblob import TextBlob

# Load your dataset
df = pd.read_csv("../data/fake reviews dataset.csv")

# Step 1: Feature Engineering
df['text_length'] = df['text_'].apply(len)
df['sentiment_score'] = df['text_'].apply(lambda text: TextBlob(text).sentiment.polarity)
df['unique_word_ratio'] = df['text_'].apply(lambda x: len(set(x.split())) / len(x.split()))

# TF-IDF Vectorization for text features
vectorizer = TfidfVectorizer(max_features=50)  # Reduced max features for efficiency
tfidf_features = vectorizer.fit_transform(df['text_']).toarray()
tfidf_df = pd.DataFrame(tfidf_features, columns=[f'tfidf_{i}' for i in range(tfidf_features.shape[1])])
df = pd.concat([df, tfidf_df], axis=1)

# Select features for clustering
features = df[['text_length', 'rating', 'sentiment_score', 'unique_word_ratio'] + list(tfidf_df.columns)]

# Step 2: Standardize the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Optional Step: Dimensionality Reduction with PCA
pca = PCA(n_components=10)  # Increased components to capture more variance
reduced_features = pca.fit_transform(scaled_features)

# Step 3: MiniBatchKMeans Clustering
minibatch_kmeans = MiniBatchKMeans(n_clusters=3, batch_size=1000, random_state=42)
clusters = minibatch_kmeans.fit_predict(reduced_features)

# Step 4: Map clusters to predominant labels
df['cluster'] = clusters
cluster_mapping = {}
for i in set(df['cluster']):
    label_counts = df[df['cluster'] == i]['label'].value_counts()
    predominant_label = label_counts.idxmax()
    cluster_mapping[i] = predominant_label

df['predicted_label'] = df['cluster'].map(cluster_mapping)

# Step 5: Confusion Matrix
conf_matrix = confusion_matrix(df['label'], df['predicted_label'], labels=['CG', 'OR'])

# Display the Confusion Matrix
print("Confusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[ 4903 15313]
 [ 3507 16709]]


In [5]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from textblob import TextBlob
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load your dataset
df = pd.read_csv("../data/fake reviews dataset.csv")

# Step 1: Feature Engineering
df['text_length'] = df['text_'].apply(len)
df['sentiment_score'] = df['text_'].apply(lambda text: TextBlob(text).sentiment.polarity)
df['unique_word_ratio'] = df['text_'].apply(lambda x: len(set(x.split())) / len(x.split()))

# Additional Features
df['average_word_length'] = df['text_'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()))
df['punctuation_density'] = df['text_'].apply(lambda x: sum(1 for char in x if char in string.punctuation) / len(x))
df['stopword_ratio'] = df['text_'].apply(lambda x: sum(1 for word in x.split() if word.lower() in stop_words) / len(x.split()))
df['uppercase_word_ratio'] = df['text_'].apply(lambda x: sum(1 for word in x.split() if word.isupper()) / len(x.split()))
df['sentence_count'] = df['text_'].apply(lambda x: len(x.split('.')))
df['average_sentence_length'] = df['text_'].apply(lambda x: len(x.split()) / (len(x.split('.')) + 1))  # +1 to avoid division by zero

# TF-IDF Vectorization for text features
vectorizer = TfidfVectorizer(max_features=50)  # Reduced max features for efficiency
tfidf_features = vectorizer.fit_transform(df['text_']).toarray()
tfidf_df = pd.DataFrame(tfidf_features, columns=[f'tfidf_{i}' for i in range(tfidf_features.shape[1])])
df = pd.concat([df, tfidf_df], axis=1)

# Select features for clustering
features = df[['text_length', 'rating', 'sentiment_score', 'unique_word_ratio', 'average_word_length',
               'punctuation_density', 'stopword_ratio', 'uppercase_word_ratio', 'sentence_count',
               'average_sentence_length'] + list(tfidf_df.columns)]

# Step 2: Standardize the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Optional Step: Dimensionality Reduction with PCA
pca = PCA(n_components=10)  # Increased components to capture more variance
reduced_features = pca.fit_transform(scaled_features)

# Step 3: MiniBatchKMeans Clustering
minibatch_kmeans = MiniBatchKMeans(n_clusters=3, batch_size=10000, random_state=42)
clusters = minibatch_kmeans.fit_predict(reduced_features)

# Step 4: Map clusters to predominant labels
df['cluster'] = clusters
cluster_mapping = {}
for i in set(df['cluster']):
    label_counts = df[df['cluster'] == i]['label'].value_counts()
    predominant_label = label_counts.idxmax()
    cluster_mapping[i] = predominant_label

df['predicted_label'] = df['cluster'].map(cluster_mapping)

# Step 5: Confusion Matrix
conf_matrix = confusion_matrix(df['label'], df['predicted_label'], labels=['CG', 'OR'])

# Display the Confusion Matrix
print("Confusion Matrix:")
print(conf_matrix)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/malfridurannaeiriksdottir/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Confusion Matrix:
[[ 6863 13353]
 [ 4417 15799]]


In [7]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import MiniBatchKMeans
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from textblob import TextBlob
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Assuming `df` is your DataFrame after clustering with the predicted labels
# Load your dataset
df = pd.read_csv("../data/fake reviews dataset.csv")

# Step 1: Feature Engineering (add the previous features here)
df['text_length'] = df['text_'].apply(len)
df['sentiment_score'] = df['text_'].apply(lambda text: TextBlob(text).sentiment.polarity)
df['unique_word_ratio'] = df['text_'].apply(lambda x: len(set(x.split())) / len(x.split()))
df['average_word_length'] = df['text_'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()))
df['punctuation_density'] = df['text_'].apply(lambda x: sum(1 for char in x if char in string.punctuation) / len(x))
df['stopword_ratio'] = df['text_'].apply(lambda x: sum(1 for word in x.split() if word.lower() in stop_words) / len(x.split()))
df['uppercase_word_ratio'] = df['text_'].apply(lambda x: sum(1 for word in x.split() if word.isupper()) / len(x.split()))
df['sentence_count'] = df['text_'].apply(lambda x: len(x.split('.')))
df['average_sentence_length'] = df['text_'].apply(lambda x: len(x.split()) / (len(x.split('.')) + 1))  # +1 to avoid division by zero

# TF-IDF Vectorization for text features
vectorizer = TfidfVectorizer(max_features=50)
tfidf_features = vectorizer.fit_transform(df['text_']).toarray()
tfidf_df = pd.DataFrame(tfidf_features, columns=[f'tfidf_{i}' for i in range(tfidf_features.shape[1])])
df = pd.concat([df, tfidf_df], axis=1)

# Select features for clustering
features = df[['text_length', 'rating', 'sentiment_score', 'unique_word_ratio', 'average_word_length',
               'punctuation_density', 'stopword_ratio', 'uppercase_word_ratio', 'sentence_count',
               'average_sentence_length'] + list(tfidf_df.columns)]

# Step 2: Standardize the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Optional Step: Dimensionality Reduction with PCA
pca = PCA(n_components=10)
reduced_features = pca.fit_transform(scaled_features)

# Step 3: MiniBatchKMeans Clustering
minibatch_kmeans = MiniBatchKMeans(n_clusters=3, batch_size=1000, random_state=42)
clusters = minibatch_kmeans.fit_predict(reduced_features)

# Map clusters to predominant labels
df['cluster'] = clusters
cluster_mapping = {}
for i in set(df['cluster']):
    label_counts = df[df['cluster'] == i]['label'].value_counts()
    predominant_label = label_counts.idxmax()
    cluster_mapping[i] = predominant_label

df['predicted_label'] = df['cluster'].map(cluster_mapping)

# Step 5: Confusion Matrix with Percentages
conf_matrix = confusion_matrix(df['label'], df['predicted_label'], labels=['CG', 'OR'])
conf_matrix_percent = (conf_matrix / conf_matrix.sum(axis=1, keepdims=True)) * 100

# Display the Confusion Matrix with Percentages
conf_matrix_df = pd.DataFrame(conf_matrix_percent, index=['Actual CG', 'Actual OR'], columns=['Predicted CG', 'Predicted OR'])
print("Confusion Matrix (Percentages):")
print(conf_matrix_df)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/malfridurannaeiriksdottir/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Confusion Matrix (Percentages):
           Predicted CG  Predicted OR
Actual CG     59.621092     40.378908
Actual OR     49.767511     50.232489


In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from textblob import TextBlob
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load your dataset
df = pd.read_csv("../data/fake reviews dataset.csv")

# Step 1: Feature Engineering (repeating previous features)
df['text_length'] = df['text_'].apply(len)
df['sentiment_score'] = df['text_'].apply(lambda text: TextBlob(text).sentiment.polarity)
df['unique_word_ratio'] = df['text_'].apply(lambda x: len(set(x.split())) / len(x.split()))
df['average_word_length'] = df['text_'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()))
df['punctuation_density'] = df['text_'].apply(lambda x: sum(1 for char in x if char in string.punctuation) / len(x))
df['stopword_ratio'] = df['text_'].apply(lambda x: sum(1 for word in x.split() if word.lower() in stop_words) / len(x.split()))
df['uppercase_word_ratio'] = df['text_'].apply(lambda x: sum(1 for word in x.split() if word.isupper()) / len(x.split()))
df['sentence_count'] = df['text_'].apply(lambda x: len(x.split('.')))
df['average_sentence_length'] = df['text_'].apply(lambda x: len(x.split()) / (len(x.split('.')) + 1))  # +1 to avoid division by zero

# TF-IDF Vectorization for text features
vectorizer = TfidfVectorizer(max_features=50)
tfidf_features = vectorizer.fit_transform(df['text_']).toarray()
tfidf_df = pd.DataFrame(tfidf_features, columns=[f'tfidf_{i}' for i in range(tfidf_features.shape[1])])
df = pd.concat([df, tfidf_df], axis=1)

# Select features for clustering
features = df[['text_length', 'rating', 'sentiment_score', 'unique_word_ratio', 'average_word_length',
               'punctuation_density', 'stopword_ratio', 'uppercase_word_ratio', 'sentence_count',
               'average_sentence_length'] + list(tfidf_df.columns)]

# Step 2: Standardize the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(features)

# Optional Step: Dimensionality Reduction with PCA
pca = PCA(n_components=10)
reduced_features = pca.fit_transform(scaled_features)

# Step 3: Gaussian Mixture Model Clustering
gmm = GaussianMixture(n_components=2, covariance_type='tied', random_state=42)
clusters = gmm.fit_predict(reduced_features)

# Step 4: Map clusters to predominant labels
df['cluster'] = clusters
cluster_mapping = {}
for i in set(df['cluster']):
    label_counts = df[df['cluster'] == i]['label'].value_counts()
    predominant_label = label_counts.idxmax()
    cluster_mapping[i] = predominant_label

df['predicted_label'] = df['cluster'].map(cluster_mapping)

# Step 5: Confusion Matrix and Percentages
conf_matrix = confusion_matrix(df['label'], df['predicted_label'], labels=['CG', 'OR'])
conf_matrix_percent = (conf_matrix / conf_matrix.sum(axis=1, keepdims=True)) * 100

# Display the Confusion Matrix with Percentages
conf_matrix_df = pd.DataFrame(conf_matrix_percent, index=['Actual CG', 'Actual OR'], columns=['Predicted CG', 'Predicted OR'])
print("Confusion Matrix (Percentages):")
print(conf_matrix_df)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/malfridurannaeiriksdottir/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Confusion Matrix (Percentages):
           Predicted CG  Predicted OR
Actual CG     83.325089     16.674911
Actual OR     79.387614     20.612386


In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from textblob import TextBlob
import string
import nltk
from sentence_transformers import SentenceTransformer

from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Load your dataset
df = pd.read_csv("../data/fake reviews dataset.csv")

# Step 1: Feature Engineering (same as before)
df['text_length'] = df['text_'].apply(len)
df['sentiment_score'] = df['text_'].apply(lambda text: TextBlob(text).sentiment.polarity)
df['unique_word_ratio'] = df['text_'].apply(lambda x: len(set(x.split())) / len(x.split()))
df['average_word_length'] = df['text_'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()))
df['punctuation_density'] = df['text_'].apply(lambda x: sum(1 for char in x if char in string.punctuation) / len(x))
df['stopword_ratio'] = df['text_'].apply(lambda x: sum(1 for word in x.split() if word.lower() in stop_words) / len(x.split()))
df['uppercase_word_ratio'] = df['text_'].apply(lambda x: sum(1 for word in x.split() if word.isupper()) / len(x.split()))
df['sentence_count'] = df['text_'].apply(lambda x: len(x.split('.')))
df['average_sentence_length'] = df['text_'].apply(lambda x: len(x.split()) / (len(x.split('.')) + 1))

# TF-IDF Vectorization for text features
vectorizer = TfidfVectorizer(max_features=50)
tfidf_features = vectorizer.fit_transform(df['text_']).toarray()
tfidf_df = pd.DataFrame(tfidf_features, columns=[f'tfidf_{i}' for i in range(tfidf_features.shape[1])])
df = pd.concat([df, tfidf_df], axis=1)

# Select features for clustering
features = df[['text_length', 'rating', 'sentiment_score', 'unique_word_ratio', 'average_word_length',
               'punctuation_density', 'stopword_ratio', 'uppercase_word_ratio', 'sentence_count',
               'average_sentence_length'] + list(tfidf_df.columns)]

# Step 2: Feature Selection with Random Forest
# We'll use a labeled subset for feature selection
subset_df = df[['label'] + features.columns.tolist()].dropna()
X_subset = subset_df[features.columns]
y_subset = subset_df['label']

# Train Random Forest on the subset
rf = RandomForestClassifier(random_state=42)
rf.fit(X_subset, y_subset)

# Select top features based on importance
feature_importances = pd.Series(rf.feature_importances_, index=features.columns)
top_features = feature_importances.nlargest(10).index.tolist()  # Select top 10 features

# Filter features for clustering
selected_features = df[top_features]

# Step 3: Standardize the selected features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(selected_features)

# Optional Step: Dimensionality Reduction with PCA (optional based on feature count)
pca = PCA(n_components=5)
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
bert_embeddings = model.encode(df['text_'].tolist())
reduced_features = PCA(n_components=10).fit_transform(bert_embeddings)
# Step 4: Gaussian Mixture Model Clustering on Selected Features
gmm = GaussianMixture(n_components=3, random_state=42)
clusters = gmm.fit_predict(reduced_features)

# Step 5: Map clusters to predominant labels
df['cluster'] = clusters
cluster_mapping = {}
for i in set(df['cluster']):
    label_counts = df[df['cluster'] == i]['label'].value_counts()
    predominant_label = label_counts.idxmax()
    cluster_mapping[i] = predominant_label

df['predicted_label'] = df['cluster'].map(cluster_mapping)

# Step 6: Confusion Matrix with Percentages
conf_matrix = confusion_matrix(df['label'], df['predicted_label'], labels=['CG', 'OR'])
conf_matrix_percent = (conf_matrix / conf_matrix.sum(axis=1, keepdims=True)) * 100

# Display the Confusion Matrix with Percentages
conf_matrix_df = pd.DataFrame(conf_matrix_percent, index=['Actual CG', 'Actual OR'], columns=['Predicted CG', 'Predicted OR'])
print("Confusion Matrix (Percentages):")
print(conf_matrix_df)

# Display Selected Top Features
print("Top Features Selected for Clustering:")
print(top_features)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/malfridurannaeiriksdottir/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


ConnectionError: (ProtocolError('Connection aborted.', TimeoutError(60, 'Operation timed out')), '(Request ID: fa1f8ca0-8047-4660-aff8-71cef28df962)')

In [6]:
pip install transformers==4.25.1 sentence-transformers==2.2.2


Collecting transformers==4.25.1
  Downloading transformers-4.25.1-py3-none-any.whl.metadata (93 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.9/93.9 kB[0m [31m384.2 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting sentence-transformers==2.2.2
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.25.1)
  Using cached tokenizers-0.13.3-cp39-cp39-macosx_12_0_arm64.whl.metadata (6.7 kB)
Downloading transformers-4.25.1-py3-none-any.whl (5.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.8/5.8 MB[0m [31m733.2 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hUsing cached tokenizers-0.13.3-cp39-cp39-macosx_12_0_arm64.whl (3.9 MB)
Building wheels for collected packages:

In [4]:
pip install -U transformers sentence-transformers


Collecting transformers
  Using cached transformers-4.46.2-py3-none-any.whl.metadata (44 kB)
Collecting tokenizers<0.21,>=0.20 (from transformers)
  Downloading tokenizers-0.20.3-cp39-cp39-macosx_11_0_arm64.whl.metadata (6.7 kB)
Using cached transformers-4.46.2-py3-none-any.whl (10.0 MB)
Downloading tokenizers-0.20.3-cp39-cp39-macosx_11_0_arm64.whl (2.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.6/2.6 MB[0m [31m438.9 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenizers
    Found existing installation: tokenizers 0.13.3
    Uninstalling tokenizers-0.13.3:
      Successfully uninstalled tokenizers-0.13.3
  Attempting uninstall: transformers
    Found existing installation: transformers 4.33.1
    Uninstalling transformers-4.33.1:
      Successfully uninstalled transformers-4.33.1
Successfully installed tokenizers-0.20.3 transformers-4.46.2
Note: you may need to restart t

In [2]:
pip install tensorflow==2.14.1 transformers==4.33.1


Collecting tensorflow==2.14.1
  Downloading tensorflow-2.14.1-cp39-cp39-macosx_12_0_arm64.whl.metadata (3.3 kB)
Collecting transformers==4.33.1
  Downloading transformers-4.33.1-py3-none-any.whl.metadata (119 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.9/119.9 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting tensorflow-macos==2.14.1 (from tensorflow==2.14.1)
  Downloading tensorflow_macos-2.14.1-cp39-cp39-macosx_12_0_arm64.whl.metadata (3.9 kB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers==4.33.1)
  Downloading tokenizers-0.13.3-cp39-cp39-macosx_12_0_arm64.whl.metadata (6.7 kB)
Collecting ml-dtypes==0.2.0 (from tensorflow-macos==2.14.1->tensorflow==2.14.1)
  Downloading ml_dtypes-0.2.0-cp39-cp39-macosx_10_9_universal2.whl.metadata (20 kB)
Collecting wrapt<1.15,>=1.11.0 (from tensorflow-macos==2.14.1->tensorflow==2.14.1)
  Downloading wrapt-1.14.1-cp39-cp39-macosx_11_0_arm64.whl.metadata (6.7 kB)
Collecting te

In [4]:
df = pd.read_csv("../data/fake reviews dataset.csv")


In [5]:
df.head()

Unnamed: 0,category,rating,label,text_
0,Home_and_Kitchen_5,5,CG,"Love this! Well made, sturdy, and very comfor..."
1,Home_and_Kitchen_5,5,CG,"love it, a great upgrade from the original. I..."
2,Home_and_Kitchen_5,5,CG,This pillow saved my back. I love the look and...
3,Home_and_Kitchen_5,1,CG,"Missing information on how to use it, but it i..."
4,Home_and_Kitchen_5,5,CG,Very nice set. Good quality. We have had the s...
