In [1]:
import sklearn
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import nltk
import re
from sklearn.cluster import KMeans

In [24]:
df = pd.read_csv("crypto_currency_sentiment_dataset.csv")
print("Dataset Loaded!\n", df.head())

## SBert -- PCA -- XGB
def clean_text(text):
    text = re.sub(r"http\\S+|www\\S+", "", text)
    text = re.sub(r"@[A-Za-z0-9]+", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

df["Clean_Comment"] = df["Comment"].astype(str).apply(clean_text)

df["Label"] = df["Sentiment"].map({"Positive": 1, "Negative": 0})

print("Dataset Transformed!\n", df.head())

Dataset Loaded!
         user_id                                            Comment Sentiment  \
0  XYNN2Y4VCF3G  I bought 2200 at the ico, at 0.50$ per coin. H...  Positive   
1  DR6XNZMT9KRH  Harmony one , algorand , Cardano, solana , vec...  Positive   
2  9FCQGMYD4A42  Honestly, after reading this post and many of ...  Negative   
3  QEZAEMV2WF9D  In bear market is where money is made. I Will ...  Positive   
4  Z7J7W3XCP4XC  Funny how people think Bitcoin's risk is compa...  Negative   

                                          Reddit URL  
0  https://www.reddit.com/r/Avax/comments/uzggar/...  
1  https://www.reddit.com/r/CryptoCurrency/commen...  
2  https://www.reddit.com/r/CryptoCurrency/commen...  
3  https://www.reddit.com/r/CryptoCurrency/commen...  
4  https://www.reddit.com/r/investing/comments/um...  
Dataset Transformed!
         user_id                                            Comment Sentiment  \
0  XYNN2Y4VCF3G  I bought 2200 at the ico, at 0.50$ per coin. H...  Po

In [32]:
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA
from sklearn.cluster import DBSCAN

sbert_model = SentenceTransformer('./MiniLM_l6_v2')
X = np.array([sbert_model.encode(text) for text in df["Clean_Comment"]])
y = df["Label"].values

pca = PCA(n_components=256)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y, random_state=42)

X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

print(sbert_model.encode("How are you").shape)

print(X.shape)
print(y.shape)

print(X_train.shape)
print(X_test.shape)

(384,)
(562, 384)
(562,)
(505, 256)
(57, 256)


In [33]:
algo = KMeans(n_clusters = 2)

algo.fit(X_train)

y_pred = algo.predict(X_test)

In [34]:
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.6316
Precision: 0.6306
Recall: 0.6316
F1 Score: 0.6309
