In [1]:
import sklearn
import pandas as pd
import numpy as np
import string
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
import re

In [2]:
df = pd.read_csv("crypto_currency_sentiment_dataset.csv")
print("Dataset Loaded!\n", df.head())

## SBert -- PCA -- XGB
def clean_text(text):
    text = re.sub(r"http\\S+|www\\S+", "", text)
    text = re.sub(r"@[A-Za-z0-9]+", "", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# stop_words = set(stopwords.words('english'))
# lemmatizer = WordNetLemmatizer()

# def clean_text(text):
#     text = text.lower()  # Lowercase
#     text = re.sub(r"http\\S+|www\\S+", "", text)  # Remove URLs
#     text = re.sub(r"[^a-zA-Z0-9\s]", "", text)  # Remove special characters
#     text = re.sub(r"\s+", " ", text).strip()
#     text = " ".join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])  # Lemmatize & remove stopwords
#     return text

df["Clean_Comment"] = df["Comment"].astype(str).apply(clean_text)

df["Label"] = df["Sentiment"].map({"Positive": 1, "Negative": 0})

print("Dataset Transformed!\n", df.head())

Dataset Loaded!
         user_id                                            Comment Sentiment  \
0  XYNN2Y4VCF3G  I bought 2200 at the ico, at 0.50$ per coin. H...  Positive   
1  DR6XNZMT9KRH  Harmony one , algorand , Cardano, solana , vec...  Positive   
2  9FCQGMYD4A42  Honestly, after reading this post and many of ...  Negative   
3  QEZAEMV2WF9D  In bear market is where money is made. I Will ...  Positive   
4  Z7J7W3XCP4XC  Funny how people think Bitcoin's risk is compa...  Negative   

                                          Reddit URL  
0  https://www.reddit.com/r/Avax/comments/uzggar/...  
1  https://www.reddit.com/r/CryptoCurrency/commen...  
2  https://www.reddit.com/r/CryptoCurrency/commen...  
3  https://www.reddit.com/r/CryptoCurrency/commen...  
4  https://www.reddit.com/r/investing/comments/um...  
Dataset Transformed!
         user_id                                            Comment Sentiment  \
0  XYNN2Y4VCF3G  I bought 2200 at the ico, at 0.50$ per coin. H...  Po

In [None]:
tfidf = TfidfVectorizer(max_features=10_000, ngram_range=(1, 4), sublinear_tf=True, stop_words='english')
X = tfidf.fit_transform(df["Clean_Comment"])
y = df["Label"]

print(X.shape)
print(y.shape)

pca = PCA(n_components=512, svd_solver='auto')
X = pca.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y, random_state=42)

print(X.shape)
print(y.shape)

In [None]:
import gensim.downloader as api

w2v_model = api.load("word2vec-google-news-300")

def comment_to_vec(comment, model, vector_size=300):
    words = comment.split()
    word_vectors = [model[word] for word in words if word in model]
    if len(word_vectors) > 0:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(vector_size)  # Fallback for empty vectors

df["Word2Vec_Feature"] = df["Clean_Comment"].apply(lambda x: comment_to_vec(x, w2v_model))

X = np.vstack(df["Word2Vec_Feature"].values)
y = df["Label"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [3]:
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import PCA

sbert_model = SentenceTransformer('./MiniLM_l6_v2')
X = np.array([sbert_model.encode(text) for text in df["Clean_Comment"]])
y = df["Label"].values

pca = PCA(n_components=32) 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y, random_state=42)

X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

print(sbert_model.encode("How are you").shape)

print(X.shape)
print(y.shape)

print(X_train.shape)
print(X_test.shape)

  from tqdm.autonotebook import tqdm, trange
  attn_output = torch.nn.functional.scaled_dot_product_attention(


(384,)
(562, 32)
(562,)


In [4]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

def train_and_evaluate(model, model_name):
    model.fit(X_train, y_train)
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
    print(f"{model_name} Cross-Validation Accuracy: {cv_scores.mean():.4f}")
    
    y_pred = model.predict(X_test)
    print(f"\n{model_name} Performance on Test Set:")
    print(classification_report(y_test, y_pred))
    
    acc_score = accuracy_score(y_test, y_pred)
    print(f"{model_name} Accuracy: {acc_score:.4f}\n")
    return model

In [7]:
# log_reg = LogisticRegression(max_iter=1000, C=1.0, class_weight='balanced')
log_reg = LogisticRegression(max_iter=1000)
train_and_evaluate(log_reg, "Logistic Regression")

Logistic Regression Cross-Validation Accuracy: 0.8238

Logistic Regression Performance on Test Set:
              precision    recall  f1-score   support

           0       0.92      0.85      0.88        26
           1       0.88      0.94      0.91        31

    accuracy                           0.89        57
   macro avg       0.90      0.89      0.89        57
weighted avg       0.90      0.89      0.89        57

Logistic Regression Accuracy: 0.8947



In [8]:
# xgb = XGBClassifier(n_estimators=300, learning_rate=0.01, max_depth=6, random_state=42)
# train_and_evaluate(xgb, "XGBoost")

from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

params = {
    "n_estimators": [100, 200, 300],
    "max_depth": [4, 6, 8],
    "learning_rate": [0.01, 0.05, 0.1, 0.2]
}

xgb = XGBClassifier()
grid = GridSearchCV(xgb, params, cv=3, scoring="accuracy", n_jobs=-1)
grid.fit(X_train, y_train)

best_xgb = grid.best_estimator_
train_and_evaluate(best_xgb, "Optimized XGBoost")

print("Best Hyperparameters:", grid.best_params_)
print("Best Cross-Validation Accuracy:", grid.best_score_)

Optimized XGBoost Cross-Validation Accuracy: 0.8000

Optimized XGBoost Performance on Test Set:
              precision    recall  f1-score   support

           0       1.00      0.85      0.92        26
           1       0.89      1.00      0.94        31

    accuracy                           0.93        57
   macro avg       0.94      0.92      0.93        57
weighted avg       0.94      0.93      0.93        57

Optimized XGBoost Accuracy: 0.9298

Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': 6, 'n_estimators': 300}
Best Cross-Validation Accuracy: 0.7940734479196019


In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

params = {
    "n_estimators": [100, 200, 300],
    "max_depth": [4, 6, 8],
    "learning_rate": [0.01, 0.05, 0.1, 0.2]
}

xgb = XGBClassifier()

random_search = RandomizedSearchCV(
    xgb, params, n_iter=30, cv=3, scoring="accuracy", n_jobs=-1
)
random_search.fit(X_train, y_train)

best_xgb = random_search.best_estimator_

train_and_evaluate(best_xgb, "Randomized Search Optimized XGBoost")

print("Best Hyperparameters:", random_search.best_params_)
print("Best Cross-Validation Accuracy:", random_search.best_score_)


In [5]:
xgb = XGBClassifier(learning_rate=0.1, max_depth=8, n_estimators=300)
train_and_evaluate(xgb, "XGBoost Classifier")

XGBoost Classifier Cross-Validation Accuracy: 0.7921

XGBoost Classifier Performance on Test Set:
              precision    recall  f1-score   support

           0       0.92      0.85      0.88        26
           1       0.88      0.94      0.91        31

    accuracy                           0.89        57
   macro avg       0.90      0.89      0.89        57
weighted avg       0.90      0.89      0.89        57

XGBoost Classifier Accuracy: 0.8947



In [17]:
xgb = XGBClassifier(learning_rate=0.2, max_depth=6, n_estimators=300)
xgb = train_and_evaluate(xgb, "XGBoost Classifier")

XGBoost Classifier Cross-Validation Accuracy: 0.8000

XGBoost Classifier Performance on Test Set:
              precision    recall  f1-score   support

           0       1.00      0.85      0.92        26
           1       0.89      1.00      0.94        31

    accuracy                           0.93        57
   macro avg       0.94      0.92      0.93        57
weighted avg       0.94      0.93      0.93        57

XGBoost Classifier Accuracy: 0.9298



In [10]:
import joblib

joblib.dump(xgb, "xgboost_model_9298.pkl")

['xgboost_model_9298.pkl']

In [14]:
from sklearn.metrics import precision_score, recall_score, f1_score

# Get predictions for the test set using the best XGBoost model
y_pred = best_xgb.predict(X_test)

# Calculate precision, recall, and F1-score explicitly
precision = precision_score(y_test, y_pred, average='weighted')  # Use 'macro' or 'weighted' for multi-class
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Precision: 0.9378
Recall: 0.9298
F1 Score: 0.9290


In [15]:
best_xgb.save_model("xgboost_model_9298.json")

In [None]:
import joblib

joblib.dump(xgb, "xgboost_model_9123.pkl")

In [None]:
rf = RandomForestClassifier(n_estimators=300, random_state=42)
train_and_evaluate(rf, "Random Forest")

In [6]:
from sklearn.metrics import precision_score, recall_score, f1_score
from xgboost import XGBClassifier

loaded_xgb = XGBClassifier()
loaded_xgb.load_model("xgboost_model_9298.json")

y_pred = loaded_xgb.predict(X_test)

precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Precision: 0.9316
Recall: 0.9298
F1 Score: 0.9295


In [None]:
#categ_env_TF_cpu  -- python 3.9.8

# xgboost_model_9298.json -- XGBClassifier(learning_rate=0.2, max_depth=6, n_estimators=300)