### test

In [None]:
import numpy as np                                          #For statistics, array creation and manipulation
import pandas as pd                                         #For data loading and data manipulation
import seaborn as sns                                       #For data visualization
from wordcloud import WordCloud                             #For visualizing the summarized textual data
import matplotlib.pyplot as plt                             #For data visualization - %matplotlib below is a magic command to render matplotlib in jupyter
%matplotlib inline

import nltk                                                 #For natural language processing
from nltk.corpus import stopwords                           #For stopwords processing
from nltk.tokenize import TreebankWordTokenizer             #For splitting sentences into words
from nltk import SnowballStemmer, PorterStemmer
from nltk.stem import WordNetLemmatizer                     #For reducing words to its base form
from sklearn.feature_extraction.text import CountVectorizer #For mapping/converting words into real numbers
from sklearn.model_selection import train_test_split        #For splitting the dataset into train and test
from textblob import TextBlob

from sklearn.metrics import classification_report, f1_score, accuracy_score #For evaluating model preformance
import string
import re                                               #For data cleaning to remove/replace unwanted patterns
import pickle

# Download NLTK data (run this once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')

pd.set_option('max_colwidth', 800)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
df = pd.read_csv("/content/articlesData.csv")

In [None]:
df.shape

(26751, 12)

In [None]:
df.columns

Index(['Unnamed: 0', '_id', 'id', 'Domain', 'Source', 'Title', 'Date', 'Slug',
       'Currencies', 'URL', 'Created_at', 'Language'],
      dtype='object')

In [None]:
## Keep only the English-language headlines, capped at the first 5000 rows.
df_n = df.loc[df['Language'].eq('en')].iloc[:5000]

In [None]:
# reset_index() returns a new frame, so the result has to be assigned back for the
# subset to actually be re-indexed; drop=True discards the old row labels instead of
# adding them as an extra 'index' column.
df_n = df_n.reset_index(drop=True)
df_n.head()

Unnamed: 0.1,index,Unnamed: 0,_id,id,Domain,Source,Title,Date,Slug,Currencies,URL,Created_at,Language
0,20,20,6370cd069a9af76fe9539f46,16884976,u.today,U.Today,Binance CEO Says Why You Should Avoid Exchanges That Do This,2022-11-13T10:31:00Z,Binance-CEO-Says-Why-You-Should-Avoid-Exchanges-That-Do-This,,https://cryptopanic.com/news/16884976/Binance-CEO-Says-Why-You-Should-Avoid-Exchanges-That-Do-This,2022-11-13T10:31:00Z,en
1,21,21,6370cd069a9af76fe9539f47,16884983,beincrypto.com,BeInCrypto,Bitcoin (BTC) Miners Sell Most in 5 Years,2022-11-13T10:26:03Z,Bitcoin-BTC-Miners-Sell-Most-in-5-Years,Bitcoin,https://cryptopanic.com/news/16884983/Bitcoin-BTC-Miners-Sell-Most-in-5-Years,2022-11-13T10:26:03Z,en
2,22,22,6370cd069a9af76fe9539f48,16884964,cryptoglobe.com,CryptoGlobe,Binance CEO: Deal To Acquire FTX.com ‘Did Not Make Sense From a Number of Fronts’,2022-11-13T10:20:00Z,Binance-CEO-Deal-To-Acquire-FTXcom-Did-Not-Make-Sense-From-a-Number-of-Fronts,FTX,https://cryptopanic.com/news/16884964/Binance-CEO-Deal-To-Acquire-FTXcom-Did-Not-Make-Sense-From-a-Number-of-Fronts,2022-11-13T10:20:00Z,en
3,23,23,6370cd069a9af76fe9539f49,16884948,dailyhodl.com,The Daily Hodl,"Top Crypto Strategist Warns Dogecoin and Polygon Holders, Says DOGE and MATIC Could Witness Drastic Corrections",2022-11-13T10:15:11Z,Top-Crypto-Strategist-Warns-Dogecoin-and-Polygon-Holders-Says-DOGE-and-MATIC-Could-Witness-Drastic-Corrections,Bitcoin Dogecoin Polygon,https://cryptopanic.com/news/16884948/Top-Crypto-Strategist-Warns-Dogecoin-and-Polygon-Holders-Says-DOGE-and-MATIC-Could-Witness-Drastic-Corrections,2022-11-13T10:15:11Z,en
4,24,24,6370cd069a9af76fe9539f4a,16884786,u.today,U.Today,"Shiba Inu Returns to Bottom Again, What's Next?",2022-11-13T10:00:00Z,Shiba-Inu-Returns-to-Bottom-Again-Whats-Next,Shiba Inu,https://cryptopanic.com/news/16884786/Shiba-Inu-Returns-to-Bottom-Again-Whats-Next,2022-11-13T10:00:00Z,en
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,5741,5741,63910280b73c753d0c615b7d,17094996,youtube.com,The Crypto Lark,Only 7.9% Think Bitcoin Can…,2022-12-07T21:00:07Z,Only-79-Think-Bitcoin-Can,Bitcoin,https://cryptopanic.com/news/17094996/Only-79-Think-Bitcoin-Can,2022-12-07T21:00:07Z,en
4996,5742,5742,63910280b73c753d0c615b7e,17094954,cointelegraph.com,CoinTelegraph,"Crypto consumer protection, proof of reserves bills introduced into US Congress",2022-12-07T21:00:00Z,Crypto-consumer-protection-proof-of-reserves-bills-introduced-into-US-Congress,,https://cryptopanic.com/news/17094954/Crypto-consumer-protection-proof-of-reserves-bills-introduced-into-US-Congress,2022-12-07T21:00:00Z,en
4997,5743,5743,63910280b73c753d0c615b7f,17094961,theblockcrypto.com,The Block,Signature Bank adopts caps to reduce concentration of crypto deposits,2022-12-07T20:58:22Z,Signature-Bank-adopts-caps-to-reduce-concentration-of-crypto-deposits,,https://cryptopanic.com/news/17094961/Signature-Bank-adopts-caps-to-reduce-concentration-of-crypto-deposits,2022-12-07T20:58:22Z,en
4998,5744,5744,63910280b73c753d0c615b80,17094963,dailyhodl.com,The Daily Hodl,"Michael Saylor Says SEC Should Shut Down XRP, Ethereum, Solana and Other Altcoins for Being Unregistered Securities",2022-12-07T20:55:37Z,Michael-Saylor-Says-SEC-Should-Shut-Down-XRP-Ethereum-Solana-and-Other-Altcoins-for-Being-Unregistered-Securities,Bitcoin Ethereum XRP Solana,https://cryptopanic.com/news/17094963/Michael-Saylor-Says-SEC-Should-Shut-Down-XRP-Ethereum-Solana-and-Other-Altcoins-for-Being-Unregistered-Securities,2022-12-07T20:55:37Z,en


In [None]:
df_n.shape

(5000, 12)

In [None]:
df_n.Source.nunique()

64

In [None]:
df_n.Currencies.nunique()

410

In [None]:
df_n.Currencies.unique()

array([nan, 'Bitcoin ', 'FTX ', 'Bitcoin Dogecoin Polygon ', 'Shiba Inu ',
       'Bitcoin Solana ', 'Tether Huobi ', 'Ethereum Gate ',
       'Bitcoin Ethereum ', 'TRON ', 'Ethereum Huobi ', 'Bitcoin FTX ',
       'Dogecoin ', 'XRP ', 'Solana ', 'Ethereum ', 'Bitcoin Cash ',
       'BNB Chain OEC Binance Coin ', 'Helium ', 'Ethereum FTX ',
       'Huobi ', 'Cardano ', 'Ethereum ApeCoin ', 'Ethereum Cardano FTX ',
       'Polygon ', 'Uniswap ', 'Bitcoin Ethereum FTX ', 'Cronos ',
       'OEC Binance Coin ETHDOWN ETHUP BNB Heco-Peg Binance Coin ',
       'Bitcoin Bitcoin Cash Wrapped Bitcoin (Sollet) Cronos ',
       'Chainlink FTX ', 'MCO Cronos Wrapped CRO MCO ',
       'Bitcoin Dogecoin ', 'Chiliz FTX ', 'Bitcoin Cronos ',
       'Bitcoin Ethereum Solana ', 'Axie Infinity ', 'Huobi FTX ',
       'Market Cap Bitcoin ', 'Cronos Terra Luna Classic FTX ',
       'Cardano FTX ', 'Terra Luna Classic ', 'Maker ',
       'Ethereum BNB FTX ', 'Hedera ', 'Ethereum XRP Cardano ',
       'Avalan

In [None]:
df_n.Title.head()

Unnamed: 0,Title
20,Binance CEO Says Why You Should Avoid Exchanges That Do This
21,Bitcoin (BTC) Miners Sell Most in 5 Years
22,Binance CEO: Deal To Acquire FTX.com ‘Did Not Make Sense From a Number of Fronts’
23,"Top Crypto Strategist Warns Dogecoin and Polygon Holders, Says DOGE and MATIC Could Witness Drastic Corrections"
24,"Shiba Inu Returns to Bottom Again, What's Next?"


In [None]:
# df_n['Title'] = df_n['Title'].fillna('').astype(str)

In [None]:
df_n.isna().sum()

Unnamed: 0,0
Unnamed: 0,0
_id,0
id,0
Domain,0
Source,0
Title,0
Date,0
Slug,0
Currencies,1771
URL,0


### Clean and tokenize the data

In [None]:

# Stop-word sets keyed by language, filled on first use so the NLTK corpus file is
# read once instead of once per headline.
_STOP_WORDS_CACHE = {}


def _get_stop_words(language='english'):
    """Return the NLTK stop-word set for `language`, loading it only on first use."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def preprocess_text(text):
    """Clean one headline and split it into lowercase, stop-word-free tokens.

    Pipeline: lowercase -> delete ASCII punctuation -> replace any remaining
    non-word character with a space -> delete digits -> delete one-character
    words -> collapse whitespace -> NLTK word tokenization -> drop English
    stop words.

    Args:
        text: The raw headline. A missing value (None or NaN) yields no tokens;
            any other non-string value is converted with ``str()`` first.

    Returns:
        list[str]: The remaining tokens, in their original order.
    """
    # Missing cells would otherwise crash on `.lower()`; they simply have no tokens.
    if text is None or (isinstance(text, float) and text != text):
        return []
    if not isinstance(text, str):
        text = str(text)

    # Lowercasing
    text = text.lower()
    # Remove ASCII punctuation outright (so "ftx.com" becomes "ftxcom", not two words)
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Any other non-word symbol (e.g. typographic quotes) becomes a space
    text = re.sub(r'[^\w]', ' ', text)
    # Remove numbers
    text = ''.join(char for char in text if not char.isdigit())
    # Remove single characters, e.g. a stray 'k' left behind after digit removal
    text = re.sub(r"\b\w\b", "", text)
    # Collapse runs of whitespace into single spaces
    text = ' '.join(text.split())
    # Tokenization
    tokens = nltk.word_tokenize(text)
    # Remove stopwords
    stop_words = _get_stop_words('english')
    return [word for word in tokens if word not in stop_words]

def preprocess_dataframe(df, text_column, new_column):
    """Tokenize `text_column` with `preprocess_text` and store the result in `new_column`.

    The new column is written onto `df` itself, and that same frame is returned.
    """
    token_lists = df[text_column].apply(preprocess_text)
    df[new_column] = token_lists
    return df


In [None]:
df1 = preprocess_dataframe(df_n.copy(), 'Title','clean_title')
df1.head(1)

Unnamed: 0.1,Unnamed: 0,_id,id,Domain,Source,Title,Date,Slug,Currencies,URL,Created_at,Language,clean_title
20,20,6370cd069a9af76fe9539f46,16884976,u.today,U.Today,Binance CEO Says Why You Should Avoid Exchanges That Do This,2022-11-13T10:31:00Z,Binance-CEO-Says-Why-You-Should-Avoid-Exchanges-That-Do-This,,https://cryptopanic.com/news/16884976/Binance-CEO-Says-Why-You-Should-Avoid-Exchanges-That-Do-This,2022-11-13T10:31:00Z,en,"[binance, ceo, says, avoid, exchanges]"


In [None]:
def join_tokens(tokens):
    """Collapse a sequence of word tokens into one space-separated string."""
    separator = ' '
    return separator.join(tokens)

df1['clean_title'] = df1['clean_title'].apply(join_tokens)
df1.head(1)

Unnamed: 0.1,Unnamed: 0,_id,id,Domain,Source,Title,Date,Slug,Currencies,URL,Created_at,Language,clean_title
20,20,6370cd069a9af76fe9539f46,16884976,u.today,U.Today,Binance CEO Says Why You Should Avoid Exchanges That Do This,2022-11-13T10:31:00Z,Binance-CEO-Says-Why-You-Should-Avoid-Exchanges-That-Do-This,,https://cryptopanic.com/news/16884976/Binance-CEO-Says-Why-You-Should-Avoid-Exchanges-That-Do-This,2022-11-13T10:31:00Z,en,binance ceo says avoid exchanges


to do
1. assign sentiments to the rows .. use only 5k rows **
2. train distilbert model **
3. test using train_test split **
4. do streamlit

In [None]:
## use textblob to assign sentiments to headlines
from textblob import TextBlob

def assign_sentiment_category(df, column_name):
    """Label every row of `df` with a sentiment category derived from TextBlob polarity.

    A 'sentiment' column is written onto `df` itself and the same frame is
    returned. Polarity above zero maps to "Positive", below zero to
    "Negative", and exactly zero to "Neutral".

    Args:
        df: DataFrame holding the text to score.
        column_name: Name of the text column in `df`.

    Returns:
        The input DataFrame, now carrying the 'sentiment' column.

    Raises:
        ValueError: If `column_name` is not a column of `df`.
    """
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' not found in the DataFrame.")

    def is_missing(value):
        # None, or a float NaN (NaN is the only float that is not equal to itself).
        return value is None or (isinstance(value, float) and value != value)

    def categorize_sentiment(text):
        # TextBlob only accepts strings. A missing cell has no text to score,
        # so label it Neutral instead of letting the whole apply() fail.
        if is_missing(text):
            return "Neutral"
        polarity = TextBlob(text).sentiment.polarity
        if polarity > 0:
            return "Positive"
        elif polarity < 0:
            return "Negative"
        else:
            return "Neutral"

    # Apply categorization
    df['sentiment'] = df[column_name].apply(categorize_sentiment)
    return df


In [None]:
df1.columns

Index(['Unnamed: 0', '_id', 'id', 'Domain', 'Source', 'Title', 'Date', 'Slug',
       'Currencies', 'URL', 'Created_at', 'Language', 'clean_title'],
      dtype='object')

In [None]:
## Reduce the dataframe to the only column needed before calling the function.
## Take an explicit copy: the next step adds a 'sentiment' column to this frame, and
## writing to a column selection of a wider frame is what pandas (before 3.0) reports
## as SettingWithCopyWarning.
df1 = df1[['clean_title']].copy()

In [None]:
df1 = assign_sentiment_category(df1, 'clean_title')
# print(df)

In [None]:
df1.head()

Unnamed: 0,clean_title,sentiment
20,binance ceo says avoid exchanges,Neutral
21,bitcoin btc miners sell years,Neutral
22,binance ceo deal acquire ftxcom make sense number fronts,Neutral
23,top crypto strategist warns dogecoin polygon holders says doge matic could witness drastic corrections,Positive
24,shiba inu returns bottom whats next,Neutral


In [None]:
df1.sentiment.value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
Neutral,3083
Positive,1192
Negative,725


In [None]:
df1.to_csv('sentiment_data.csv', index= False)

### train the distilbert model for sentiment classification

In [None]:
!pip install transformers torch accelerate datasets -U --quiet ## might be needed just once if errors are encountered

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/480.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m18.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver 

In [None]:
# keep_default_na=False: a cleaned title that ended up empty (or that is literally
# "na", "null", "nan", ...) must be read back as a string, not as a float NaN,
# because this column is later passed to a tokenizer.
df1 = pd.read_csv('/content/sentiment_data.csv', keep_default_na=False)
df1.head(1)

Unnamed: 0,clean_title,sentiment
0,binance ceo says avoid exchanges,Neutral


In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch

# Map sentiment to numeric labels
label_map = {'Negative': 0, 'Neutral': 1, 'Positive': 2}
df1['Label'] = df1['sentiment'].map(label_map)

# Series.map() silently turns any sentiment value that is not a key of label_map
# into NaN; stop here with a clear message instead of failing deep inside training.
unmapped_sentiments = df1.loc[df1['Label'].isna(), 'sentiment'].unique()
if len(unmapped_sentiments) > 0:
    raise ValueError(f"Sentiment values without a label mapping: {list(unmapped_sentiments)}")

# The tokenizer needs plain strings; a missing title is treated as empty text.
clean_titles = df1['clean_title'].fillna('').astype(str)

# Split dataset into train (70%), validation (15%), and test (15%) sets.
# The classes are imbalanced, so stratify both splits to keep the same class
# proportions in every subset.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    clean_titles, df1['Label'], test_size=0.3, random_state=42, stratify=df1['Label']
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42, stratify=temp_labels
)

# Convert to Hugging Face Dataset
train_data = Dataset.from_dict({'text': train_texts.tolist(), 'label': train_labels.tolist()})
val_data = Dataset.from_dict({'text': val_texts.tolist(), 'label': val_labels.tolist()})
test_data = Dataset.from_dict({'text': test_texts.tolist(), 'label': test_labels.tolist()})

# Load tokenizer and model (3 output classes: Negative / Neutral / Positive)
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=3)

# Tokenize dataset
def tokenize_function(examples):
    """Tokenize a batch of examples, padding/truncating each text to a fixed length."""
    # NOTE(review): padding="max_length" pads every headline to the tokenizer's
    # maximum length; headlines are short, so dynamic padding would likely be much
    # cheaper - confirm before changing, as it alters the batch shapes.
    return tokenizer(examples['text'], padding="max_length", truncation=True)

train_data = train_data.map(tokenize_function, batched=True)
val_data = val_data.map(tokenize_function, batched=True)
test_data = test_data.map(tokenize_function, batched=True)

# Set format for PyTorch: only these columns are returned as tensors to the Trainer
train_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_data.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    evaluation_strategy="epoch",    # evaluation strategy
    learning_rate=2e-5,             # learning rate
    per_device_train_batch_size=8,  # batch size for training
    per_device_eval_batch_size=8,   # batch size for evaluation
    num_train_epochs=3,             # number of epochs
    weight_decay=0.01,              # weight decay
    save_strategy="epoch",          # save model after each epoch (must match the evaluation strategy for load_best_model_at_end)
    logging_dir='./logs',           # directory for logs
    logging_steps=10,
    load_best_model_at_end=True,    # load best model at the end of training
)

# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Save the model and its tokenizer side by side so they can be reloaded together
model.save_pretrained('./sentiment_model')
tokenizer.save_pretrained('./sentiment_model')



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Map:   0%|          | 0/3500 [00:00<?, ? examples/s]

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

Map:   0%|          | 0/750 [00:00<?, ? examples/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.5207,0.478415
2,0.1899,0.418196
3,0.266,0.413995


('./sentiment_model/tokenizer_config.json',
 './sentiment_model/special_tokens_map.json',
 './sentiment_model/vocab.txt',
 './sentiment_model/added_tokens.json')

### Evaluation

In [None]:
def evaluate_test_set(model, tokenizer, test_data):
    """
    Evaluate the model on the test dataset and return classification metrics.

    Each text is tokenized and classified one at a time; the predicted class is
    the argmax over the model's logits. Inference runs in evaluation mode with
    gradient tracking disabled, and the model's previous train/eval mode is
    restored afterwards.

    Args:
        model: Trained DistilBERT model.
        tokenizer: Tokenizer used for the model.
        test_data: Dataset to evaluate on; read through its 'text' and 'label' columns.

    Returns:
        str: Text classification report including precision, recall, and F1-score.
    """
    # Determine device
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Make predictions
    test_texts = test_data['text']
    test_labels = test_data['label']
    predictions = []

    # Dropout must be off for deterministic predictions, and no autograd graph is
    # needed for inference. Remember the current mode so it can be put back.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for text in test_texts:
                # Tokenize and move input tensors to the appropriate device
                inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to(device)
                outputs = model(**inputs)
                pred = torch.argmax(outputs.logits, dim=1).item()
                predictions.append(pred)
    finally:
        model.train(was_training)

    # Order the class names by their numeric label so they line up with the
    # label ids regardless of the order in which label_map was written.
    target_names = [name for name, _ in sorted(label_map.items(), key=lambda item: item[1])]

    # Generate classification report
    report = classification_report(test_labels, predictions, target_names=target_names)
    return report


In [None]:
# Run evaluation
evaluation_report = evaluate_test_set(model, tokenizer, test_data)
print(evaluation_report)

              precision    recall  f1-score   support

    Negative       0.78      0.66      0.71       112
     Neutral       0.92      0.96      0.94       480
    Positive       0.91      0.87      0.89       158

    accuracy                           0.90       750
   macro avg       0.87      0.83      0.85       750
weighted avg       0.90      0.90      0.90       750

