## Sentiment Analysis from CryptoLin

### Importing the data of CryptoLin for finBERT model Training 

In [1]:
import warnings
import pandas as pd
warnings.filterwarnings('ignore')
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [2]:
df = pd.read_csv("CryptoLin_IE_v2.csv")
c = df['final_manual_labelling'].value_counts()
p = df['final_manual_labelling'].value_counts(normalize=True)
pd.concat([c,p], axis=1, keys=['counts', '%'])

Unnamed: 0,counts,%
1,1366,0.509132
0,921,0.343272
-1,396,0.147596


In [3]:
df.columns

Index(['id', 'date', 'news', 'final_manual_labelling', 'text_span',
       'type_abnormal_return_fama_frech', 'vader', 'textblob', 'flair',
       'finbert_positive', 'finbert_negative', 'finbert_neutral',
       'vader_class', 'textblob_class', 'flair_class',
       'finbert_positive_class', 'finbert_negative_class',
       'finbert_neutral_class'],
      dtype='object')

In [4]:
df.drop(['id'],inplace=True,axis=1)

In [5]:
df.head()

Unnamed: 0,date,news,final_manual_labelling,text_span,type_abnormal_return_fama_frech,vader,textblob,flair,finbert_positive,finbert_negative,finbert_neutral,vader_class,textblob_class,flair_class,finbert_positive_class,finbert_negative_class,finbert_neutral_class
0,2022-01-25,"Ripple announces stock buyback, nabs $15 billi...",1,{annotator1_id:22;annotator1_label:1; annotato...,0,0.0,0.0,0.875877,0.098288,-0.020569,0.881142,0,-1,-1,1,0,1
1,2022-01-25,IMF directors urge El Salvador to remove Bitco...,-1,{annotator1_id:16;annotator1_label:-1; annotat...,0,0.128,0.2,0.998796,0.047823,-0.162971,0.789206,1,1,1,-1,-1,1
2,2022-01-25,Dragonfly Capital is raising $500 million for ...,1,{annotator1_id:45;annotator1_label:1; annotato...,0,0.0,0.136364,0.984027,0.156997,-0.008097,0.834906,0,1,1,1,1,1
3,2022-01-25,Rick and Morty co-creator collaborates with Pa...,0,{annotator1_id:32;annotator1_label:0; annotato...,0,0.0,0.0,0.996666,0.055608,-0.015489,0.928903,0,-1,1,0,0,1
4,2022-01-25,How fintech SPACs lost their shine,0,{annotator1_id:48;annotator1_label:0; annotato...,0,-0.3182,0.0,0.999921,0.039964,-0.472788,0.487248,-1,-1,1,-1,-1,-1


In [6]:
#We ae going to use the reduce manual_labeled df for training our finBERT model:
input_df = df[['date','news','final_manual_labelling','text_span']]

## FinBERT Sentiment Analysis
ref: https://wandb.ai/ivangoncharov/FinBERT_Sentiment_Analysis_Project/reports/Financial-Sentiment-Analysis-on-Stock-Market-Headlines-With-FinBERT-Hugging-Face--VmlldzoxMDQ4NjM0

HuggingFace makes it really easy for us to try out different NLP models. We can find the FinBERT model on the HuggingFace model hub (https://huggingface.co/ProsusAI/finbert) & even run a test inference using a little text box right on their website (https://huggingface.co/ProsusAI/finbert)! 

#### Using Finbert as it is, checking the output 

In [7]:
#!pip install transformers

In [8]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [9]:
tokenizer = AutoTokenizer.from_pretrained("ProsusAI/finbert")
model = AutoModelForSequenceClassification.from_pretrained("ProsusAI/finbert")

In [10]:
def apply_finbert(x):
    inputs = tokenizer([x], padding = True, truncation = True, return_tensors='pt')
    outputs = model(**inputs)
    predictions = torch.nn.functional.softmax(outputs.logits, dim=1) 
    return predictions[:, 0].tolist()[0], predictions[:, 1].tolist()[0], predictions[:, 2].tolist()[0]

In [11]:
input_df[['Positive','Negative','Neutral']] = (input_df['news'].apply(apply_finbert)).apply(pd.Series)

In [12]:
input_df

Unnamed: 0,date,news,final_manual_labelling,text_span,Positive,Negative,Neutral
0,2022-01-25,"Ripple announces stock buyback, nabs $15 billi...",1,{annotator1_id:22;annotator1_label:1; annotato...,0.098288,0.020569,0.881142
1,2022-01-25,IMF directors urge El Salvador to remove Bitco...,-1,{annotator1_id:16;annotator1_label:-1; annotat...,0.047823,0.162971,0.789207
2,2022-01-25,Dragonfly Capital is raising $500 million for ...,1,{annotator1_id:45;annotator1_label:1; annotato...,0.156997,0.008097,0.834906
3,2022-01-25,Rick and Morty co-creator collaborates with Pa...,0,{annotator1_id:32;annotator1_label:0; annotato...,0.055608,0.015489,0.928903
4,2022-01-25,How fintech SPACs lost their shine,0,{annotator1_id:48;annotator1_label:0; annotato...,0.039964,0.472788,0.487248
...,...,...,...,...,...,...,...
2678,2020-05-01,Gambling for a good cause  CryptoSlots donate...,1,{annotator1_id:80;annotator1_label:1; annotato...,0.178831,0.008580,0.812589
2679,2020-04-18,"Litecoin, The Chinese Alternative to Bitcoin",0,{annotator1_id:10;annotator1_label:0; annotato...,0.105272,0.009314,0.885414
2680,2020-04-10,Do You Know What is Happening to Money?,0,{annotator1_id:32;annotator1_label:0; annotato...,0.027453,0.304318,0.668229
2681,2018-07-30,Download CoinMarketCal app on App Store,0,{annotator1_id:33;annotator1_label:0; annotato...,0.046135,0.015244,0.938621


In [13]:
input_df[input_df['Positive']>0.5]['final_manual_labelling'].value_counts()

 1    419
 0     95
-1     11
Name: final_manual_labelling, dtype: int64

In [14]:
input_df[input_df['Negative']>0.5]['final_manual_labelling'].value_counts()

-1    202
 0     73
 1     32
Name: final_manual_labelling, dtype: int64

In [15]:
input_df[input_df['Neutral']>0.5]['final_manual_labelling'].value_counts()

 1    901
 0    743
-1    172
Name: final_manual_labelling, dtype: int64

In [16]:
def calculate_result(df):
    largo = df.shape[0]
    result = []
    for i in range(0,largo):
        if df.iloc[i]['Positive']>0.5:
            result.append(1)
        elif df.iloc[i]['Negative']>0.5:
            result.append(-1)
        elif df.iloc[i]['Neutral']>0.5:
            result.append(0)
        elif df.iloc[i]['Neutral']>df.iloc[i]['Positive'] and df.iloc[i]['Neutral']>df.iloc[i]['Negative']:
            result.append(0)
        elif df.iloc[i]['Positive']>df.iloc[i]['Neutral'] and df.iloc[i]['Positive']>df.iloc[i]['Negative']:
            result.append(1)
        else:
            result.append(-1)

    df['result']=result
    return df

In [17]:
non_trained_result = calculate_result(input_df)

In [18]:
non_trained_result.head(10)

Unnamed: 0,date,news,final_manual_labelling,text_span,Positive,Negative,Neutral,result
0,2022-01-25,"Ripple announces stock buyback, nabs $15 billi...",1,{annotator1_id:22;annotator1_label:1; annotato...,0.098288,0.020569,0.881142,0
1,2022-01-25,IMF directors urge El Salvador to remove Bitco...,-1,{annotator1_id:16;annotator1_label:-1; annotat...,0.047823,0.162971,0.789207,0
2,2022-01-25,Dragonfly Capital is raising $500 million for ...,1,{annotator1_id:45;annotator1_label:1; annotato...,0.156997,0.008097,0.834906,0
3,2022-01-25,Rick and Morty co-creator collaborates with Pa...,0,{annotator1_id:32;annotator1_label:0; annotato...,0.055608,0.015489,0.928903,0
4,2022-01-25,How fintech SPACs lost their shine,0,{annotator1_id:48;annotator1_label:0; annotato...,0.039964,0.472788,0.487248,0
5,2022-01-25,Multichain vulnerability put a billion dollars...,-1,{annotator1_id:77;annotator1_label:-1; annotat...,0.101357,0.194968,0.703675,0
6,2022-01-25,YouTube wants to help content creators capital...,0,{annotator1_id:52;annotator1_label:0; annotato...,0.475624,0.007213,0.517163,0
7,2022-01-25,OpenSea is reimbursing users who sold NFTs bel...,0,{annotator1_id:10;annotator1_label:0; annotato...,0.011183,0.952579,0.036238,-1
8,2022-01-25,GoodDollar Launches Key Protocol Upgrade to Ex...,1,{annotator1_id:22;annotator1_label:1; annotato...,0.853262,0.009177,0.137562,1
9,2022-01-25,BCB Group raises a $60 million Series A round ...,1,{annotator1_id:43;annotator1_label:1; annotato...,0.218266,0.010609,0.771125,0


In [19]:
#The accuracy with the original FinBERT:
accuracy_score(y_true=non_trained_result['final_manual_labelling'], y_pred=non_trained_result['result'])

0.5143496086470369

## Retraining Finbert

In [20]:
df = input_df[['news','final_manual_labelling']]

In [21]:
df.head()

Unnamed: 0,news,final_manual_labelling
0,"Ripple announces stock buyback, nabs $15 billi...",1
1,IMF directors urge El Salvador to remove Bitco...,-1
2,Dragonfly Capital is raising $500 million for ...,1
3,Rick and Morty co-creator collaborates with Pa...,0
4,How fintech SPACs lost their shine,0


In [22]:
df.columns = ['news','labels']
df.head()

Unnamed: 0,news,labels
0,"Ripple announces stock buyback, nabs $15 billi...",1
1,IMF directors urge El Salvador to remove Bitco...,-1
2,Dragonfly Capital is raising $500 million for ...,1
3,Rick and Morty co-creator collaborates with Pa...,0
4,How fintech SPACs lost their shine,0


In [23]:
#Giving a +1 offset to the target variable:
df['labels'].replace({1:2},inplace=True)
df['labels'].replace({0:1},inplace=True)
df['labels'].replace({-1:0},inplace=True)

In [24]:
df['labels'].value_counts()

2    1366
1     921
0     396
Name: labels, dtype: int64

In [25]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
import torch
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback
import numpy as np

In [26]:
# Preprocess data
X = list(df["news"])
y = list(df["labels"])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
X_val_2, X_test, y_val_2, y_test = train_test_split(X_val, y_val, test_size=0.2)
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val_2, padding=True, truncation=True, max_length=512)
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)

In [27]:
# Create torch dataset
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [28]:
train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)

In [29]:
# Define Trainer parameters
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    #recall = recall_score(y_true=labels, y_pred=pred,'weighted')
    #precision = precision_score(y_true=labels, y_pred=pred)
    #f1 = f1_score(y_true=labels, y_pred=pred)
    
    return {"accuracy": accuracy}#, "precision": precision, "recall": recall, "f1": f1}

In [30]:
# Define Trainer
args = TrainingArguments(
    output_dir="first_test",
    evaluation_strategy="steps",
    eval_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    seed=0,
    load_best_model_at_end=True,
)

In [31]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [32]:
# Train pre-trained model
trainer.train()

***** Running training *****
  Num examples = 2146
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1076


Step,Training Loss,Validation Loss,Accuracy
500,0.6808,1.950375,0.431235
1000,0.2728,3.390586,0.379953


***** Running Evaluation *****
  Num examples = 429
  Batch size = 8
Saving model checkpoint to first_test\checkpoint-500
Configuration saved in first_test\checkpoint-500\config.json
Model weights saved in first_test\checkpoint-500\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 429
  Batch size = 8
Saving model checkpoint to first_test\checkpoint-1000
Configuration saved in first_test\checkpoint-1000\config.json
Model weights saved in first_test\checkpoint-1000\pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from first_test\checkpoint-500 (score: 1.9503753185272217).


TrainOutput(global_step=1076, training_loss=0.451753720031795, metrics={'train_runtime': 4009.9953, 'train_samples_per_second': 2.141, 'train_steps_per_second': 0.268, 'total_flos': 185272957552032.0, 'train_loss': 0.451753720031795, 'epoch': 4.0})

In [33]:
# ----- 3. Predicting -----#
# Create torch dataset
test_dataset = Dataset(X_test_tokenized, y_test)

In [34]:
# Load trained model
model_path = "first_test/checkpoint-500"
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=3)

loading configuration file first_test/checkpoint-500\config.json
Model config BertConfig {
  "_name_or_path": "ProsusAI/finbert",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "positive",
    "1": "negative",
    "2": "neutral"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 1,
    "neutral": 2,
    "positive": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.20.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights file first

In [35]:
# Define test trainer
test_trainer = Trainer(model)

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [36]:
 #Make prediction
raw_pred, _, _ = test_trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 108
  Batch size = 8


In [37]:
# Preprocess raw predictions
y_pred = np.argmax(raw_pred, axis=1)

In [38]:
#The accuracy with the re-trained FinBERT:
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6759259259259259

In [39]:
#The accuracy of the original finbert:
X_test_df = pd.DataFrame()
X_test_df['news']=X_test
X_test_df['labels']=y_test

In [40]:
X_test_df.head()

Unnamed: 0,news,labels
0,DeFi platform Parsec launches with $1.25 milli...,2
1,Ontario securities regulator slams Binance aft...,0
2,CME to launch micro ether futures next month,2
3,Coinbase to offer secondary market for private...,2
4,Bank of France completes CBDC settlement exper...,1


In [41]:
X_test_df.shape

(108, 2)

In [42]:
X_test_df['labels'].replace({0:-1},inplace=True)
X_test_df['labels'].replace({1:0},inplace=True)
X_test_df['labels'].replace({2:1},inplace=True)

In [43]:
X_test_df.head()

Unnamed: 0,news,labels
0,DeFi platform Parsec launches with $1.25 milli...,1
1,Ontario securities regulator slams Binance aft...,-1
2,CME to launch micro ether futures next month,1
3,Coinbase to offer secondary market for private...,1
4,Bank of France completes CBDC settlement exper...,0


In [44]:
X_test_df[['Positive','Negative','Neutral']] = (X_test_df['news'].apply(apply_finbert)).apply(pd.Series)

In [45]:
result_df = calculate_result(X_test_df)

In [46]:
result_df.head(10)

Unnamed: 0,news,labels,Positive,Negative,Neutral,result
0,DeFi platform Parsec launches with $1.25 milli...,1,0.00441,0.041571,0.954019,0
1,Ontario securities regulator slams Binance aft...,-1,0.9548,0.034571,0.010629,1
2,CME to launch micro ether futures next month,1,0.007117,0.230409,0.762474,0
3,Coinbase to offer secondary market for private...,1,0.005643,0.198445,0.795912,0
4,Bank of France completes CBDC settlement exper...,0,0.007316,0.18745,0.805234,0
5,State Street named fund administrator for VanE...,1,0.004823,0.09123,0.903947,0
6,Rise in privately funded crypto unicorns worri...,-1,0.198039,0.099655,0.702306,0
7,The MiamiCoin project has generated $22.5 mill...,0,0.00615,0.044683,0.949167,0
8,"Gemini pledges to offset 350,000 tons of bitco...",1,0.857962,0.103936,0.038102,1
9,Citadel Securities raises $1.15 billion from S...,1,0.004999,0.040776,0.954226,0


In [47]:
result_df.shape

(108, 6)

In [48]:
#The accuracy with the original FinBERT:
accuracy_score(y_true=result_df['labels'], y_pred=result_df['result'])

0.18518518518518517

### Retraining with a second set of arguments 

In [49]:
# Define Trainer
args = TrainingArguments(
        output_dir = 'second_test/',
        evaluation_strategy = 'epoch',
        save_strategy = 'epoch',
        learning_rate=2e-5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=4,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model='accuracy',
)


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [50]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [51]:
# Train pre-trained model
trainer.train()

***** Running training *****
  Num examples = 2146
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 272


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,2.270327,0.386946
2,No log,2.563083,0.379953
3,No log,2.832335,0.384615
4,No log,2.900656,0.386946


***** Running Evaluation *****
  Num examples = 429
  Batch size = 32
Saving model checkpoint to second_test/checkpoint-68
Configuration saved in second_test/checkpoint-68\config.json
Model weights saved in second_test/checkpoint-68\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 429
  Batch size = 32
Saving model checkpoint to second_test/checkpoint-136
Configuration saved in second_test/checkpoint-136\config.json
Model weights saved in second_test/checkpoint-136\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 429
  Batch size = 32
Saving model checkpoint to second_test/checkpoint-204
Configuration saved in second_test/checkpoint-204\config.json
Model weights saved in second_test/checkpoint-204\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 429
  Batch size = 32
Saving model checkpoint to second_test/checkpoint-272
Configuration saved in second_test/checkpoint-272\config.json
Model weights saved in second_test/checkpoint-272\pytor

TrainOutput(global_step=272, training_loss=0.17228883855483113, metrics={'train_runtime': 3319.2386, 'train_samples_per_second': 2.586, 'train_steps_per_second': 0.082, 'total_flos': 185272957552032.0, 'train_loss': 0.17228883855483113, 'epoch': 4.0})

In [52]:
# ----- 3. Predicting -----#
# Create torch dataset
test_dataset = Dataset(X_test_tokenized, y_test)

In [53]:
# Load trained model
model_path = "second_test/checkpoint-204"
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=3)

loading configuration file second_test/checkpoint-204\config.json
Model config BertConfig {
  "_name_or_path": "first_test/checkpoint-500",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "positive",
    "1": "negative",
    "2": "neutral"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 1,
    "neutral": 2,
    "positive": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.20.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights 

In [54]:
# Define test trainer
test_trainer = Trainer(model)

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [55]:
 #Make prediction
raw_pred, _, _ = test_trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 108
  Batch size = 8


In [56]:
# Preprocess raw predictions
y_pred = np.argmax(raw_pred, axis=1)

In [57]:
#The accuracy with the re-trained FinBERT:
accuracy_score(y_true=y_test, y_pred=y_pred)

0.6759259259259259

## Testing different approaches to improve the model accuracy 

### Removing the Stopwords 

In [58]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stop_words = stopwords.words('english')
df['news'] = df['news'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))

[nltk_data] Downloading package stopwords to C:\Users\Nicolas
[nltk_data]     Ponte\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [59]:
df["news"] = df["news"].str.lower()

In [60]:
import string
df["news"] = df['news'].str.replace('[^\w\s]','')
print(df['news'])

0       ripple announces stock buyback nabs 15 billion...
1       imf directors urge el salvador remove bitcoin ...
2          dragonfly capital raising 500 million new fund
3       rick morty cocreator collaborates paradigm nft...
4                            how fintech spacs lost shine
                              ...                        
2678    gambling good cause  cryptoslots donates proce...
2679             litecoin the chinese alternative bitcoin
2680                     do you know what happening money
2681                 download coinmarketcal app app store
2682               download coinmarketcal app google play
Name: news, Length: 2683, dtype: object


In [61]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
nltk.download('punkt')
nltk.download('wordnet')

lemmatizer = WordNetLemmatizer()

#Function to apply for each word the proper lemmatization.
def lemmetize_titles(words):
    a = []
    tokens = word_tokenize(words)
    for token in tokens:
        lemmetized_word = lemmatizer.lemmatize(token)
        a.append(lemmetized_word)
    lemmatized_title = ' '.join(a)
    return lemmatized_title

[nltk_data] Downloading package punkt to C:\Users\Nicolas
[nltk_data]     Ponte\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Nicolas
[nltk_data]     Ponte\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [62]:
df['lemmetized_titles'] = df['news'].apply(lemmetize_titles)

In [63]:
df.head(20)

Unnamed: 0,news,labels,lemmetized_titles
0,ripple announces stock buyback nabs 15 billion...,2,ripple announces stock buyback nabs 15 billion...
1,imf directors urge el salvador remove bitcoin ...,0,imf director urge el salvador remove bitcoin l...
2,dragonfly capital raising 500 million new fund,2,dragonfly capital raising 500 million new fund
3,rick morty cocreator collaborates paradigm nft...,1,rick morty cocreator collaborates paradigm nft...
4,how fintech spacs lost shine,1,how fintech spacs lost shine
5,multichain vulnerability put billion dollars r...,0,multichain vulnerability put billion dollar ri...
6,youtube wants help content creators capitalize...,1,youtube want help content creator capitalize nfts
7,opensea reimbursing users sold nfts market val...,1,opensea reimbursing user sold nfts market valu...
8,gooddollar launches key protocol upgrade expan...,2,gooddollar launch key protocol upgrade expand ...
9,bcb group raises 60 million series a round col...,2,bcb group raise 60 million series a round cole...


In [64]:
# Preprocess data
X = list(df["lemmetized_titles"])
y = list(df["labels"])
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)
X_val_2, X_test, y_val_2, y_test = train_test_split(X_val, y_val, test_size=0.2)
X_train_tokenized = tokenizer(X_train, padding=True, truncation=True, max_length=512)
X_val_tokenized = tokenizer(X_val_2, padding=True, truncation=True, max_length=512)
X_test_tokenized = tokenizer(X_test, padding=True, truncation=True, max_length=512)

In [65]:
train_dataset = Dataset(X_train_tokenized, y_train)
val_dataset = Dataset(X_val_tokenized, y_val)
test_dataset = Dataset(X_test_tokenized, y_test)

In [66]:
# Define Trainer
args = TrainingArguments(
        output_dir = 'third_test/',
        evaluation_strategy = 'epoch',
        save_strategy = 'epoch',
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=4,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model='accuracy',
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [67]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [68]:
# Train pre-trained model
trainer.train()

***** Running training *****
  Num examples = 2146
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 1076


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.963239,0.421911
2,0.403400,2.88425,0.410256
3,0.403400,3.467348,0.428904
4,0.157600,3.653231,0.426573


***** Running Evaluation *****
  Num examples = 429
  Batch size = 8
Saving model checkpoint to third_test/checkpoint-269
Configuration saved in third_test/checkpoint-269\config.json
Model weights saved in third_test/checkpoint-269\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 429
  Batch size = 8
Saving model checkpoint to third_test/checkpoint-538
Configuration saved in third_test/checkpoint-538\config.json
Model weights saved in third_test/checkpoint-538\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 429
  Batch size = 8
Saving model checkpoint to third_test/checkpoint-807
Configuration saved in third_test/checkpoint-807\config.json
Model weights saved in third_test/checkpoint-807\pytorch_model.bin
***** Running Evaluation *****
  Num examples = 429
  Batch size = 8
Saving model checkpoint to third_test/checkpoint-1076
Configuration saved in third_test/checkpoint-1076\config.json
Model weights saved in third_test/checkpoint-1076\pytorch_model.b

TrainOutput(global_step=1076, training_loss=0.26973394301744197, metrics={'train_runtime': 3369.8574, 'train_samples_per_second': 2.547, 'train_steps_per_second': 0.319, 'total_flos': 136749087716976.0, 'train_loss': 0.26973394301744197, 'epoch': 4.0})

In [69]:
# Load trained model
model_path = "third_test/checkpoint-269"
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=3)

loading configuration file third_test/checkpoint-269\config.json
Model config BertConfig {
  "_name_or_path": "second_test/checkpoint-204",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "positive",
    "1": "negative",
    "2": "neutral"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "negative": 1,
    "neutral": 2,
    "positive": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  "torch_dtype": "float32",
  "transformers_version": "4.20.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading weights 

In [70]:
# Define test trainer
test_trainer = Trainer(model)

No `TrainingArguments` passed, using `output_dir=tmp_trainer`.
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [71]:
 #Make prediction
raw_pred, _, _ = test_trainer.predict(test_dataset)

***** Running Prediction *****
  Num examples = 108
  Batch size = 8


In [72]:
# Preprocess raw predictions
y_pred = np.argmax(raw_pred, axis=1)

In [73]:
#The accuracy with the re-trained FinBERT:
accuracy_score(y_true=y_test, y_pred=y_pred)

0.8333333333333334

In [74]:
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score, confusion_matrix

In [75]:
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[18,  1,  1],
       [ 2, 22, 11],
       [ 0,  3, 50]], dtype=int64)

In [76]:
raw_pred, _, _ = test_trainer.predict(train_dataset)

***** Running Prediction *****
  Num examples = 2146
  Batch size = 8


In [77]:
y_pred = np.argmax(raw_pred, axis=1)

In [78]:
accuracy_score(y_true=y_train, y_pred=y_pred)

0.9249767008387698

In [79]:
confusion_matrix(y_true=y_train, y_pred=y_pred)

array([[ 304,   10,    7],
       [  18,  617,   99],
       [   4,   23, 1064]], dtype=int64)

### Checking Manoel Result 

In [109]:
df_manoel = pd.read_csv('../OneYearNewsDataset_AfterRelevance.csv')

In [110]:
df_manoel.head(5)

Unnamed: 0.1,Unnamed: 0,url,url_mobile,title,seendate,socialimage,domain,language,sourcecountry,lemmetized_titles,relevance_probability,relevance_class
0,0,https://www.digitaljournal.com/pr/longhash-ven...,,LongHash Ventures and Terraform Labs Join Forc...,20220406T163000Z,,digitaljournal.com,English,United States,longhash venture terraform lab join force adva...,0.608537,1.0
1,1,https://www.prnewswire.com/news-releases/terra...,,TERRA . DO TO COMPETE IN FINAL 20 GROUP FOR ED...,20220406T001500Z,,prnewswire.com,English,United States,terra compete final 20 group edtech competitio...,0.289958,0.0
2,2,https://www.fool.com/investing/2022/04/06/can-...,,Can THORchain Keep Surging ? | The Motley Fool,20220406T120000Z,,fool.com,English,United States,thorchain keep surging motley fool,0.288098,0.0
3,3,https://www.finanznachrichten.de/nachrichten-2...,,Gold Terra Resource Corp : Gold Terra Intersec...,20220406T123000Z,,finanznachrichten.de,English,Germany,gold terra resource corp gold terra intersects...,0.201722,0.0
4,4,https://economictimes.indiatimes.com/tech/tech...,,Crypto wallet Leap raises $3 . 2 million throu...,20220406T114500Z,,economictimes.indiatimes.com,English,India,crypto wallet leap raise 3 2 million token sale,0.719981,1.0


In [111]:
df_manoel.drop(['Unnamed: 0'],axis=1,inplace=True)

In [112]:
df_manoel.head(5)

Unnamed: 0,url,url_mobile,title,seendate,socialimage,domain,language,sourcecountry,lemmetized_titles,relevance_probability,relevance_class
0,https://www.digitaljournal.com/pr/longhash-ven...,,LongHash Ventures and Terraform Labs Join Forc...,20220406T163000Z,,digitaljournal.com,English,United States,longhash venture terraform lab join force adva...,0.608537,1.0
1,https://www.prnewswire.com/news-releases/terra...,,TERRA . DO TO COMPETE IN FINAL 20 GROUP FOR ED...,20220406T001500Z,,prnewswire.com,English,United States,terra compete final 20 group edtech competitio...,0.289958,0.0
2,https://www.fool.com/investing/2022/04/06/can-...,,Can THORchain Keep Surging ? | The Motley Fool,20220406T120000Z,,fool.com,English,United States,thorchain keep surging motley fool,0.288098,0.0
3,https://www.finanznachrichten.de/nachrichten-2...,,Gold Terra Resource Corp : Gold Terra Intersec...,20220406T123000Z,,finanznachrichten.de,English,Germany,gold terra resource corp gold terra intersects...,0.201722,0.0
4,https://economictimes.indiatimes.com/tech/tech...,,Crypto wallet Leap raises $3 . 2 million throu...,20220406T114500Z,,economictimes.indiatimes.com,English,India,crypto wallet leap raise 3 2 million token sale,0.719981,1.0


In [113]:
df_manoel['lemmetized_titles'].isna().sum()

82

In [114]:
#dropping blanks:
df_manoel.dropna(subset=['lemmetized_titles'],inplace=True)

In [115]:
df_manoel.shape

(243422, 11)

In [116]:
X = df_manoel['lemmetized_titles'].tolist()

In [117]:
X

['longhash venture terraform lab join force advance web3 project terra blockchain',
 'terra compete final 20 group edtech competition asu gsv summit san diego',
 'thorchain keep surging motley fool',
 'gold terra resource corp gold terra intersects 6 41 gold 26 50 metre including 14 15 g 5 50 meter yellorex zone yellowknife nwt drilling continues con mine property',
 'crypto wallet leap raise 3 2 million token sale',
 'top photo day jp reuters com',
 'speakeasy bar columbus five try',
 'shop 111 west 57th street supertall tower one important step closer completion',
 'abb technology help copenhagen airport become denmark largest ev charging site',
 'longhash announces partnership terraform lab',
 'renowned canadian conductor boris brott died',
 'bitcoin ethereum positive momentum revers',
 '3 restaurant dine april',
 'national 2022 season exciting new thing national park dc',
 'technology focus national park prepares new season',
 'spinlaunch rocket flinging launch system loft nasa pay

In [118]:
len(X)

243422

In [119]:
y = np.ones(len(X))

In [120]:
len(y)

243422

In [121]:
y = [int(x) for x in y]

In [122]:
XT = tokenizer(X, padding=True, truncation=True, max_length=512)

In [123]:
train_dataset = Dataset(XT, y)

In [124]:
raw_pred, raw_pred2, raw_pred3 = test_trainer.predict(train_dataset)

***** Running Prediction *****
  Num examples = 243422
  Batch size = 8


In [125]:
raw_pred

array([[-2.91391   ,  1.1038746 ,  2.7978225 ],
       [-1.9013687 ,  2.2464314 ,  0.5738592 ],
       [-0.6870062 ,  1.4756547 ,  0.14030182],
       ...,
       [-1.4556764 ,  2.7135031 , -0.9773732 ],
       [ 2.3284633 , -0.08987612, -2.873111  ],
       [-2.6961954 ,  0.14528114,  3.3509033 ]], dtype=float32)

In [126]:
raw_pred

array([[-2.91391   ,  1.1038746 ,  2.7978225 ],
       [-1.9013687 ,  2.2464314 ,  0.5738592 ],
       [-0.6870062 ,  1.4756547 ,  0.14030182],
       ...,
       [-1.4556764 ,  2.7135031 , -0.9773732 ],
       [ 2.3284633 , -0.08987612, -2.873111  ],
       [-2.6961954 ,  0.14528114,  3.3509033 ]], dtype=float32)

In [127]:
y_pred = np.argmax(raw_pred, axis=1)

In [128]:
y_pred

array([2, 1, 1, ..., 1, 0, 2], dtype=int64)

In [129]:
raw_pred[0]

array([-2.91391  ,  1.1038746,  2.7978225], dtype=float32)

In [130]:
from math import exp

In [131]:
exp(raw_pred[0][0])/(exp(raw_pred[0][0])+exp(raw_pred[0][1])+exp(raw_pred[0][2]))

0.0027857301512203957

In [132]:
exp(raw_pred[0][1])/(exp(raw_pred[0][0])+exp(raw_pred[0][1])+exp(raw_pred[0][2]))

0.1548248515637921

In [133]:
exp(raw_pred[0][2])/(exp(raw_pred[0][0])+exp(raw_pred[0][1])+exp(raw_pred[0][2]))

0.8423894182849875

In [134]:
neg_class = []
neut_class = []
pos_class = []

for item in raw_pred:
    neg = exp(item[0])/(exp(item[0])+exp(item[1])+exp(item[2]))
    neut = exp(item[1])/(exp(item[0])+exp(item[1])+exp(item[2]))
    pos = exp(item[2])/(exp(item[0])+exp(item[1])+exp(item[2]))
    neg_class.append(neg)
    neut_class.append(neut)
    pos_class.append(pos)

In [135]:
df_manoel['sentiment_negative_probability']=neg_class
df_manoel['sentiment_neutral_probability']=neut_class
df_manoel['sentiment_positive_probability']=pos_class
df_manoel['sentiment_class']=y_pred-1

In [136]:
df_manoel.to_csv('../OneYearNewsDataset_AfterSentiment.csv')