# Try using NLP model

In [1]:
import pandas as pd
df = pd.read_csv('reddit_data_1_2021.csv')

## Data exploration

In [2]:
df.head(8)

Unnamed: 0,post_id,title,selftext,url,author,score,publish_date,num_of_comments,permalink,flair
0,pdjv8l,Bitcoin generated $2 billion in profits for Mi...,,https://digesttime.com/2021/08/28/bitcoin-gene...,thefoodboylover,1,2021-08-29 00:01:06,23,/r/Bitcoin/comments/pdjv8l/bitcoin_generated_2...,
1,pdkb1m,Bitcoin will shoot.,Bitcoin will def shoot rapidly and would be mo...,https://www.reddit.com/r/Bitcoin/comments/pdkb...,tammy_lee112,1,2021-08-29 00:28:51,0,/r/Bitcoin/comments/pdkb1m/bitcoin_will_shoot/,
2,pdkmwp,I have impossible idea. It is the incredible s...,We understand an important issue: the technolo...,https://www.reddit.com/r/Bitcoin/comments/pdkm...,cryptosyor,1,2021-08-29 00:49:57,39,/r/Bitcoin/comments/pdkmwp/i_have_impossible_i...,
3,pdkul3,Has anyone brainstormed about how Bitcoin can ...,Is it possible? It must be. I don't know enoug...,https://www.reddit.com/r/Bitcoin/comments/pdku...,dikgumdur,1,2021-08-29 01:03:33,26,/r/Bitcoin/comments/pdkul3/has_anyone_brainsto...,
4,pdl3ym,Gold mining is the largest source of CHILD LAB...,I was reading some articles today and went dow...,https://www.reddit.com/r/Bitcoin/comments/pdl3...,saccred,1,2021-08-29 01:19:52,147,/r/Bitcoin/comments/pdl3ym/gold_mining_is_the_...,
5,pdl8is,Recommend me a Bitcoin-only Wallet,Sets of requirements I needed:\n- trusted and ...,https://www.reddit.com/r/Bitcoin/comments/pdl8...,wolfur_,1,2021-08-29 01:28:04,14,/r/Bitcoin/comments/pdl8is/recommend_me_a_bitc...,
6,pdlb7g,What you should invest between Bitcoin and gold?,,https://vm.tiktok.com/ZSJcxTQyS/,StevenPhanVN,1,2021-08-29 01:33:01,11,/r/Bitcoin/comments/pdlb7g/what_you_should_inv...,
7,pdle10,"Earn Bitcoin, Dice, Plinko,Crash and More game",,https://luckyfish.xyz/?c=c_jnvawv7izs91,Skeemy34,1,2021-08-29 01:38:13,0,/r/Bitcoin/comments/pdle10/earn_bitcoin_dice_p...,


In [3]:
X=df[['title', 'selftext','score', 'publish_date', 'num_of_comments']]
X.head(2)

Unnamed: 0,title,selftext,score,publish_date,num_of_comments
0,Bitcoin generated $2 billion in profits for Mi...,,1,2021-08-29 00:01:06,23
1,Bitcoin will shoot.,Bitcoin will def shoot rapidly and would be mo...,1,2021-08-29 00:28:51,0


In [4]:
X.groupby(['num_of_comments']).nunique()

Unnamed: 0_level_0,title,selftext,score,publish_date
num_of_comments,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,2352,254,6,2444
1,702,190,7,708
2,474,168,7,475
3,291,137,7,291
4,269,141,9,270
...,...,...,...,...
908,1,0,1,1
1082,1,0,1,1
1096,1,0,1,1
1679,1,1,1,1


## Using the model

### Exploring the example provided

The model used is in this link:
    https://huggingface.co/siebert/sentiment-roberta-large-english

And the example on google colab can be found here: https://colab.research.google.com/github/chrsiebert/sentiment-roberta-large-english/blob/main/sentiment_roberta_prediction_example.ipynb

#### Installation of some stuff:

In [5]:
# Install the transformers library
!pip install transformers



Should I include transformers to the requirements?

In [6]:
!pip install torch



In [7]:
import torch

In [9]:
!pip install ipywidgets





In [10]:
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [11]:
from ipywidgets import IntProgress

In [12]:
pip show tqdm

Name: tqdm
Version: 4.62.2
Summary: Fast, Extensible Progress Meter
Home-page: https://tqdm.github.io
Author: 
Author-email: 
License: MPLv2.0, MIT Licences
Location: /home/claudia/.pyenv/versions/3.8.6/envs/cryptocurrency_trading/lib/python3.8/site-packages
Requires: 
Required-by: twine, transformers, sacremoses, huggingface-hub
Note: you may need to restart the kernel to use updated packages.


In [13]:
pip install --upgrade jupyter_client


Note: you may need to restart the kernel to use updated packages.


#### Running model

In [23]:
# Import required packages
import torch
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer

# Create class for data preparation
class SimpleDataset:
    def __init__(self, tokenized_texts):
        self.tokenized_texts = tokenized_texts
    
    def __len__(self):
        return len(self.tokenized_texts["input_ids"])
    
    def __getitem__(self, idx):
        return {k: v[idx] for k, v in self.tokenized_texts.items()}

In [14]:
# Load tokenizer and model, create trainer
model_name = "siebert/sentiment-roberta-large-english"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
trainer = Trainer(model=model)

In [15]:
torch.rand(5, 3)

tensor([[0.8823, 0.9150, 0.3829],
        [0.9593, 0.3904, 0.6009],
        [0.2566, 0.7936, 0.9408],
        [0.1332, 0.9346, 0.5936],
        [0.8694, 0.5677, 0.7411]])

In [59]:
# Create list of texts (can be imported from .csv, .xls etc.)
pred_texts = ['I like that','That is annoying','This is great!','Wouldn´t recommend it.']

In [60]:
# Tokenize texts and create prediction data set
tokenized_texts = tokenizer(pred_texts,truncation=True,padding=True)
pred_dataset = SimpleDataset(tokenized_texts)

In [61]:
# Run predictions
predictions = trainer.predict(pred_dataset)

***** Running Prediction *****
  Num examples = 4
  Batch size = 8


In [62]:
# Transform predictions to labels
preds = predictions.predictions.argmax(-1)
labels = pd.Series(preds).map(model.config.id2label)
scores = (np.exp(predictions[0])/np.exp(predictions[0]).sum(-1,keepdims=True)).max(1)

In [63]:
# Create DataFrame with texts, predictions, labels, and scores
df = pd.DataFrame(list(zip(pred_texts,preds,labels,scores)), columns=['text','pred','label','score'])
df.head()

Unnamed: 0,text,pred,label,score
0,I like that,1,POSITIVE,0.998657
1,That is annoying,0,NEGATIVE,0.999409
2,This is great!,1,POSITIVE,0.998727
3,Wouldn´t recommend it.,0,NEGATIVE,0.999486


### Trying with our data:

In [64]:
X.head(2)

Unnamed: 0,title,selftext,score,publish_date,num_of_comments
0,Bitcoin generated $2 billion in profits for Mi...,,1,2021-08-29 00:01:06,23
1,Bitcoin will shoot.,Bitcoin will def shoot rapidly and would be mo...,1,2021-08-29 00:28:51,0


In [76]:
list(X.dropna(subset = ['selftext'])['selftext'][0:4]);

In [71]:
to_predict = list(X.dropna(subset = ['selftext'])['selftext'][0:4])

In [72]:
# Tokenize texts and create prediction data set
tokenized_texts = tokenizer(to_predict,truncation=True,padding=True)
pred_dataset = SimpleDataset(tokenized_texts)

In [73]:
predictions = trainer.predict(pred_dataset)

***** Running Prediction *****
  Num examples = 4
  Batch size = 8


In [74]:
# Transform predictions to labels
preds = predictions.predictions.argmax(-1)
labels = pd.Series(preds).map(model.config.id2label)
scores = (np.exp(predictions[0])/np.exp(predictions[0]).sum(-1,keepdims=True)).max(1)

In [75]:
df = pd.DataFrame(list(zip(to_predict,preds,labels,scores)), columns=['text','pred','label','score'])
df.head()

Unnamed: 0,text,pred,label,score
0,Bitcoin will def shoot rapidly and would be mo...,1,POSITIVE,0.998567
1,We understand an important issue: the technolo...,1,POSITIVE,0.998622
2,Is it possible? It must be. I don't know enoug...,0,NEGATIVE,0.993315
3,I was reading some articles today and went dow...,1,POSITIVE,0.998422


### Try to make a pipeline out of it

In [77]:
from sklearn.pipeline import Pipeline

In [None]:
# Pipeline([
    
# ])

In [80]:
# maybe a function works better:
def create_preddiction(pred_texts):
    tokenized_texts = tokenizer(pred_texts,truncation=True,padding=True)
    pred_dataset = SimpleDataset(tokenized_texts)
    # Run predictions
    predictions = trainer.predict(pred_dataset)
    # Transform predictions to labels
    preds = predictions.predictions.argmax(-1)
    labels = pd.Series(preds).map(model.config.id2label)
    scores = (np.exp(predictions[0])/np.exp(predictions[0]).sum(-1,keepdims=True)).max(1)
    # Create a dataframe to return:
    return pd.DataFrame(list(zip(to_predict,preds,labels,scores)), columns=['text','pred','label','score'])

In [81]:
create_preddiction(pred_texts)

***** Running Prediction *****
  Num examples = 4
  Batch size = 8


Unnamed: 0,text,pred,label,score
0,Bitcoin will def shoot rapidly and would be mo...,1,POSITIVE,0.998657
1,We understand an important issue: the technolo...,0,NEGATIVE,0.999409
2,Is it possible? It must be. I don't know enoug...,1,POSITIVE,0.998727
3,I was reading some articles today and went dow...,0,NEGATIVE,0.999486


In [None]:
# to call the model:
