In [4]:
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re
nltk.download('vader_lexicon')

# Load your CSV data into a DataFrame
df = pd.read_csv('tweets.csv', sep=';')
df.head()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/mehdimouden/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
  df = pd.read_csv('tweets.csv', sep=';')


Unnamed: 0,id,user,fullname,url,timestamp,replies,likes,retweets,text
0,1132977055300300800,KamdemAbdiel,Abdiel kamdem,,2019-05-27 11:49:14+00,0.0,0.0,0.0,È appena uscito un nuovo video! LES CRYPTOMONN...
1,1132977073402736640,bitcointe,Bitcointe,,2019-05-27 11:49:18+00,0.0,0.0,0.0,Cardano: Digitize Currencies; EOS https://t.co...
2,1132977023893139456,3eyedbran,Bran - 3 Eyed Raven,,2019-05-27 11:49:06+00,0.0,2.0,1.0,Another Test tweet that wasn't caught in the s...
3,1132977089089556481,DetroitCrypto,J. Scardina,,2019-05-27 11:49:22+00,0.0,0.0,0.0,Current Crypto Prices! \n\nBTC: $8721.99 USD\n...
4,1132977092340191232,mmursaleen72,Muhammad Mursaleen,,2019-05-27 11:49:23+00,0.0,0.0,0.0,Spiv (Nosar Baz): BITCOIN Is An Asset &amp; NO...


In [15]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
from tqdm import tqdm



In [7]:
# Define a function to preprocess the text by removing hashtags, non-English characters, and URLs
def preprocess_text(text):
    if isinstance(text, str):  # Check if 'text' is a string
        # Remove URLs
        text = re.sub(r'http\S+', '', text)
        text = re.sub(r'www\S+', '', text)
        # Remove hashtags
        text = re.sub(r'#\w+', '', text)
        # Remove non-English alphabet characters and digits
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        # Remove extra whitespace
        text = ' '.join(text.split())
    else:
        # Handle missing values (e.g., set them to an empty string)
        text = ''
    return text

In [8]:
# Apply the preprocessing function to the 'text' column
# Apply the preprocessing function to the 'text' column
df['text'] = df['text'].apply(preprocess_text)
df.head()

Unnamed: 0,id,user,fullname,url,timestamp,replies,likes,retweets,text
0,1132977055300300800,KamdemAbdiel,Abdiel kamdem,,2019-05-27 11:49:14+00,0.0,0.0,0.0,appena uscito un nuovo video LES CRYPTOMONNAIE...
1,1132977073402736640,bitcointe,Bitcointe,,2019-05-27 11:49:18+00,0.0,0.0,0.0,Cardano Digitize Currencies EOS ROI ATampT Bit...
2,1132977023893139456,3eyedbran,Bran - 3 Eyed Raven,,2019-05-27 11:49:06+00,0.0,2.0,1.0,Another Test tweet that wasnt caught in the st...
3,1132977089089556481,DetroitCrypto,J. Scardina,,2019-05-27 11:49:22+00,0.0,0.0,0.0,Current Crypto Prices BTC USD ETH USD LTC USD ...
4,1132977092340191232,mmursaleen72,Muhammad Mursaleen,,2019-05-27 11:49:23+00,0.0,0.0,0.0,Spiv Nosar Baz BITCOIN Is An Asset amp NOT A C...


In [9]:
# Initialize the VADER sentiment analyzer
analyzer = SentimentIntensityAnalyzer()

In [10]:
# Create a function to apply VADER sentiment analysis and add sentiment scores to the DataFrame
def get_sentiment(text):
    sentiment = analyzer.polarity_scores(text)
    return sentiment


In [11]:
# Apply the sentiment analysis function to the preprocessed 'text' column
df['sentiment'] = df['text'].apply(get_sentiment)

df.head()

Unnamed: 0,id,user,fullname,url,timestamp,replies,likes,retweets,text,sentiment
0,1132977055300300800,KamdemAbdiel,Abdiel kamdem,,2019-05-27 11:49:14+00,0.0,0.0,0.0,appena uscito un nuovo video LES CRYPTOMONNAIE...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
1,1132977073402736640,bitcointe,Bitcointe,,2019-05-27 11:49:18+00,0.0,0.0,0.0,Cardano Digitize Currencies EOS ROI ATampT Bit...,"{'neg': 0.123, 'neu': 0.877, 'pos': 0.0, 'comp..."
2,1132977023893139456,3eyedbran,Bran - 3 Eyed Raven,,2019-05-27 11:49:06+00,0.0,2.0,1.0,Another Test tweet that wasnt caught in the st...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
3,1132977089089556481,DetroitCrypto,J. Scardina,,2019-05-27 11:49:22+00,0.0,0.0,0.0,Current Crypto Prices BTC USD ETH USD LTC USD ...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
4,1132977092340191232,mmursaleen72,Muhammad Mursaleen,,2019-05-27 11:49:23+00,0.0,0.0,0.0,Spiv Nosar Baz BITCOIN Is An Asset amp NOT A C...,"{'neg': 0.0, 'neu': 0.783, 'pos': 0.217, 'comp..."


In [13]:
# Extract compound sentiment scores
df['compound_sentiment'] = df['sentiment'].apply(lambda x: x['compound'])
df.head()

Unnamed: 0,id,user,fullname,url,timestamp,replies,likes,retweets,text,sentiment,compound_sentiment
0,1132977055300300800,KamdemAbdiel,Abdiel kamdem,,2019-05-27 11:49:14+00,0.0,0.0,0.0,appena uscito un nuovo video LES CRYPTOMONNAIE...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0
1,1132977073402736640,bitcointe,Bitcointe,,2019-05-27 11:49:18+00,0.0,0.0,0.0,Cardano Digitize Currencies EOS ROI ATampT Bit...,"{'neg': 0.123, 'neu': 0.877, 'pos': 0.0, 'comp...",-0.1027
2,1132977023893139456,3eyedbran,Bran - 3 Eyed Raven,,2019-05-27 11:49:06+00,0.0,2.0,1.0,Another Test tweet that wasnt caught in the st...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0
3,1132977089089556481,DetroitCrypto,J. Scardina,,2019-05-27 11:49:22+00,0.0,0.0,0.0,Current Crypto Prices BTC USD ETH USD LTC USD ...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0
4,1132977092340191232,mmursaleen72,Muhammad Mursaleen,,2019-05-27 11:49:23+00,0.0,0.0,0.0,Spiv Nosar Baz BITCOIN Is An Asset amp NOT A C...,"{'neg': 0.0, 'neu': 0.783, 'pos': 0.217, 'comp...",0.3612


In [22]:
# Initialize finBERT sentiment model
finbert_model_name = "ProsusAI/finbert"
max_length = 512  # You can adjust this value as needed
finbert_tokenizer = BertTokenizer.from_pretrained(finbert_model_name, max_length=max_length)
finbert_model = BertForSequenceClassification.from_pretrained(finbert_model_name)

Downloading (…)okenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

In [23]:
# Function to compute finBERT sentiment scores
def compute_finbert_sentiment(text):
    tokens = finbert_tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=max_length)
    
    with torch.no_grad():
        output = finbert_model(**tokens)
        logit = output.logits
    return logit

In [35]:
df['finbert_sentiment'][1] = df['text'].apply(compute_finbert_sentiment)
# Extract the sentiment score from finBERT results
df['finbert_score'] = df['finbert_sentiment'].apply(lambda x: x[0][0].item())
df.head()

AttributeError: 'str' object has no attribute 'apply'

In [None]:
# Print the DataFrame with sentiment scores
print(df[['text', 'compound_score', 'finbert_score']])