In [6]:
from sklearn.model_selection import train_test_split
from datasets import load_dataset
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Template for the experiment

## Use this as training data
ds = load_dataset("StephanAkkerman/financial-tweets-crypto")
df = pd.DataFrame(ds['train'])


In [11]:
# Removing invalid data

# Drop rows with missing sentiment
df = df[~df['sentiment'].isna()]

# merge the sentiments to either Neutral, Bullish, Bearish only
df['sentiment'] = df['sentiment'].str.split().str[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment'] = df['sentiment'].str.split().str[0]


In [12]:
assert df['sentiment'].isnull().sum() == 0

In [15]:
# Make sure the sentiment values are valid
# Define expected sentiment values
valid_sentiments = ['Neutral', 'Bullish', 'Bearish']

# Assert that all sentiments are valid
assert df['sentiment'].isin(valid_sentiments).all(), "Found unexpected sentiment values"

# Verify unique values (optional)
print("Unique sentiment values:", df['sentiment'].unique())

Unique sentiment values: ['Bullish' 'Neutral' 'Bearish']


## If using pandas dataframe

In [None]:
# Create indices for train-test split
indices = np.arange(len(df))
train_idx, test_idx = train_test_split(indices, test_size=0.2, random_state=42)

# Create train and test dataframes
df_train = df.iloc[train_idx].reset_index(drop=True)
df_test = df.iloc[test_idx].reset_index(drop=True)

# Verify the split
print(f"Training set shape: {df_train.shape}")
print(f"Testing set shape: {df_test.shape}")


Training set shape: (38953, 13)
Testing set shape: (9739, 13)


## If using huggingfacedataset

In [18]:
from datasets import Dataset

# Convert pandas DataFrames to HuggingFace Datasets
hf_train = Dataset.from_pandas(df_train)
hf_test = Dataset.from_pandas(df_test)

# Optional: Create a DatasetDict for easier handling
from datasets import DatasetDict
dataset_dict = DatasetDict({
    'train': hf_train,
    'test': hf_test
})

# Verify the complete dataset
print("\nComplete Dataset Dictionary:")
print(dataset_dict)


Complete Dataset Dictionary:
DatasetDict({
    train: Dataset({
        features: ['image_url', 'proxy_image_url', 'image_dimensions', 'thumbnail_url', 'proxy_thumbnail_url', 'thumbnail_dimensions', 'timestamp', 'description', 'url', 'embed_title', 'tweet_type', 'financial_info', 'sentiment'],
        num_rows: 38953
    })
    test: Dataset({
        features: ['image_url', 'proxy_image_url', 'image_dimensions', 'thumbnail_url', 'proxy_thumbnail_url', 'thumbnail_dimensions', 'timestamp', 'description', 'url', 'embed_title', 'tweet_type', 'financial_info', 'sentiment'],
        num_rows: 9739
    })
})


<div style="background-color: #ffffd4; padding: 10px; border-radius: 5px;">
<strong>⚠️ Important:</strong> All experiments should use the above df_train and df_test generated from the code above!
</div>