In [34]:
!pip install -q kaggle
!pip install wandb



In [38]:
wandb.login()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

##Import Libraries

In [40]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import nltk
import string
import re
import wandb
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras import layers, models

In [3]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download and unzip the dataset
try:
    # Check if the dataset file exists before downloading
    if not os.path.exists('/content/finanical-sentiment-analysis'):
        !kaggle datasets download -d veer1516/finanical-sentiment-analysis

    # Check if the dataset directory exists before unzipping
    if not os.path.exists('/content/dataset'):
        !mkdir /content/dataset

    !unzip -q '/content/finanical-sentiment-analysis.zip' -d '/content/dataset/'

    print("Dataset downloaded and unzipped successfully.")
except Exception as e:
    print("Error:", e)

Dataset URL: https://www.kaggle.com/datasets/veer1516/finanical-sentiment-analysis
License(s): Attribution-NonCommercial 4.0 International (CC BY-NC 4.0)
Downloading finanical-sentiment-analysis.zip to /content
 94% 73.0M/77.4M [00:00<00:00, 93.6MB/s]
100% 77.4M/77.4M [00:00<00:00, 87.1MB/s]
Dataset downloaded and unzipped successfully.


##Constants


In [4]:
BATCH_SIZE = 64

MAX_FEATURES = 10000
SEQ_LENGTH = 250
EMBEDDING_DIM = 300

DS_PATH = '/content/dataset/stock_data_verbose.csv'

##Data Preprocessing and Preparation

In [5]:
df = pd.read_csv(DS_PATH)
df.head()

Unnamed: 0,Text,Sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [6]:
label_encoder = LabelEncoder()
df['Sentiment'] = label_encoder.fit_transform(df['Sentiment'])

print(df['Sentiment'].unique())

[1 0]


In [17]:
df

Unnamed: 0,Text,Sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
51937,I'm going to have to disagree with the previou...,0
51938,No one expects the Star Trek movies to be high...,0
51939,Aptus Value Housing falls on weak operating pe...,0
51940,U.S. Withdraws Offer To Buy 6 Million Barrels ...,0


In [7]:
train_df, temp_df = train_test_split(df, test_size=0.2, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [8]:
print(f'Training set size: {len(train_df)}')
print(f'Validation set size: {len(val_df)}')
print(f'Test set size: {len(test_df)}')

Training set size: 41553
Validation set size: 5194
Test set size: 5195


In [9]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [10]:
def standardize(text):
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', ' ', text)

    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))

    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]

    text = ' '.join(tokens)
    return text

In [11]:
# Apply preprocessing to the text column in each split
train_df['Text'] = train_df['Text'].apply(standardize)
val_df['Text'] = val_df['Text'].apply(standardize)
test_df['Text'] = test_df['Text'].apply(standardize)

In [12]:
def df_to_tfds(dataframe, shuffle=False, batch_size=32):
    df = dataframe.copy()
    labels = df.pop('Sentiment')
    ds = tf.data.Dataset.from_tensor_slices((df['Text'].values, labels.values))
    if shuffle==True:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size=batch_size)
    return ds

In [13]:
train_ds = df_to_tfds(train_df, shuffle=True)
val_ds = df_to_tfds(val_df)
test_ds = df_to_tfds(test_df)

In [14]:
for a,b in train_ds.take(1):
    print(a)
    print(b)

tf.Tensor(
[b'ordinarily anthony mann made western big guy james stewart gary cooper henry fonda list cowboy star b film tackled something notably different quite bit success turned truly one kind western main character played victor mature trapper mountain man ordinarily romanticized film robert redford jeremiah johnson sort thing hero fact typical mountain man clean cut heroic figure hang real mountain men true mountain man vulgar crude animalistic central figure something see giving mature one better later role real acting chop provided robert preston excellent selfabsorbed custer type cavalry commander james whitmore poor man spencer tracy another old timer feel trapped ever hostile indian one side oncoming force civilization even impressive young anne bancroft officer wife initially repulsed sight matures grisly character find veneer civilization slipping away begin realize shock shes attracted rarely ever remote frontier fort accurately realized screen without romantic allure joh

In [17]:
vectorize_layer = TextVectorization(max_tokens=MAX_FEATURES,
                                    output_mode='int',
                                    output_sequence_length=SEQ_LENGTH)

In [18]:
# Adapt the vectorize layer to the training text data
train_text = train_df['Text'].values
vectorize_layer.adapt(train_text)

In [22]:
len(vectorize_layer.get_vocabulary())

10000

In [25]:
vectorize_layer.get_vocabulary()[555]

'message'

In [19]:
def vectorize_text(text, label):
    text = tf.expand_dims(text, -1)
    return vectorize_layer(text), label

In [20]:
train_ds = train_ds.map(vectorize_text)
val_ds = val_ds.map(vectorize_text)
test_ds = test_ds.map(vectorize_text)

In [26]:
for a,b in train_ds.take(1):
    print(a)
    print(b)

tf.Tensor(
[[ 196  193 1013 ...    0    0    0]
 [ 121 3060 2740 ...    0    0    0]
 [ 121    2  887 ...    0    0    0]
 ...
 [  40   27 1675 ...    0    0    0]
 [  15   15   22 ...    0    0    0]
 [ 550   71    1 ...    0    0    0]], shape=(32, 250), dtype=int64)
tf.Tensor([0 1 0 0 0 1 1 1 0 1 1 0 1 1 1 0 0 1 1 0 0 1 0 0 1 1 0 1 1 1 0 0], shape=(32,), dtype=int64)


In [30]:
train_ds = train_ds.prefetch(buffer_size=tf.data.AUTOTUNE)
val_ds = val_ds.prefetch(buffer_size=tf.data.AUTOTUNE)

###Track the data on wandb

####Initialize project

In [41]:
wandb.init(project='fin-sentiment-analysis', entity='petar-boskovic-ac')

[34m[1mwandb[0m: Currently logged in as: [33mpetar-boskovic-ac[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [43]:
# Log the initial dataset as a wandb artifact
artifact = wandb.Artifact('stock_data', type='dataset')
artifact.add_file('/content/dataset/stock_data_verbose.csv')
wandb.log_artifact(artifact)

<Artifact stock_data>

Track dataset transformations in wandb

In [44]:
# Log preprocessing steps as metadata
wandb.config.preprocessing = {'applied_after_split': True,
                              'lowercase': True,
                              'remove_stopwords': True,
                              'lemmatization': True,
                              'remove_punctuation': True,
                              'remove_html_tags': True}

In [45]:
# Save the transformed splits
train_csv_path = '/content/dataset/stock_data_train_transformed.csv'
val_csv_path = '/content/dataset/stock_data_val_transformed.csv'
test_csv_path = '/content/dataset/stock_data_test_transformed.csv'

train_df.to_csv(train_csv_path, index=False)
val_df.to_csv(val_csv_path, index=False)
test_df.to_csv(test_csv_path, index=False)

In [46]:
# Create separate W&B artifacts for each split
train_artifact = wandb.Artifact('stock_data_train_transformed', type='dataset')
val_artifact = wandb.Artifact('stock_data_val_transformed', type='dataset')
test_artifact = wandb.Artifact('stock_data_test_transformed', type='dataset')

In [47]:
# Add the transformed splits to the corresponding artifacts
train_artifact.add_file(train_csv_path)
val_artifact.add_file(val_csv_path)
test_artifact.add_file(test_csv_path)

# Log the artifacts
wandb.log_artifact(train_artifact)
wandb.log_artifact(val_artifact)
wandb.log_artifact(test_artifact)

<Artifact stock_data_test_transformed>

In [48]:
wandb.config.data_split = {'train_split': 0.8,
                           'validation_split': 0.1,
                           'test_split': 0.1}

In [49]:
wandb.finish()

VBox(children=(Label(value='102.897 MB of 102.897 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

####Reinitialize W&B

In [None]:
wandb.init(project="financial-sentiment-analysis", entity="petar-boskovic-ac")

In order to avoid duplicate artifacts, check first if the artifact exists

In [None]:
artifacts = ['stock_data_train_transformed',
             'stock_data_val_transformed',
             'stock_data_test_transformed']

In [None]:
try:
    train_artifact = wandb.use_artifact('stock_data_train_transformed:latest', type='dataset')
    val_artifact = wandb.use_artifact('stock_data_val_transformed:latest', type='dataset')
    test_artifact = wandb.use_artifact('stock_data_test_transformed:latest', type='dataset')
    print('Artifact already exists. Skipping upload.')
except wandb.errors.CommError:
    # Create and log the artifact if it does not exist
    train_artifact = wandb.Artifact('stock_data_train_transformed', type='dataset')
    val_artifact = wandb.Artifact('stock_data_val_transformed', type='dataset')
    test_artifact = wandb.Artifact('stock_data_test_transformed', type='dataset')

    # Add the transformed splits to the corresponding artifacts
    train_artifact.add_file(train_csv_path)
    val_artifact.add_file(val_csv_path)
    test_artifact.add_file(test_csv_path)

    # Log the artifacts
    wandb.log_artifact(train_artifact)
    wandb.log_artifact(val_artifact)
    wandb.log_artifact(test_artifact)

Download the artifacts

In [None]:
train_artifact = wandb.use_artifact('stock_data_train_transformed:latest', type='dataset')
val_artifact = wandb.use_artifact('stock_data_val_transformed:latest', type='dataset')
test_artifact = wandb.use_artifact('stock_data_test_transformed:latest', type='dataset')

train_dir = train_artifact.download()
val_dir = val_artifact.download()
test_dir = test_artifact.download()