In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import os

# Fetch the NLTK data packages this notebook depends on
import nltk
for nltk_resource in ('stopwords', 'punkt_tab'):
    nltk.download(nltk_resource)

# Seed NumPy's global random generator so NumPy-based randomness is repeatable
np.random.seed(120)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saif\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\saif\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
# Define paths
data_dir = "../../data/sentiment_analysis/raw/"
data_path = os.path.join(data_dir, "sentiment_data.csv")

# Load the dataset
df = pd.read_csv(data_path)

# Preview the dataset
print("Dataset Preview:")
print(df.head())

# Check dataset information.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly: wrapping it in print() appends a stray "None" line.
print("\nDataset Info:")
df.info()


Dataset Preview:
                                                text  label
0  $BYND - JPMorgan reels in expectations on Beyo...      0
1  $CCL $RCL - Nomura points to bookings weakness...      0
2  $CX - Cemex cut at Credit Suisse, J.P. Morgan ...      0
3  $ESS: BTIG Research cuts to Neutral https://t....      0
4  $FNKO - Funko slides after Piper Jaffray PT cu...      0

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11931 entries, 0 to 11930
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    11931 non-null  object
 1   label   11931 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 186.6+ KB
None


In [3]:
# Define text cleaning function
_STOPWORD_CACHE = {}  # language name -> frozenset of stopwords, filled on first use


def _load_stopwords(language='english'):
    """Return the NLTK stopword list for `language` as a frozenset, loading it only once."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def clean_text(text):
    """Normalize one raw text into a space-separated string of kept tokens.

    Steps, in order: strip URLs (``http`` followed by any run of
    non-whitespace), drop every character that is not an ASCII letter or
    whitespace, lowercase, tokenize with ``word_tokenize``, and remove
    English stopwords.

    Args:
        text: Raw text. Non-string values (e.g. a missing value read as NaN)
            are treated as empty text instead of raising inside ``re.sub``.

    Returns:
        The surviving tokens joined by single spaces; ``''`` if nothing is left.
    """
    if not isinstance(text, str):
        return ''

    without_urls = re.sub(r'http\S+', '', text)  # Remove URLs
    letters_only = re.sub(r'[^a-zA-Z\s]', '', without_urls)  # Remove non-alphabetic characters
    tokens = word_tokenize(letters_only.lower())  # Lowercase, then tokenize

    # The stopword list is loaded once and kept as a set: rebuilding it for
    # every token (as a list) made each membership test a fresh load + scan.
    stop_words = _load_stopwords()

    # NOTE(review): punctuation is stripped before this check, so a contraction
    # such as "don't" reaches it as "dont" — confirm whether that should still
    # count as a stopword.
    return ' '.join(token for token in tokens if token not in stop_words)

# Run the cleaner over every raw text and keep the result in a new column
cleaned_column = df['text'].map(clean_text)
df['cleaned_text'] = cleaned_column

# Show raw and cleaned text side by side for the first rows
preview_columns = ['text', 'cleaned_text']
print("\nCleaned Text Example:")
print(df[preview_columns].head())



Cleaned Text Example:
                                                text  \
0  $BYND - JPMorgan reels in expectations on Beyo...   
1  $CCL $RCL - Nomura points to bookings weakness...   
2  $CX - Cemex cut at Credit Suisse, J.P. Morgan ...   
3  $ESS: BTIG Research cuts to Neutral https://t....   
4  $FNKO - Funko slides after Piper Jaffray PT cu...   

                                        cleaned_text  
0       bynd jpmorgan reels expectations beyond meat  
1  ccl rcl nomura points bookings weakness carniv...  
2  cx cemex cut credit suisse jp morgan weak buil...  
3                     ess btig research cuts neutral  
4             fnko funko slides piper jaffray pt cut  


In [8]:
# Initialize tokenizer
max_words = 10000  # Vocabulary size
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
# NOTE(review): the tokenizer is fitted on the full dataset, i.e. before the
# train/validation/test split made later in this notebook. Fit on the training
# portion only if held-out text must not influence the word index.
tokenizer.fit_on_texts(df['cleaned_text'])

# Convert text to sequences
df['text_sequences'] = tokenizer.texts_to_sequences(df['cleaned_text'])

# Save tokenizer for future use
tokenizer_path = os.path.join("../../data/sentiment_analysis/processed/", "tokenizer.json")
os.makedirs(os.path.dirname(tokenizer_path), exist_ok=True)
# Pin the encoding: without it open() falls back to the platform's locale
# encoding, so the saved file could differ from one machine to another.
with open(tokenizer_path, 'w', encoding='utf-8') as f:
    f.write(tokenizer.to_json())

print("\nTokenizer saved successfully!")



Tokenizer saved successfully!


In [5]:
# Target length for every padded sequence
max_length = 100

# Pad at the end of each sequence, then store one row array per DataFrame cell
padded_matrix = pad_sequences(df['text_sequences'], maxlen=max_length, padding='post')
df['padded_sequences'] = list(padded_matrix)

# Eyeball a few rows to confirm the padding
print("\nPadded Sequence Example:")
print(df[['cleaned_text', 'padded_sequences']].head())



Padded Sequence Example:
                                        cleaned_text  \
0       bynd jpmorgan reels expectations beyond meat   
1  ccl rcl nomura points bookings weakness carniv...   
2  cx cemex cut credit suisse jp morgan weak buil...   
3                     ess btig research cuts neutral   
4             fnko funko slides piper jaffray pt cut   

                                    padded_sequences  
0  [2842, 717, 5568, 257, 686, 981, 0, 0, 0, 0, 0...  
1  [4189, 4190, 1954, 117, 5569, 1119, 2843, 835,...  
2  [8304, 8305, 68, 84, 517, 982, 270, 562, 1047,...  
3  [5570, 1765, 503, 103, 464, 0, 0, 0, 0, 0, 0, ...  
4  [8306, 2180, 1197, 1379, 2181, 1499, 68, 0, 0,...  


In [6]:
# Features (X): the per-row padded arrays gathered into one array; labels (y): the label column
X = np.array(df['padded_sequences'].tolist())
y = df['label'].values

# Stratified 80% / 10% / 10% split: hold out 20% first, then halve that hold-out
split_seed = 42
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=split_seed
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=split_seed
)

# Report how many samples landed in each split
print(f"\nTraining set: {X_train.shape[0]} samples")
print(f"Validation set: {X_val.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")



Training set: 9544 samples
Validation set: 1193 samples
Test set: 1194 samples


In [7]:
# Make sure the output directory for the splits exists
processed_dir = "../../data/sentiment_analysis/processed/"
os.makedirs(processed_dir, exist_ok=True)

# Write each split to its own .npy file (same file names and order as before)
split_files = {
    "X_train.npy": X_train,
    "y_train.npy": y_train,
    "X_val.npy": X_val,
    "y_val.npy": y_val,
    "X_test.npy": X_test,
    "y_test.npy": y_test,
}
for file_name, split_array in split_files.items():
    np.save(os.path.join(processed_dir, file_name), split_array)

print("\nProcessed data saved successfully!")



Processed data saved successfully!


### Summary of Data Preparation

1. **Loaded and Cleaned Data:**
   - The dataset contains 11,931 samples with three sentiment labels: Bearish, Bullish, and Neutral.
   - Text was cleaned by removing URLs and non-alphabetic characters, lowercasing, and removing English stopwords.

2. **Tokenized and Padded Text:**
   - Text was tokenized using a vocabulary of 10,000 words.
   - Padded sequences were created with a fixed length of 100 tokens.

3. **Split Data:**
   - Training set: 9,544 samples
   - Validation set: 1,193 samples
   - Test set: 1,194 samples

4. **Saved Processed Data:**
   - Tokenizer and datasets were saved for future use.

---

### Next Steps
- Proceed to `3_model_training.ipynb` to build and train neural network models for sentiment classification.

