In [4]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import ast
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

### Prepare dataset

In [5]:
df=pd.read_csv('dataset/SEntFiN-v1.1.csv')
df

Unnamed: 0,S No.,Title,Decisions,Words
0,1,SpiceJet to issue 6.4 crore warrants to promoters,"{""SpiceJet"": ""neutral""}",8
1,2,MMTC Q2 net loss at Rs 10.4 crore,"{""MMTC"": ""neutral""}",8
2,3,"Mid-cap funds can deliver more, stay put: Experts","{""Mid-cap funds"": ""positive""}",8
3,4,Mid caps now turn into market darlings,"{""Mid caps"": ""positive""}",7
4,5,"Market seeing patience, if not conviction: Pra...","{""Market"": ""neutral""}",8
...,...,...,...,...
10748,10749,"Negative on Chambal, Advanta: Mitesh Thacker","{""Chambal"": ""negative"", ""Advanta"": ""negative""}",6
10749,10750,"Small, Mid-cap stocks may emerge outperformers","{""Small"": ""positive"", ""Mid-cap stocks"": ""posit...",6
10750,10751,Rupee slips against US dollar,"{""Rupee"": ""negative"", ""US dollar"": ""neutral""}",5
10751,10752,Rupee weak against US dollar,"{""Rupee"": ""negative"", ""US dollar"": ""neutral""}",5


In [6]:
# Convert 'Decisions' column from strings to dictionaries
df['Decisions'] = df['Decisions'].apply(ast.literal_eval)
# Normalize the 'Decisions' column
decisions_df = df['Decisions'].apply(pd.Series)
decisions_df

Unnamed: 0,SpiceJet,MMTC,Mid-cap funds,Mid caps,Market,Infosys,Hudco,HOEC,Gold,Silver,...,pharmaceutical stocks,WhatsApp,BBM,FB,Indian cyclicals,Metro,"Masoor, gram",Wincor,masoor,Chambal
0,neutral,,,,,,,,,,...,,,,,,,,,,
1,,neutral,,,,,,,,,...,,,,,,,,,,
2,,,positive,,,,,,,,...,,,,,,,,,,
3,,,,positive,,,,,,,...,,,,,,,,,,
4,,,,,neutral,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10748,,,,,,,,,,,...,,,,,,,,,,negative
10749,,,,,,,,,,,...,,,,,,,,,,
10750,,,,,,,,,,,...,,,,,,,,,,
10751,,,,,,,,,,,...,,,,,,,,,,


In [7]:
# Concatenate the original DataFrame with the normalized decisions
df_expanded = pd.concat([df.drop(columns='Decisions'), decisions_df], axis=1)
df_expanded

Unnamed: 0,S No.,Title,Words,SpiceJet,MMTC,Mid-cap funds,Mid caps,Market,Infosys,Hudco,...,pharmaceutical stocks,WhatsApp,BBM,FB,Indian cyclicals,Metro,"Masoor, gram",Wincor,masoor,Chambal
0,1,SpiceJet to issue 6.4 crore warrants to promoters,8,neutral,,,,,,,...,,,,,,,,,,
1,2,MMTC Q2 net loss at Rs 10.4 crore,8,,neutral,,,,,,...,,,,,,,,,,
2,3,"Mid-cap funds can deliver more, stay put: Experts",8,,,positive,,,,,...,,,,,,,,,,
3,4,Mid caps now turn into market darlings,7,,,,positive,,,,...,,,,,,,,,,
4,5,"Market seeing patience, if not conviction: Pra...",8,,,,,neutral,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10748,10749,"Negative on Chambal, Advanta: Mitesh Thacker",6,,,,,,,,...,,,,,,,,,,negative
10749,10750,"Small, Mid-cap stocks may emerge outperformers",6,,,,,,,,...,,,,,,,,,,
10750,10751,Rupee slips against US dollar,5,,,,,,,,...,,,,,,,,,,
10751,10752,Rupee weak against US dollar,5,,,,,,,,...,,,,,,,,,,


In [8]:
# Melt the expanded DataFrame to long format
df_long = df_expanded.melt(
    id_vars=["S No.", "Title", "Words"],
    var_name="Aspect",
    value_name="Sentiment"
).dropna(subset=["Sentiment"])
df_long

Unnamed: 0,S No.,Title,Words,Aspect,Sentiment
0,1,SpiceJet to issue 6.4 crore warrants to promoters,8,SpiceJet,neutral
82,83,SpiceJet makes top-level changes,4,SpiceJet,neutral
1509,1510,Maran makes open offer for SpiceJet,6,SpiceJet,positive
1511,1512,"SpiceJet is a buy: CK Narayan, Sharyans Resources",8,SpiceJet,positive
1513,1514,SpiceJet: Regional focus can help airline soar...,8,SpiceJet,positive
...,...,...,...,...,...
45926038,10729,"German shares outperform on Metro, Deutsche Bank",7,Metro,positive
45936795,10733,"Masoor, gram prices weaken on sluggish demand",7,"Masoor, gram",negative
45947561,10746,European shares dip early; Wincor sinks,6,Wincor,negative
45958315,10747,"Gram, masoor decline on subdued demand",6,masoor,negative


In [9]:
# Select relevant columns for classification
df_absa = df_long[["Title", "Aspect", "Sentiment"]]
df_absa

Unnamed: 0,Title,Aspect,Sentiment
0,SpiceJet to issue 6.4 crore warrants to promoters,SpiceJet,neutral
82,SpiceJet makes top-level changes,SpiceJet,neutral
1509,Maran makes open offer for SpiceJet,SpiceJet,positive
1511,"SpiceJet is a buy: CK Narayan, Sharyans Resources",SpiceJet,positive
1513,SpiceJet: Regional focus can help airline soar...,SpiceJet,positive
...,...,...,...
45926038,"German shares outperform on Metro, Deutsche Bank",Metro,positive
45936795,"Masoor, gram prices weaken on sluggish demand","Masoor, gram",negative
45947561,European shares dip early; Wincor sinks,Wincor,negative
45958315,"Gram, masoor decline on subdued demand",masoor,negative


### CNN Classifier for sentiment analysis

In [10]:
# Menggabungkan Title dan Aspect sebagai fitur teks
df_absa['Text'] = df_absa['Title']

# Encode labels
label_encoder = LabelEncoder()
df_absa['Label'] = label_encoder.fit_transform(df_absa['Sentiment'])
df_absa

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_absa['Text'] = df_absa['Title']


Unnamed: 0,Title,Aspect,Sentiment,Text,Label
0,SpiceJet to issue 6.4 crore warrants to promoters,SpiceJet,neutral,SpiceJet to issue 6.4 crore warrants to promoters,1
82,SpiceJet makes top-level changes,SpiceJet,neutral,SpiceJet makes top-level changes,1
1509,Maran makes open offer for SpiceJet,SpiceJet,positive,Maran makes open offer for SpiceJet,2
1511,"SpiceJet is a buy: CK Narayan, Sharyans Resources",SpiceJet,positive,"SpiceJet is a buy: CK Narayan, Sharyans Resources",2
1513,SpiceJet: Regional focus can help airline soar...,SpiceJet,positive,SpiceJet: Regional focus can help airline soar...,2
...,...,...,...,...,...
45926038,"German shares outperform on Metro, Deutsche Bank",Metro,positive,"German shares outperform on Metro, Deutsche Bank",2
45936795,"Masoor, gram prices weaken on sluggish demand","Masoor, gram",negative,"Masoor, gram prices weaken on sluggish demand",0
45947561,European shares dip early; Wincor sinks,Wincor,negative,European shares dip early; Wincor sinks,0
45958315,"Gram, masoor decline on subdued demand",masoor,negative,"Gram, masoor decline on subdued demand",0


In [11]:
X = df_absa['Text']
y = df_absa['Label']
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels, max_len=50):
        self.texts = texts
        self.labels = labels
        self.max_len = max_len
        self.vocab = self.build_vocab()

    def build_vocab(self):
        vocab = set()
        for text in self.texts:
            for word in text.split():
                vocab.add(word)
        word_to_idx = {word: idx + 1 for idx, word in enumerate(vocab)}  # +1 to reserve 0 for padding
        word_to_idx['<PAD>'] = 0
        return word_to_idx

    def text_to_sequence(self, text):
        sequence = [self.vocab.get(word, 0) for word in text.split()]
        if len(sequence) < self.max_len:
            sequence.extend([0] * (self.max_len - len(sequence)))
        else:
            sequence = sequence[:self.max_len]
        return sequence

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        sequence = self.text_to_sequence(text)
        return torch.tensor(sequence), torch.tensor(label, dtype=torch.long)

In [13]:
# Create DataLoaders
train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

In [14]:
# Model
class CNNModel(nn.Module):
    def __init__(self, vocab_size, embed_size, num_classes):
        super(CNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.conv1 = nn.Conv2d(1, 100, (3, embed_size))
        self.pool = nn.MaxPool2d((2, 1))
        self.fc1 = nn.Linear(100 * ((50 - 3 + 1) // 2), num_classes)

    def forward(self, x):
        x = self.embedding(x).unsqueeze(1)
        x = torch.relu(self.conv1(x)).squeeze(3)
        x = self.pool(x).squeeze(2)
        x = x.view(x.size(0), -1)
        x = self.fc1(x)
        return x

In [25]:
# Hyperparameters
vocab_size = len(train_dataset.vocab) + 1
embed_size = 50
num_classes = len(label_encoder.classes_)
learning_rate = 0.001
num_epochs = 10

In [26]:
# run with cuda 12
# pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [27]:
torch.cuda.is_available()

True

In [28]:
# Initialize model, loss function, and optimizer
model = CNNModel(vocab_size, embed_size, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


In [29]:
# Training loop
for epoch in range(num_epochs):
    model.train()
    for texts, labels in train_loader:
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/10], Loss: 0.5455
Epoch [2/10], Loss: 2.4392
Epoch [3/10], Loss: 0.2870
Epoch [4/10], Loss: 0.0420
Epoch [5/10], Loss: 0.4002
Epoch [6/10], Loss: 0.0782
Epoch [7/10], Loss: 0.0145
Epoch [8/10], Loss: 0.0337
Epoch [9/10], Loss: 0.0580
Epoch [10/10], Loss: 0.0121


In [30]:
# Evaluation
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
report = classification_report(all_labels, all_preds, target_names=label_encoder.classes_)

print(f'Accuracy: {accuracy:.4f}')
print(f'Classification Report:\n{report}')


Accuracy: 0.3557
Classification Report:
              precision    recall  f1-score   support

    negative       0.28      0.14      0.19       763
     neutral       0.37      0.68      0.48      1090
    positive       0.36      0.17      0.23      1029

    accuracy                           0.36      2882
   macro avg       0.34      0.33      0.30      2882
weighted avg       0.34      0.36      0.31      2882

