In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import ast
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

### Prepare dataset

In [2]:
df=pd.read_csv('dataset/SEntFiN-v1.1.csv')
df.head()

Unnamed: 0,S No.,Title,Decisions,Words
0,1,SpiceJet to issue 6.4 crore warrants to promoters,"{""SpiceJet"": ""neutral""}",8
1,2,MMTC Q2 net loss at Rs 10.4 crore,"{""MMTC"": ""neutral""}",8
2,3,"Mid-cap funds can deliver more, stay put: Experts","{""Mid-cap funds"": ""positive""}",8
3,4,Mid caps now turn into market darlings,"{""Mid caps"": ""positive""}",7
4,5,"Market seeing patience, if not conviction: Pra...","{""Market"": ""neutral""}",8


In [3]:
# Convert 'Decisions' column from strings to dictionaries
df['Decisions'] = df['Decisions'].apply(ast.literal_eval)
# Normalize the 'Decisions' column
decisions_df = df['Decisions'].apply(pd.Series)
decisions_df.head()

Unnamed: 0,SpiceJet,MMTC,Mid-cap funds,Mid caps,Market,Infosys,Hudco,HOEC,Gold,Silver,...,pharmaceutical stocks,WhatsApp,BBM,FB,Indian cyclicals,Metro,"Masoor, gram",Wincor,masoor,Chambal
0,neutral,,,,,,,,,,...,,,,,,,,,,
1,,neutral,,,,,,,,,...,,,,,,,,,,
2,,,positive,,,,,,,,...,,,,,,,,,,
3,,,,positive,,,,,,,...,,,,,,,,,,
4,,,,,neutral,,,,,,...,,,,,,,,,,


In [4]:
# Concatenate the original DataFrame with the normalized decisions
df_expanded = pd.concat([df.drop(columns='Decisions'), decisions_df], axis=1)
df_expanded.head()

Unnamed: 0,S No.,Title,Words,SpiceJet,MMTC,Mid-cap funds,Mid caps,Market,Infosys,Hudco,...,pharmaceutical stocks,WhatsApp,BBM,FB,Indian cyclicals,Metro,"Masoor, gram",Wincor,masoor,Chambal
0,1,SpiceJet to issue 6.4 crore warrants to promoters,8,neutral,,,,,,,...,,,,,,,,,,
1,2,MMTC Q2 net loss at Rs 10.4 crore,8,,neutral,,,,,,...,,,,,,,,,,
2,3,"Mid-cap funds can deliver more, stay put: Experts",8,,,positive,,,,,...,,,,,,,,,,
3,4,Mid caps now turn into market darlings,7,,,,positive,,,,...,,,,,,,,,,
4,5,"Market seeing patience, if not conviction: Pra...",8,,,,,neutral,,,...,,,,,,,,,,


In [5]:
# Melt the expanded DataFrame to long format
df_long = df_expanded.melt(
    id_vars=["S No.", "Title", "Words"],
    var_name="Aspect",
    value_name="Sentiment"
).dropna(subset=["Sentiment"])
df_long.head()

Unnamed: 0,S No.,Title,Words,Aspect,Sentiment
0,1,SpiceJet to issue 6.4 crore warrants to promoters,8,SpiceJet,neutral
82,83,SpiceJet makes top-level changes,4,SpiceJet,neutral
1509,1510,Maran makes open offer for SpiceJet,6,SpiceJet,positive
1511,1512,"SpiceJet is a buy: CK Narayan, Sharyans Resources",8,SpiceJet,positive
1513,1514,SpiceJet: Regional focus can help airline soar...,8,SpiceJet,positive


In [6]:
# Select relevant columns for classification
df_absa = df_long[["Title", "Aspect", "Sentiment"]]
df_absa.head()

Unnamed: 0,Title,Aspect,Sentiment
0,SpiceJet to issue 6.4 crore warrants to promoters,SpiceJet,neutral
82,SpiceJet makes top-level changes,SpiceJet,neutral
1509,Maran makes open offer for SpiceJet,SpiceJet,positive
1511,"SpiceJet is a buy: CK Narayan, Sharyans Resources",SpiceJet,positive
1513,SpiceJet: Regional focus can help airline soar...,SpiceJet,positive


### CNN Classifier for sentiment analysis

In [7]:
# Menggabungkan Title dan Aspect sebagai fitur teks
df_absa['Text'] = df_absa['Title']

# Encode labels
label_encoder = LabelEncoder()
df_absa['Label'] = label_encoder.fit_transform(df_absa['Sentiment'])
df_absa

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_absa['Text'] = df_absa['Title']


Unnamed: 0,Title,Aspect,Sentiment,Text,Label
0,SpiceJet to issue 6.4 crore warrants to promoters,SpiceJet,neutral,SpiceJet to issue 6.4 crore warrants to promoters,1
82,SpiceJet makes top-level changes,SpiceJet,neutral,SpiceJet makes top-level changes,1
1509,Maran makes open offer for SpiceJet,SpiceJet,positive,Maran makes open offer for SpiceJet,2
1511,"SpiceJet is a buy: CK Narayan, Sharyans Resources",SpiceJet,positive,"SpiceJet is a buy: CK Narayan, Sharyans Resources",2
1513,SpiceJet: Regional focus can help airline soar...,SpiceJet,positive,SpiceJet: Regional focus can help airline soar...,2
...,...,...,...,...,...
45926038,"German shares outperform on Metro, Deutsche Bank",Metro,positive,"German shares outperform on Metro, Deutsche Bank",2
45936795,"Masoor, gram prices weaken on sluggish demand","Masoor, gram",negative,"Masoor, gram prices weaken on sluggish demand",0
45947561,European shares dip early; Wincor sinks,Wincor,negative,European shares dip early; Wincor sinks,0
45958315,"Gram, masoor decline on subdued demand",masoor,negative,"Gram, masoor decline on subdued demand",0


In [8]:
X = df_absa['Text']
y = df_absa['Label']
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
# Dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels, max_len=50):
        self.texts = texts
        self.labels = labels
        self.max_len = max_len
        self.vocab = self.build_vocab()

    def build_vocab(self):
        vocab = set()
        for text in self.texts:
            for word in text.split():
                vocab.add(word)
        word_to_idx = {word: idx + 1 for idx, word in enumerate(vocab)}  # +1 to reserve 0 for padding
        word_to_idx['<PAD>'] = 0
        return word_to_idx

    def text_to_sequence(self, text):
        sequence = [self.vocab.get(word, 0) for word in text.split()]
        if len(sequence) < self.max_len:
            sequence.extend([0] * (self.max_len - len(sequence)))
        else:
            sequence = sequence[:self.max_len]
        return sequence

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        sequence = self.text_to_sequence(text)
        return torch.tensor(sequence), torch.tensor(label, dtype=torch.long)

In [10]:
# Create DataLoaders
train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

In [11]:
# Model
class CNNModel(nn.Module):
    def __init__(self, vocab_size, embed_size, num_classes):
        super(CNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size, padding_idx=0)
        self.conv1 = nn.Conv2d(1, 100, (3, embed_size))
        self.pool = nn.MaxPool2d((2, 1))
        self.fc1 = nn.Linear(100 * ((50 - 3 + 1) // 2), num_classes)

    def forward(self, x):
        x = self.embedding(x).unsqueeze(1)
        x = torch.relu(self.conv1(x)).squeeze(3)
        x = self.pool(x).squeeze(2)
        x = x.view(x.size(0), -1)
        x = self.fc1(x)
        return x

In [12]:
# Hyperparameters
vocab_size = len(train_dataset.vocab) + 1
embed_size = 50
num_classes = len(label_encoder.classes_)
learning_rate = 0.001
num_epochs = 100

In [13]:
# run with cuda 12
# pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [14]:
torch.cuda.is_available()

True

In [15]:
# Initialize model, loss function, and optimizer
model = CNNModel(vocab_size, embed_size, num_classes).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


  from .autonotebook import tqdm as notebook_tqdm


In [16]:
# Training loop
for epoch in range(num_epochs):
    model.train()
    for texts, labels in train_loader:
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

  return F.conv2d(input, weight, bias, self.stride,


Epoch [1/100], Loss: 1.9405
Epoch [2/100], Loss: 0.1224
Epoch [3/100], Loss: 0.9807
Epoch [4/100], Loss: 0.3584
Epoch [5/100], Loss: 0.5841
Epoch [6/100], Loss: 0.1488
Epoch [7/100], Loss: 0.0123
Epoch [8/100], Loss: 0.1180
Epoch [9/100], Loss: 1.2668
Epoch [10/100], Loss: 0.4084
Epoch [11/100], Loss: 0.9629
Epoch [12/100], Loss: 1.0413
Epoch [13/100], Loss: 0.0017
Epoch [14/100], Loss: 1.2036
Epoch [15/100], Loss: 0.0250
Epoch [16/100], Loss: 0.0097
Epoch [17/100], Loss: 0.0035
Epoch [18/100], Loss: 0.5586
Epoch [19/100], Loss: 0.4870
Epoch [20/100], Loss: 0.0019
Epoch [21/100], Loss: 0.1664
Epoch [22/100], Loss: 0.0519
Epoch [23/100], Loss: 0.0263
Epoch [24/100], Loss: 0.2113
Epoch [25/100], Loss: 0.1082
Epoch [26/100], Loss: 0.1771
Epoch [27/100], Loss: 0.1520
Epoch [28/100], Loss: 0.1043
Epoch [29/100], Loss: 0.2006
Epoch [30/100], Loss: 0.0422
Epoch [31/100], Loss: 0.0108
Epoch [32/100], Loss: 1.1933
Epoch [33/100], Loss: 0.0480
Epoch [34/100], Loss: 0.0021
Epoch [35/100], Loss: 0

In [17]:
# Evaluation
model.eval()
all_preds = []
all_labels = []

with torch.no_grad():
    for texts, labels in test_loader:
        texts, labels = texts.to(device), labels.to(device)
        outputs = model(texts)
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(all_labels, all_preds)
report = classification_report(all_labels, all_preds, target_names=label_encoder.classes_)

print(f'Accuracy: {accuracy:.4f}')
print(f'Classification Report:\n{report}')


Accuracy: 0.3133
Classification Report:
              precision    recall  f1-score   support

    negative       0.26      0.43      0.32       763
     neutral       0.36      0.32      0.34      1090
    positive       0.36      0.22      0.27      1029

    accuracy                           0.31      2882
   macro avg       0.32      0.32      0.31      2882
weighted avg       0.33      0.31      0.31      2882

