In [1]:
import pandas as pd
import nltk
import demoji
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import unicodedata as uni
import re

from tqdm import tqdm
tqdm.pandas()

In [2]:
sentfin = pd.read_csv('dataset/SEntFiN-v1.1.csv', encoding="latin-1")
sentfin.head()

Unnamed: 0,S No.,Title,Decisions,Words
0,1,SpiceJet to issue 6.4 crore warrants to promoters,"{""SpiceJet"": ""neutral""}",8
1,2,MMTC Q2 net loss at Rs 10.4 crore,"{""MMTC"": ""neutral""}",8
2,3,"Mid-cap funds can deliver more, stay put: Experts","{""Mid-cap funds"": ""positive""}",8
3,4,Mid caps now turn into market darlings,"{""Mid caps"": ""positive""}",7
4,5,"Market seeing patience, if not conviction: Pra...","{""Market"": ""neutral""}",8


In [3]:
candi = pd.read_excel('dataset/reviews_borobudur_prambanan_TripAdvisor_GMaps_all_tesis.xlsx')
candi_clean = candi.dropna()
# print("\nDrop rows with any NaN values:")
candi_clean.head()

Unnamed: 0,id,lokasi,text,daya_tarik,amenitas,aksesibilitas,citra,harga,sdm
0,1.0,Candi Borobudur,peninggalan sejarah yang sudah berumur 1200 ta...,1,-,-,1,0,0
1,2.0,Candi Borobudur,Pertama kali bepergian selama masa pandemi. Ca...,-,1,-,1,-,1
2,3.0,Candi Borobudur,"Candi Borobudur di Magelang, Yogyakarta adalah...",1,-,-,1,-,-
3,4.0,Candi Borobudur,"Baru pertama kali kesini, pas sih kalau tempat...",1,-,-,-,-1,-
4,5.0,Candi Borobudur,"candi borobudur, tempat wisata ini sudah terke...",1,1,-,1,-,-


In [4]:
def preprocess(text):
    # Text normalization
    contractions = {
        "isn't": "is not",
        "aren't": "are not",
        "wasn't": "was not",
        "weren't": "were not",
        "haven't": "have not",
        "hasn't": "has not",
        "hadn't": "had not",
        "won't": "will not",
        "wouldn't": "would not",
        "don't": "do not",
        "doesn't": "does not",
        "didn't": "did not",
        "can't": "can not",
        "couldn't": "could not",
        "shouldn't": "should not",
        "mightn't": "might not",
        "mustn't": "must not",
        "I'm": "I am",
        "You're": "you are",
        "I've": "I have",
        "UI": "user interface",
        "UX": "user experience",
        "u": "you",
    }
    pattern = re.compile(r'\b(' + '|'.join(contractions.keys()) + r')\b')
    expanded_text = pattern.sub(lambda match: contractions[match.group(0)], text)

    normalized_text = uni.normalize('NFKD', expanded_text)
    normalized_text = ''.join([c for c in normalized_text if not uni.combining(c)])

    # emoji encoding
    emojis = demoji.findall(text)

    for emoji in emojis:
        text = text.replace(emoji, " " + emojis[emoji].split(":")[0])

    # text preprocessing
    teks = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    teks = teks.lower()
    stop_words = set(stopwords.words('english'))
    stop_words.update(['also', 'app', 'apps', 'application', 'applications', 'good'])
    stop_words.remove('not')
    tokens = word_tokenize(teks)
    filtered_tokens = [word for word in tokens if word.isalnum() and not any(char.isdigit() for char in word) and word not in stop_words]
    lemmatizer = nltk.WordNetLemmatizer()
    lemma = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    clean_reviews = ' '.join(lemma)

    return clean_reviews

In [5]:
pre_sentfin = sentfin.Title.progress_apply(preprocess)

100%|██████████| 10753/10753 [00:06<00:00, 1633.27it/s]


In [6]:
pre_sentfin.head()

0                spicejet issue crore warrant promoter
1                                mmtc net loss r crore
2                  midcap fund deliver stay put expert
3                          mid cap turn market darling
4    market seeing patience not conviction prakash ...
Name: Title, dtype: object

In [7]:
import ast
# Mengonversi string JSON ke dictionary
sentfin['Decisions'] = sentfin['Decisions'].apply(ast.literal_eval)
# Mengonversi label sentimen ke bentuk yang bisa diproses
def convert_labels(decisions):
    for key, value in decisions.items():
        if value == 'positive':
            return 2
        elif value == 'neutral':
            return 1
        elif value == 'negative':
            return 0

sentfin['Label'] = sentfin['Decisions'].apply(convert_labels)
sentfin['Text'] = sentfin['Title']
sentfin = sentfin[['Text', 'Label']]

In [8]:
sentfin

Unnamed: 0,Text,Label
0,SpiceJet to issue 6.4 crore warrants to promoters,1
1,MMTC Q2 net loss at Rs 10.4 crore,1
2,"Mid-cap funds can deliver more, stay put: Experts",2
3,Mid caps now turn into market darlings,2
4,"Market seeing patience, if not conviction: Pra...",1
...,...,...
10748,"Negative on Chambal, Advanta: Mitesh Thacker",0
10749,"Small, Mid-cap stocks may emerge outperformers",2
10750,Rupee slips against US dollar,0
10751,Rupee weak against US dollar,0


In [9]:
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from collections import Counter
import spacy


In [10]:

nlp = spacy.load("en_core_web_sm")

class TextDataset(Dataset):
    def __init__(self, texts, labels, vocab=None):
        self.texts = texts
        self.labels = labels
        self.tokenizer = nlp
        if vocab is None:
            self.vocab = self.build_vocab(self.texts)
        else:
            self.vocab = vocab

    def build_vocab(self, texts):
        tokens = [token.text for text in texts for token in self.tokenizer(text)]
        counter = Counter(tokens)
        vocab = {word: idx for idx, (word, _) in enumerate(counter.items(), 1)}
        vocab["<PAD>"] = 0
        return vocab

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        tokens = [self.vocab[token.text] for token in self.tokenizer(text)]
        return torch.tensor(tokens), torch.tensor(label)

# Split data
train_texts, test_texts, train_labels, test_labels = train_test_split(sentfin['Text'], sentfin['Label'], test_size=0.2)

# Membuat dataset dan dataloader
train_dataset = TextDataset(train_texts, train_labels)
test_dataset = TextDataset(test_texts, test_labels, vocab=train_dataset.vocab)
train_loader = DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=lambda x: x)
test_loader = DataLoader(test_dataset, batch_size=2, shuffle=False, collate_fn=lambda x: x)


In [15]:
import torch
import torch.nn as nn

class CNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        
        # Inisialisasi layer konvolusi dengan filter_sizes yang diberikan
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=n_filters, 
                      kernel_size=(fs, embed_dim)) 
            for fs in filter_sizes
        ])
        
        # Layer linear untuk output
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        # Dropout layer
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        # text shape: (batch_size, seq_length)
        
        # Embedding lookup
        embedded = self.embedding(text)  # shape: (batch_size, seq_length, embed_dim)
        # Add one channel to match expected input shape for Conv2d
        embedded = embedded.unsqueeze(1)  # shape: (batch_size, 1, seq_length, embed_dim)
        # Apply convolutional and pooling layers
        conved = [nn.functional.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        
        # Max pooling over time
        pooled = [nn.functional.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        # Concatenate pooled layers (channels)
        cat = self.dropout(torch.cat(pooled, dim=1))  # shape: (batch_size, len(filter_sizes) * n_filters)
        
        # Fully connected layer for classification
        logits = self.fc(cat)  # shape: (batch_size, output_dim)
        
        return logits


In [12]:
import torch.nn as nn

class CNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1, out_channels=n_filters, kernel_size=(fs, embed_dim)) 
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        text = text.permute(1, 0)
        embedded = self.embedding(text)
        embedded = embedded.unsqueeze(1)
        conved = [nn.functional.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        pooled = [nn.functional.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.fc(cat)

In [16]:

# Hyperparameters
VOCAB_SIZE = len(train_dataset.vocab)
EMBED_DIM = 100
N_FILTERS = 100
FILTER_SIZES = [2, 3, 4]
OUTPUT_DIM = 3  # 3 kelas: positive, neutral, negative
DROPOUT = 0.5

model = CNN(VOCAB_SIZE, EMBED_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)

In [None]:
import torch.optim as optim

optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()

# Menggunakan GPU jika tersedia
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = criterion.to(device)

# Fungsi untuk menghitung akurasi
def binary_accuracy(preds, y):
    _, predicted = torch.max(preds, 1)
    correct = (predicted == y).float()
    return correct.sum() / len(correct)

# Fungsi untuk pad sequence dalam batch
def pad_collate(batch):
    texts, labels = zip(*batch)
    lengths = torch.tensor([len(text) for text in texts])
    texts = torch.nn.utils.rnn.pad_sequence(texts, padding_value=0, batch_first=True)
    labels = torch.tensor(labels)
    return texts, labels, lengths

# Loop Training
N_EPOCHS = 10

for epoch in range(N_EPOCHS):
    epoch_loss = 0
    epoch_acc = 0
    model.train()
    
    for batch in DataLoader(train_dataset, batch_size=2, shuffle=True, collate_fn=pad_collate):
        texts, labels, lengths = batch
        texts, labels = texts.to(device), labels.to(device)
        
        optimizer.zero_grad()
        predictions = model(texts).squeeze(1)
        loss = criterion(predictions, labels.long())
        acc = binary_accuracy(predictions, labels.long())
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    
    print(f'Epoch: {epoch+1}, Loss: {epoch_loss/len(train_loader)}, Acc: {epoch_acc/len(train_loader)}')

In [18]:
model.eval()
test_loss = 0
test_acc = 0

with torch.no_grad():
    for batch in DataLoader(test_dataset, batch_size=2, collate_fn=pad_collate):
        texts, labels, lengths = batch
        texts, labels = texts.to(device), labels.to(device)
        
        predictions = model(texts).squeeze(1)
        loss = criterion(predictions, labels.long())
        acc = binary_accuracy(predictions, labels.long())
        
        test_loss += loss.item()
        test_acc += acc.item()
        
print(f'Test Loss: {test_loss/len(test_loader)}, Test Acc: {test_acc/len(test_loader)}')

KeyError: '10.4'