In [47]:
import pandas as pd
import nltk
import demoji
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import unicodedata as uni
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from tqdm import tqdm
tqdm.pandas()

In [48]:
sentfin = pd.read_csv('dataset/SEntFiN-v1.1.csv', encoding="latin-1")
sentfin.head()

Unnamed: 0,S No.,Title,Decisions,Words
0,1,SpiceJet to issue 6.4 crore warrants to promoters,"{""SpiceJet"": ""neutral""}",8
1,2,MMTC Q2 net loss at Rs 10.4 crore,"{""MMTC"": ""neutral""}",8
2,3,"Mid-cap funds can deliver more, stay put: Experts","{""Mid-cap funds"": ""positive""}",8
3,4,Mid caps now turn into market darlings,"{""Mid caps"": ""positive""}",7
4,5,"Market seeing patience, if not conviction: Pra...","{""Market"": ""neutral""}",8


In [49]:
candi = pd.read_excel('dataset/reviews_borobudur_prambanan_TripAdvisor_GMaps_all_tesis.xlsx')
candi_clean = candi.dropna()
# print("\nDrop rows with any NaN values:")
candi_clean.head()

Unnamed: 0,id,lokasi,text,daya_tarik,amenitas,aksesibilitas,citra,harga,sdm
0,1.0,Candi Borobudur,peninggalan sejarah yang sudah berumur 1200 ta...,1,-,-,1,0,0
1,2.0,Candi Borobudur,Pertama kali bepergian selama masa pandemi. Ca...,-,1,-,1,-,1
2,3.0,Candi Borobudur,"Candi Borobudur di Magelang, Yogyakarta adalah...",1,-,-,1,-,-
3,4.0,Candi Borobudur,"Baru pertama kali kesini, pas sih kalau tempat...",1,-,-,-,-1,-
4,5.0,Candi Borobudur,"candi borobudur, tempat wisata ini sudah terke...",1,1,-,1,-,-


In [50]:
import ast
# Mengonversi string JSON ke dictionary
sentfin['Decisions'] = sentfin['Decisions'].apply(ast.literal_eval)
# Ekstrak teks dan label
sentfin['label'] = sentfin['Decisions'].apply(lambda x: list(x.values())[0])
sentfin['text'] = sentfin['Title']

# Encode label
le = LabelEncoder()
sentfin['label'] = le.fit_transform(sentfin['label'])


In [51]:
sentfin

Unnamed: 0,S No.,Title,Decisions,Words,label,text
0,1,SpiceJet to issue 6.4 crore warrants to promoters,{'SpiceJet': 'neutral'},8,1,SpiceJet to issue 6.4 crore warrants to promoters
1,2,MMTC Q2 net loss at Rs 10.4 crore,{'MMTC': 'neutral'},8,1,MMTC Q2 net loss at Rs 10.4 crore
2,3,"Mid-cap funds can deliver more, stay put: Experts",{'Mid-cap funds': 'positive'},8,2,"Mid-cap funds can deliver more, stay put: Experts"
3,4,Mid caps now turn into market darlings,{'Mid caps': 'positive'},7,2,Mid caps now turn into market darlings
4,5,"Market seeing patience, if not conviction: Pra...",{'Market': 'neutral'},8,1,"Market seeing patience, if not conviction: Pra..."
...,...,...,...,...,...,...
10748,10749,"Negative on Chambal, Advanta: Mitesh Thacker","{'Chambal': 'negative', 'Advanta': 'negative'}",6,0,"Negative on Chambal, Advanta: Mitesh Thacker"
10749,10750,"Small, Mid-cap stocks may emerge outperformers","{'Small': 'positive', 'Mid-cap stocks': 'posit...",6,2,"Small, Mid-cap stocks may emerge outperformers"
10750,10751,Rupee slips against US dollar,"{'Rupee': 'negative', 'US dollar': 'neutral'}",5,0,Rupee slips against US dollar
10751,10752,Rupee weak against US dollar,"{'Rupee': 'negative', 'US dollar': 'neutral'}",5,0,Rupee weak against US dollar


In [52]:
def preprocess(text):
    # Text normalization
    contractions = {
        "isn't": "is not",
        "aren't": "are not",
        "wasn't": "was not",
        "weren't": "were not",
        "haven't": "have not",
        "hasn't": "has not",
        "hadn't": "had not",
        "won't": "will not",
        "wouldn't": "would not",
        "don't": "do not",
        "doesn't": "does not",
        "didn't": "did not",
        "can't": "can not",
        "couldn't": "could not",
        "shouldn't": "should not",
        "mightn't": "might not",
        "mustn't": "must not",
        "I'm": "I am",
        "You're": "you are",
        "I've": "I have",
        "UI": "user interface",
        "UX": "user experience",
        "u": "you",
    }
    pattern = re.compile(r'\b(' + '|'.join(contractions.keys()) + r')\b')
    expanded_text = pattern.sub(lambda match: contractions[match.group(0)], text)

    normalized_text = uni.normalize('NFKD', expanded_text)
    normalized_text = ''.join([c for c in normalized_text if not uni.combining(c)])

    # emoji encoding
    emojis = demoji.findall(text)

    for emoji in emojis:
        text = text.replace(emoji, " " + emojis[emoji].split(":")[0])

    # text preprocessing
    teks = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    teks = teks.lower()
    stop_words = set(stopwords.words('english'))
    stop_words.update(['also', 'app', 'apps', 'application', 'applications', 'good'])
    stop_words.remove('not')
    tokens = word_tokenize(teks)
    filtered_tokens = [word for word in tokens if word.isalnum() and not any(char.isdigit() for char in word) and word not in stop_words]
    lemmatizer = nltk.WordNetLemmatizer()
    lemma = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    clean_reviews = ' '.join(lemma)

    return clean_reviews

In [53]:
sentfin.text = sentfin.Title.progress_apply(preprocess)

100%|██████████| 10753/10753 [00:05<00:00, 1923.37it/s]


In [54]:
sentfin.text.head()

0                spicejet issue crore warrant promoter
1                                mmtc net loss r crore
2                  midcap fund deliver stay put expert
3                          mid cap turn market darling
4    market seeing patience not conviction prakash ...
Name: text, dtype: object

In [55]:
sentfin

Unnamed: 0,S No.,Title,Decisions,Words,label,text
0,1,SpiceJet to issue 6.4 crore warrants to promoters,{'SpiceJet': 'neutral'},8,1,spicejet issue crore warrant promoter
1,2,MMTC Q2 net loss at Rs 10.4 crore,{'MMTC': 'neutral'},8,1,mmtc net loss r crore
2,3,"Mid-cap funds can deliver more, stay put: Experts",{'Mid-cap funds': 'positive'},8,2,midcap fund deliver stay put expert
3,4,Mid caps now turn into market darlings,{'Mid caps': 'positive'},7,2,mid cap turn market darling
4,5,"Market seeing patience, if not conviction: Pra...",{'Market': 'neutral'},8,1,market seeing patience not conviction prakash ...
...,...,...,...,...,...,...
10748,10749,"Negative on Chambal, Advanta: Mitesh Thacker","{'Chambal': 'negative', 'Advanta': 'negative'}",6,0,negative chambal advanta mitesh thacker
10749,10750,"Small, Mid-cap stocks may emerge outperformers","{'Small': 'positive', 'Mid-cap stocks': 'posit...",6,2,small midcap stock may emerge outperformers
10750,10751,Rupee slips against US dollar,"{'Rupee': 'negative', 'US dollar': 'neutral'}",5,0,rupee slip u dollar
10751,10752,Rupee weak against US dollar,"{'Rupee': 'negative', 'US dollar': 'neutral'}",5,0,rupee weak u dollar


In [62]:
# Contoh pendefinisian dataset untuk DataLoader
class MyDataset(Dataset):
    def __init__(self, data):
        self.data = data
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        title = self.data['Title'][idx]
        decision = self.data['label'][idx]
        # Lakukan proses tokenisasi dan encoding di sini
        
        return {
            'title': title,
            'decision': decision
        }

# Split dataset menjadi train dan validation
train_data, val_data = train_test_split(sentfin, test_size=0.2, random_state=42)

# Buat DataLoader untuk train dan validation
train_dataset = MyDataset(train_data)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

val_dataset = MyDataset(val_data)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

In [67]:
from torch.utils.data import DataLoader, Dataset
# Dataset class
class TextDataset(Dataset):
    def __init__(self, texts, labels, max_len=50):
        self.texts = texts
        self.labels = labels
        self.max_len = max_len
        self.vocab = self.build_vocab()

    def build_vocab(self):
        vocab = set()
        for text in self.texts:
            for word in text.split():
                vocab.add(word)
        word_to_idx = {word: idx + 1 for idx, word in enumerate(vocab)}  # +1 to reserve 0 for padding
        word_to_idx['<PAD>'] = 0
        return word_to_idx

    def text_to_sequence(self, text):
        sequence = [self.vocab.get(word, 0) for word in text.split()]
        if len(sequence) < self.max_len:
            sequence.extend([0] * (self.max_len - len(sequence)))
        else:
            sequence = sequence[:self.max_len]
        return sequence

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]
        sequence = self.text_to_sequence(text)
        return torch.tensor(sequence), torch.tensor(label, dtype=torch.long)

In [68]:
# Create DataLoaders
X = sentfin['text']
# X = tfidf.fit_transform(df_absa['Processed_Title']).toarray()
y = sentfin['label']
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

train_dataset = TextDataset(X_train, y_train)
test_dataset = TextDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=4, shuffle=False)

In [69]:
# Contoh definisi model RNN sederhana
class RNNModel(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNNModel, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        out, _ = self.rnn(x)
        out = self.fc(out[:, -1, :])  # Mengambil output dari step terakhir
        return out

In [70]:
input_size = 100  # Misalnya, ukuran vektor kata setelah tokenisasi
hidden_size = 128
output_size = 3  # Misalnya, 3 kelas sentimen (negatif, netral, positif)

model = RNNModel(input_size, hidden_size, output_size)

# Definisikan loss dan optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [73]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7b1e64e2f110>

In [72]:
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        inputs = batch['text']  # Ambil judul atau fitur yang relevan
        labels = batch['label']  # Ambil label sentimen yang telah diencode
        
        # Lakukan tokenisasi dan encoding di sini jika diperlukan
        
        # Contoh: Ubah input menjadi tensor PyTorch
        inputs_tensor = torch.Tensor(inputs)  # Sesuaikan dengan tokenisasi Anda
        labels_tensor = torch.Tensor(labels)  # Sesuaikan dengan encoding Anda
        
        optimizer.zero_grad()
        
        # Forward pass
        outputs = model(inputs_tensor)
        
        # Hitung loss
        loss = criterion(outputs, labels_tensor)
        
        # Backward pass dan optimasi
        loss.backward()
        optimizer.step()
        # Evaluasi model di setiap epoch
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in val_loader:
            inputs = batch['title']
            labels = batch['decision']
            
            # Lakukan tokenisasi dan encoding di sini jika diperlukan
            
            inputs_tensor = torch.Tensor(inputs)  # Sesuaikan dengan tokenisasi Anda
            labels_tensor = torch.Tensor(labels)  # Sesuaikan dengan encoding Anda
            
            # Forward pass
            outputs = model(inputs_tensor)
            
            # Hitung loss
            loss = criterion(outputs, labels_tensor)
            val_loss += loss.item()
            
            # Hitung akurasi
            _, predicted = torch.max(outputs, 1)
            total += labels_tensor.size(0)
            correct += (predicted == labels_tensor).sum().item()
    
    epoch_val_loss = val_loss / len(val_loader)
    epoch_accuracy = correct / total
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {loss.item():.4f}, Val Loss: {epoch_val_loss:.4f}, Val Accuracy: {epoch_accuracy:.2f}')


TypeError: list indices must be integers or slices, not str