Importing Libraries

In [5]:
import numpy as np
import pandas as pd
import re
import nltk
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.datasets import load_files
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from google.colab import drive
from sklearn.feature_extraction.text import TfidfVectorizer
from torch.utils.data import Dataset, DataLoader, random_split

drive.mount('/content/gdrive')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Mounted at /content/gdrive


Import data

In [None]:
file_path = r"/content/gdrive/MyDrive/Lomba Satria Data/dataset_penyisihan_bdc_2024.xlsx"

# Load the Excel file into a pandas DataFrame
data = pd.read_excel(file_path)

# Assuming the Excel file has columns named 'text' and 'label'
X = data['text']
y = data['label']
print(data.head())

                                                text             label
0  Kunjungan Prabowo ini untuk meresmikan dan men...  Sumber Daya Alam
1  RT Anies dapat tepuk tangan meriah saat jadi R...           Politik
2  @CIqXqwGAT04tMtx4OCATxjoVq7vv/Y8HeYaIOgMFg8Y= ...         Demografi
3  RT @L3R8XFBw3WGbxRPSj0/0hHZTbqVGX7qtfwRg9zmhK7...           Politik
4  Anies Baswedan Harap ASN termasuk TNI dan Polr...           Politik


# **Pembersihan data**

In [None]:
stemmer = WordNetLemmatizer()
lemmatizer = WordNetLemmatizer()
documents = []

for sen in range(0, len(X)):
    # Convert to string
    document = str(X[sen])

    # Remove URLs
    document = re.sub(r'http\S+|www\S+|https\S+', '', document, flags=re.MULTILINE)

    # Remove sequences of characters that don't form words (random strings)
    document = re.sub(r'\b\w{10,}\b', '', document)

    # Remove specific string "[RE coldthem]"
    document = re.sub(r'\[RE coldthem\]', '', document)

    # Remove all the special characters
    document = re.sub(r'\W', ' ', document)

    # Remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove single characters from the start
    document = re.sub(r'\^[a-zA-Z]\s+', ' ', document)

    # Substitute multiple spaces with a single space
    document = re.sub(r'\s+', ' ', document, flags=re.I)

    # Remove the word "RT"
    document = re.sub(r'\bRT\b', '', document)

    # Remove prefixed 'b'
    document = re.sub(r'^b\s+', '', document)

    document = re.sub(r'(\d+)([a-zA-Z]+)(\d+)', lambda m: m.group(2), document)

    document = re.sub(r'[^A-Za-z0-9\s]', '', document)

    document = re.sub(r'\[.*?\]', '', document)

    document = re.sub(r'02', 'prabowo', document)

    document = re.sub(r'01', 'anies', document)

    document = re.sub(r'02', 'ganjar', document)


    wordsToRemove = ['dan', 'serta', 'lagipula', 'setelah', 'sejak', 'selanjutnya', 'tetapi', 'melainkan',
                    'sedangkan', 'atau', 'ataupun', 'maupun', 'untuk', 'agar', 'supaya', 'sebab', 'karena',
                    'sehingga', 'sampai', 'akibatnya', 'lalu', 'kemudian', 'jika', 'kalau', 'jikalau', 'apabila',
                    'walaupun', 'maupun', 'meskipoun', 'biarpun', 'seperti', 'sebagai', 'bagaikan', 'biar',
                    'biarpun', 'bahkan', 'yaitu', 'yakni', 'kecuali', 'selain', 'goblok', 'tolol', 'jancuk',
                    'rt', 'tengil', 're', 'detikbali', 'fuck', 'lo', 'lu', 'letoy', 'cengeng', 'n', 'o', 'pantat',
                    'gue',]

    document = ' '.join(word for word in document.split() if word not in wordsToRemove)

    # Convert to lowercase
    document = document.lower()

    # Lemmatization
    document = document.split()
    document = [stemmer.lemmatize(word) for word in document]
    document = ' '.join(document)
    document = ' '.join(word for word in document.split() if word not in wordsToRemove)

    documents.append(document)

# Display the first few processed documents to verify
for i in range(10):  # Change the number as needed to view more or fewer documents
    print(f"Document {i+1}: {documents[i]}")

Document 1: kunjungan prabowo ini proyek bantuan air bersih di lima titik prabowo subianto
Document 2: anies dapat tepuk tangan meriah saat jadi rektor mata kuliah anti korupsi memutus mata rantai korupsi ekowboy2
Document 3: emng bener sih pendukung anies ada yg begitu jg dg pendukung prabowo hnya sj menurut pak ridwan kamil skemanya terbalik klo anies mayoritas menengah atas artinya ada jg rendah yg milih
Document 4: sewaktu anies bersikap kriti ke kinerja pak prabowo dianggap engga sopan dianggap kurang orang tua giliran skrg gibran yg sok kriti malah dianggap kriti kera apakah ini tidak standar ganda
Document 5: anies baswedan harap asn termasuk tni polri pegang sumpahnya dalam pemilu
Document 6: duh jangan pak lurah denger nih di acara hajatan rakyat puluhan ribu warga di kendal serukan ganjar kehadiran jdlc menjadi magnet bagi puluhan ribu warga datang hajatan rakyat ganjar mahfud besarnya warga menjadi bukti bahwa jawa tengah tetap menjadi kandang banteng dsyantie
Document 7: pr

Vectorization


In [6]:
# Convert text data to numerical data using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(documents).toarray()

# Convert labels to numerical values
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)

Pytorch data preparation

In [7]:
class TextDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = torch.tensor(texts, dtype=torch.float32)
        self.labels = torch.tensor(labels, dtype=torch.long)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

# Create dataset
dataset = TextDataset(X_tfidf, y_encoded)

# Split into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

Define the Neural Network Model

In [8]:
class TextClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(TextClassifier, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Set input dimension (number of features from TF-IDF), hidden dimension, and output dimension (number of classes)
input_dim = X_tfidf.shape[1]
hidden_dim = 100
output_dim = len(encoder.classes_)

# Initialize model, loss function, and optimizer
model = TextClassifier(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

Model Training

In [9]:
num_epochs = 10

for epoch in range(num_epochs):
    # Training loop
    model.train()
    for texts, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Validation loop
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for texts, labels in val_loader:
            outputs = model(texts)
            loss = criterion(outputs, labels)
            val_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    val_loss /= len(val_loader)
    accuracy = correct / total

    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {val_loss:.4f}, Accuracy: {accuracy:.4f}')

Epoch 1/10, Loss: 1.2104, Accuracy: 0.6080
Epoch 2/10, Loss: 0.9123, Accuracy: 0.7220
Epoch 3/10, Loss: 0.7940, Accuracy: 0.7550
Epoch 4/10, Loss: 0.7742, Accuracy: 0.7630
Epoch 5/10, Loss: 0.7806, Accuracy: 0.7620
Epoch 6/10, Loss: 0.7980, Accuracy: 0.7590
Epoch 7/10, Loss: 0.8360, Accuracy: 0.7600
Epoch 8/10, Loss: 0.8662, Accuracy: 0.7520
Epoch 9/10, Loss: 0.8981, Accuracy: 0.7540
Epoch 10/10, Loss: 0.9295, Accuracy: 0.7450


Model Eval

In [10]:
model.eval()
all_labels = []
all_predictions = []
with torch.no_grad():
    for texts, labels in val_loader:
        outputs = model(texts)
        _, predicted = torch.max(outputs, 1)
        all_labels.extend(labels.tolist())
        all_predictions.extend(predicted.tolist())

from sklearn.metrics import classification_report
print(classification_report(all_labels, all_predictions, target_names=encoder.classes_))

                         precision    recall  f1-score   support

              Demografi       0.67      0.17      0.27        12
                Ekonomi       0.72      0.82      0.77        60
               Geografi       0.00      0.00      0.00         5
               Ideologi       0.62      0.61      0.61        79
Pertahanan dan Keamanan       0.72      0.63      0.67        70
                Politik       0.79      0.87      0.83       608
          Sosial Budaya       0.59      0.45      0.51       122
       Sumber Daya Alam       0.59      0.43      0.50        44

               accuracy                           0.74      1000
              macro avg       0.59      0.50      0.52      1000
           weighted avg       0.73      0.74      0.73      1000



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
FInal

In [11]:
# Ensure the model is in evaluation mode
model.eval()

# Predict labels for the entire dataset
all_predictions = []
with torch.no_grad():
    for texts in torch.tensor(X_tfidf, dtype=torch.float32):
        outputs = model(texts.unsqueeze(0))
        _, predicted = torch.max(outputs, 1)
        all_predictions.append(predicted.item())

# Add predictions to the DataFrame
data['predicted_label'] = encoder.inverse_transform(all_predictions)


In [12]:
from sklearn.metrics import accuracy_score

# Calculate accuracy
accuracy = accuracy_score(y_encoded, all_predictions)
print(f'Overall Accuracy: {accuracy:.4f}')

Overall Accuracy: 0.9312


In [15]:
output_file_path = '/content/gdrive/MyDrive/outputtrainingfile.xlsx'
data.to_excel(output_file_path, index=False)
print(f"Updated dataset with predictions saved to {output_file_path}")

Updated dataset with predictions saved to /content/gdrive/MyDrive/outputtrainingfile.xlsx
