In [None]:
import pandas as pd
import numpy as np

# Read the dataset from the Excel workbook into a DataFrame
df = pd.read_excel('/content/datafix.xlsx')

# Show the loaded frame
print(df)

# Count how many rows carry each distinct value of the 'Label' column
label_distribution = df['Label'].value_counts()
print("\nDistribusi jumlah data pada kolom Label:", label_distribution, sep="\n")


In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources this notebook relies on.
# 'punkt' backs word_tokenize on older NLTK releases; newer releases look up
# 'punkt_tab' instead, so both are requested. With the default arguments
# nltk.download() reports an unknown resource and returns False rather than
# raising, so requesting both is safe on either kind of release.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# Indonesian stopwords from the NLTK corpus, held in a set for fast membership tests
stop_words = set(stopwords.words('indonesian'))

# WordNet-based lemmatizer applied to each remaining word in preprocess_text
lemmatizer = WordNetLemmatizer()

# Function to preprocess text with lemmatization and without tokenization
def preprocess_text(text):
    """Clean one cell of raw text and return it as a single normalised string.

    Steps: replace non-word characters with spaces, collapse whitespace,
    lowercase, drop words found in ``stop_words``, then pass every remaining
    word through ``lemmatizer.lemmatize``.

    Args:
        text: The raw cell value. Non-string values are converted with
            ``str()``; missing values (``None``, ``NaN``, ``pd.NA``) are
            treated as empty text.

    Returns:
        The cleaned words joined by single spaces; ``''`` when nothing is left.
    """
    if not isinstance(text, str):
        # A missing cell must not become the literal word "nan" / "none",
        # which would otherwise end up in the corpus as a real token.
        if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
            return ''
        text = str(text)

    # 1. Replace every non-word character with a space.
    #    Note: \W keeps letters, digits and underscores (not letters only).
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip()  # Collapse runs of whitespace

    # 2. Case folding
    text = text.lower()

    # 3. Stopword filtering on whitespace-separated words
    filtered_words = [word for word in text.split() if word not in stop_words]

    # 4. Lemmatization.
    #    NOTE(review): WordNetLemmatizer is backed by the English WordNet, so
    #    Indonesian words are presumably returned unchanged - confirm whether
    #    an Indonesian stemmer/lemmatizer was intended here.
    return ' '.join(lemmatizer.lemmatize(word) for word in filtered_words)

# Feature columns ('Judul', 'Konten') and target column ('Label')
X = df[['Judul', 'Konten']]
y = df['Label']

# Run preprocess_text over every cell of both feature columns
X_preprocessed = X.apply(lambda column: column.apply(preprocess_text))

# Keep the cleaned text in df next to the raw columns
df['Judul_preprocessed'] = X_preprocessed['Judul']
df['Konten_preprocessed'] = X_preprocessed['Konten']

# Preview raw and cleaned text side by side for the first rows
print("\nPreview of DataFrame with Preprocessed Columns:")
print(df.head()[['Judul', 'Judul_preprocessed', 'Konten', 'Konten_preprocessed']])


**Modelling**

**LSTM (TF-IDF features)**

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.tokenize import word_tokenize  # Tokenizer called by tokenize_text
from sklearn.feature_extraction.text import TfidfVectorizer  # Import TfidfVectorizer for feature extraction
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from stopwords import get_stopwords  # Library for stopwords
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Indonesian ('id') stopword collection supplied by the stopwords library
indonesian_stopwords = get_stopwords('id')

# One text field per row: 'Judul_preprocessed', a single space, then 'Konten_preprocessed'.
# Both columns are expected to already exist in df as plain strings.
df['text'] = df['Judul_preprocessed'].str.cat(df['Konten_preprocessed'], sep=" ")

# Tokenize and remove Indonesian stopwords using the stopwords list
def tokenize_text(text):
    """Split ``text`` into lowercase tokens, dropping stopwords and non-alphanumerics.

    Args:
        text: The string to tokenize with ``word_tokenize``.

    Returns:
        A list of lowercased tokens. A token is kept only if it is fully
        alphanumeric (``str.isalnum``) and its lowercase form is not in
        ``indonesian_stopwords``.
    """
    # Build the lookup set once per call so each membership test is a hash
    # lookup instead of a scan over the whole stopword collection.
    stopword_set = set(indonesian_stopwords)

    tokens = []
    for word in word_tokenize(text):
        lowered = word.lower()  # lowercase once, reuse for both the test and the result
        if word.isalnum() and lowered not in stopword_set:
            tokens.append(lowered)
    return tokens

# Tokenize every row of the 'text' column
df['tokens'] = df['text'].apply(tokenize_text)

# TF-IDF works on strings, so glue each token list back together with spaces
df['text_joined'] = df['tokens'].apply(' '.join)

# Two-stage split: hold out 40% first, then halve the hold-out,
# giving 60% train / 20% validation / 20% test
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df['text_joined'],
    df['Label'],
    test_size=0.4,
    random_state=42,
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    random_state=42,
)

# TF-IDF vectorizer capped at 5000 features, ignoring the Indonesian stopwords
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words=indonesian_stopwords)

# Fit on the training texts only, then convert the sparse result to a dense array
X_train = tfidf_vectorizer.fit_transform(train_texts).toarray()

# Reuse the fitted vectorizer for the validation and test texts
X_val = tfidf_vectorizer.transform(val_texts).toarray()
X_test = tfidf_vectorizer.transform(test_texts).toarray()

# The LSTM layer takes 3D input (samples, timesteps, features):
# insert a length-1 timestep axis in front of the feature axis
X_train = np.expand_dims(X_train, axis=1)
X_val = np.expand_dims(X_val, axis=1)
X_test = np.expand_dims(X_test, axis=1)

# Network: a 100-unit LSTM followed by a single sigmoid unit (binary classification)
model = Sequential([
    LSTM(
        100,
        input_shape=X_train.shape[1:],  # (timesteps, features) of one sample
        dropout=0.2,
        recurrent_dropout=0.2,
    ),
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy loss, Adam optimizer, accuracy reported during training
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fit for 5 epochs in batches of 64, scoring the validation split after each epoch
history = model.fit(
    X_train,
    train_labels,
    validation_data=(X_val, val_labels),
    epochs=5,
    batch_size=64,
    verbose=2,
)

def evaluate_split(model, features, labels, split_name, threshold=0.5):
    """Score ``model`` on one data split and report the results.

    Prints the accuracy, shows the confusion matrix as a heatmap and prints
    the classification report, each labelled with ``split_name``.

    Args:
        model: Fitted model whose ``predict`` returns one probability per sample.
        features: Model input for this split.
        labels: True labels for this split, in the same order as ``features``.
        split_name: Text used in the printed headings and the plot title
            (e.g. ``'Validation'``).
        threshold: Probabilities strictly above this value count as class 1.

    Returns:
        Tuple ``(predictions, accuracy, matrix)``: the boolean predictions,
        the fraction of correct predictions, and the confusion matrix.
    """
    # predict() yields shape (n_samples, 1); flatten to (n_samples,) for comparison
    probabilities = model.predict(features).flatten()

    # Boolean predictions: True where the probability exceeds the threshold
    predictions = probabilities > threshold

    # Compare position by position against the true labels
    accuracy = np.mean(predictions == np.asarray(labels))
    print(f"\n{split_name} Accuracy: {accuracy:.4f}")

    matrix = confusion_matrix(labels, predictions)

    # Confusion matrix heatmap
    plt.figure(figsize=(8,6))
    sns.heatmap(matrix, annot=True, fmt='g', cmap='Blues', xticklabels=['0', '1'], yticklabels=['0', '1'])
    plt.xlabel('Predicted Labels')
    plt.ylabel('True Labels')
    plt.title(f'{split_name} Confusion Matrix')
    plt.show()

    print(f"\n{split_name} Classification Report:")
    print(classification_report(labels, predictions, target_names=['0', '1']))

    return predictions, accuracy, matrix


# Evaluate the model on the validation dataset
val_predictions, val_acc, val_cm = evaluate_split(model, X_val, val_labels, 'Validation')

# Evaluate the model on the test dataset
test_predictions, test_acc, test_cm = evaluate_split(model, X_test, test_labels, 'Test')