# EDA (Exploratory Data Analysis)

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install tabulate



In [None]:
!pip install emoji
!pip install pandas
!pip install nltk
!pip install sastrawi

Collecting emoji
  Downloading emoji-2.14.0-py3-none-any.whl.metadata (5.7 kB)
Downloading emoji-2.14.0-py3-none-any.whl (586 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/586.9 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━[0m [32m368.6/586.9 kB[0m [31m11.2 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m586.9/586.9 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.14.0
Collecting sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl.metadata (909 bytes)
Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sastrawi
Successfully installed sastrawi-1.0.1


1. Identifikasi Data

In [None]:
# Import libraries
import pandas as pd
from tabulate import tabulate

# Load dataset
file_path = "/content/drive/MyDrive/DST/Copy of Salinan cnn_dataset_with_label.xlsx"
df = pd.read_excel(file_path)

# Basic information
def basic_info(df):
    return {
        "Total Rows": len(df),
        "Total Columns": len(df.columns),
        "Null Values": df.isnull().sum().sum(),
        "Duplicate Rows": df.duplicated().sum()
    }

# Data types and missing values
def data_summary(df):
    summary = pd.DataFrame({
        "Column": df.columns,
        "Data Type": df.dtypes,
        "Non-Null Count": df.count(),
        "Missing Count": df.isnull().sum(),
        "Unique Count": df.nunique(),
    }).reset_index(drop=True)
    return summary

# Label distribution
def label_distribution(df, label):
    if label in df.columns:
        distribution = df[label].value_counts().reset_index()
        distribution.columns = ["Label", "Count"]
        return distribution
    else:
        return pd.DataFrame({"Error": ["Label column not found"]})

# Generate summary tables
basic_stats = basic_info(df)
data_overview = data_summary(df)
label_dist = label_distribution(df, "label")

# Display results as tables
print("Basic Dataset Information:\n")
print(tabulate([basic_stats.items()], headers=["Metric", "Value"], tablefmt="pretty"))

print("\nColumn-wise Summary:\n")
print(tabulate(data_overview, headers="keys", tablefmt="pretty"))

print("\nLabel Distribution:\n")
print(tabulate(label_dist, headers="keys", tablefmt="pretty"))

Basic Dataset Information:

+-----------------------+----------------------+--------------------+-----------------------+
|                       |                      |       Metric       |         Value         |
+-----------------------+----------------------+--------------------+-----------------------+
| ('Total Rows', 12227) | ('Total Columns', 4) | ('Null Values', 0) | ('Duplicate Rows', 2) |
+-----------------------+----------------------+--------------------+-----------------------+

Column-wise Summary:

+---+---------+-----------+----------------+---------------+--------------+
|   | Column  | Data Type | Non-Null Count | Missing Count | Unique Count |
+---+---------+-----------+----------------+---------------+--------------+
| 0 |  title  |  object   |     12227      |       0       |    11972     |
| 1 | content |  object   |     12227      |       0       |    11869     |
| 2 |   url   |  object   |     12227      |       0       |    12225     |
| 3 |  label  |  object

# Preprocesing

1. Menghapus Atribut yang Tidak Berguna


In [None]:
import pandas as pd
from tabulate import tabulate

# Penghapusan Atribut yang Tidak Berguna
# Drop 'url' column
df = df.drop(columns=['url'], errors='ignore')

# Function to truncate text for better readability
def truncate_text(text, max_length=50):
    return text if isinstance(text, str) and len(text) <= max_length else str(text)[:max_length] + "..."

# Apply truncation to 'content' column for display
df_display = df.head(5).copy()
df_display['title'] = df_display['title'].apply(lambda x: truncate_text(x, max_length=50))
df_display['content'] = df_display['content'].apply(lambda x: truncate_text(x, max_length=100))

print("\nUpdated Dataset after Dropping 'url' :\n")
print(tabulate(df_display[['title', 'content', 'label']].head(5), headers='keys', tablefmt='pretty'))

# Simpan hasil untuk langkah selanjutnya
df_step_1 = df.copy()


Updated Dataset after Dropping 'url' :

+---+-------------------------------------------------------+---------------------------------------------------------------------------------------------------------+-----------+
|   |                         title                         |                                                 content                                                 |   label   |
+---+-------------------------------------------------------+---------------------------------------------------------------------------------------------------------+-----------+
| 0 | AS Paksa Google Jual Chrome hingga Lepas Android, ... | Jakarta, CNN Indonesia Departemen Kehakiman AS (DOJ) dilaporkan tengah menyiapkan langkah hukum untu... | teknologi |
| 1 | Apple Mau Tambah Investasi Rp1,5 Triliun Demi Jual... | Jakarta, CNN Indonesia Appledikabarkan meningkatkan penawarannya untuk berinvestasi di Indonesia hin... | teknologi |
| 2 | Aturan Pajak Karbon, RI Berpotensi Kehilangan Pasa...

2. Menghapus Judul dengn Pola yang Tidak di Inginkan

In [None]:
# Define unwanted title patterns
unwanted_patterns = ['VIDEO:', 'FOTO:', 'No Title', 'video:', 'foto:', 'no title']

# Filter rows with unwanted patterns
rows_with_unwanted_patterns = df_step_1[df_step_1['title'].str.startswith(tuple(unwanted_patterns), na=False)]

# Display rows with unwanted patterns (sebelum penghapusan)
print("\nRows with Unwanted Patterns (Before Removal):\n")
rows_with_unwanted_patterns_display = rows_with_unwanted_patterns.copy()
rows_with_unwanted_patterns_display['title'] = rows_with_unwanted_patterns_display['title'].apply(lambda x: truncate_text(x, max_length=50))
rows_with_unwanted_patterns_display['content'] = rows_with_unwanted_patterns_display['content'].apply(lambda x: truncate_text(x, max_length=100))
print(tabulate(rows_with_unwanted_patterns_display[['title', 'content', 'label']].head(5), headers="keys", tablefmt="pretty"))

# Drop rows where 'title' starts with unwanted patterns
df_step_2 = df_step_1[~df_step_1['title'].str.startswith(tuple(unwanted_patterns), na=False)]

# Display the cleaned dataset (setelah penghapusan)
print("\nDataset After Removing Unwanted Rows:\n")
df_cleaned_display = df_step_2.copy()
df_cleaned_display['title'] = df_cleaned_display['title'].apply(lambda x: truncate_text(x, max_length=50))
df_cleaned_display['content'] = df_cleaned_display['content'].apply(lambda x: truncate_text(x, max_length=100))
print(tabulate(df_cleaned_display[['title', 'content', 'label']].head(5), headers="keys", tablefmt="pretty"))



Rows with Unwanted Patterns (Before Removal):

+----+-------------------------------------------------------+---------------------------------------------------------------------------------------------------------+-----------+
|    |                         title                         |                                                 content                                                 |   label   |
+----+-------------------------------------------------------+---------------------------------------------------------------------------------------------------------+-----------+
| 5  | FOTO: Rudal-Drone Iran Mejeng Saat Konflik Timteng... |                                         Jakarta, CNN Indonesia                                          | teknologi |
| 8  | FOTO: Mengintip Pabrik Pembuat Teknologi Anti-Dron... |                                         Jakarta, CNN Indonesia                                          | teknologi |
| 9  | VIDEO: SpaceX Uji Terbang Starship Disak

3. Menghapus Baris Pertama Berita

In [None]:
# Fungsi untuk menghapus kalimat pertama dalam paragraf
def remove_first_sentence(paragraph):
    if isinstance(paragraph, str):
        sentences = paragraph.split('. ')
        if len(sentences) > 1:
            return '. '.join(sentences[1:])  # Menggabungkan kalimat setelah kalimat pertama
        else:
            return ""  # Jika hanya ada satu kalimat, return string kosong
    return paragraph

# Menampilkan 5 data sebelum perubahan
df_display = df_step_2.head(5).copy()
df_display['title'] = df_display['title'].apply(lambda x: truncate_text(x, max_length=50))
df_display['content'] = df_display['content'].apply(lambda x: truncate_text(x, max_length=100))

print("Before Removing First Sentence:")
print(tabulate(df_display, headers="keys", tablefmt="pretty"))

# Hapus kalimat pertama di kolom 'content'
df_step_3 = df_step_2.copy()
df_step_3['content'] = df_step_3['content'].apply(remove_first_sentence)

# Tampilkan 5 data setelah penghapusan kalimat pertama
df_display = df_step_3.head(5).copy()
df_display['title'] = df_display['title'].apply(lambda x: truncate_text(x, max_length=50))
df_display['content'] = df_display['content'].apply(lambda x: truncate_text(x, max_length=100))

print("\nAfter Removing First Sentence:")
print(tabulate(df_display, headers="keys", tablefmt="pretty"))



Before Removing First Sentence:
+---+-------------------------------------------------------+---------------------------------------------------------------------------------------------------------+-----------+
|   |                         title                         |                                                 content                                                 |   label   |
+---+-------------------------------------------------------+---------------------------------------------------------------------------------------------------------+-----------+
| 0 | AS Paksa Google Jual Chrome hingga Lepas Android, ... | Jakarta, CNN Indonesia Departemen Kehakiman AS (DOJ) dilaporkan tengah menyiapkan langkah hukum untu... | teknologi |
| 1 | Apple Mau Tambah Investasi Rp1,5 Triliun Demi Jual... | Jakarta, CNN Indonesia Appledikabarkan meningkatkan penawarannya untuk berinvestasi di Indonesia hin... | teknologi |
| 2 | Aturan Pajak Karbon, RI Berpotensi Kehilangan Pasa... | Baku, 

4. Menghapus Iklan dalam Content

In [None]:
# Menampilkan 5 data yang mengandung "ADVERTISEMENT SCROLL TO CONTINUE WITH CONTENT" sebelum penghapusan
df_with_ads = df_step_3[df_step_3['content'].str.contains("ADVERTISEMENT SCROLL TO CONTINUE WITH CONTENT", na=False)]
df_display_before = df_with_ads.head(5).copy()  # Membatasi hanya 5 baris pertama
df_display_before['content'] = df_display_before['content'].apply(lambda x: x[:150] + "..." if len(x) > 150 else x)

print("\nContent Before Removing Advertisement:")
print(tabulate(df_display_before, headers="keys", tablefmt="pretty"))

# Fungsi untuk menghapus teks iklan
def remove_advertisement(text):
    if isinstance(text, str):
        return text.replace("ADVERTISEMENT SCROLL TO CONTINUE WITH CONTENT", "").strip()
    return text

# Menghapus teks iklan di kolom 'content'
df_step_4 = df_step_3.copy()
df_step_4['content'] = df_step_4['content'].apply(remove_advertisement)

# Menampilkan 5 data yang sama setelah penghapusan iklan
df_display_after = df_step_4.loc[df_with_ads.head(5).index].copy()  # Membatasi hanya 5 baris pertama
df_display_after['content'] = df_display_after['content'].apply(lambda x: x[:150] + "..." if len(x) > 150 else x)

print("\nContent After Removing Advertisement:")
print(tabulate(df_display_after, headers="keys", tablefmt="pretty"))



Content Before Removing Advertisement:
+---+----------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+
|   |                                title                                 |                                                                          content                                                                          |   label   |
+---+----------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------+-----------+
| 0 |   AS Paksa Google Jual Chrome hingga Lepas Android, Apa Sebabnya?    | Google Chrome Hal ini menyusul keputusan hakim yang menyatakan bahwa Google telah melakukan monopoli bisnis secara ilegal. Chrome, yang saat ini men

5. Menghilangkan Noise di Akhir Content

In [None]:
import re
from tabulate import tabulate

# Fungsi untuk mengekstrak kalimat terakhir
def extract_last_sentence(text):
    if isinstance(text, str):
        sentences = re.split(r'(?<=\.)\s', text.strip())  # Memisahkan kalimat berdasarkan titik
        if sentences:
            return sentences[-1]  # Mengembalikan kalimat terakhir
    return text  # Jika bukan string, kembalikan teks asli

# Fungsi untuk menghapus noise setelah kalimat terakhir
def remove_noise_after_last_sentence(paragraph):
    if isinstance(paragraph, str):
        # Mencari posisi titik terakhir
        last_period_index = paragraph.rfind('.')

        if last_period_index != -1:
            # Mengambil hanya kalimat sebelum titik terakhir
            cleaned_paragraph = paragraph[:last_period_index + 1]

            # Menghapus noise dalam tanda kurung atau simbol lainnya setelah titik terakhir
            cleaned_paragraph = re.sub(r'\[.*?\]|\(.*?\)', '', cleaned_paragraph)

            return cleaned_paragraph.strip()  # Menghapus spasi tambahan
        else:
            return paragraph.strip()  # Jika tidak ada titik, return teks asli
    return paragraph

# Menampilkan 5 data yang mengandung noise di akhir
df_with_noise = df_step_4[df_step_4['content'].str.contains(r'\[.*?\]|\(.*?\)', na=False)].head(5)
df_display_before_noise = df_with_noise.copy()

# Menampilkan kalimat terakhir sebelum menghapus noise
df_display_before_noise['last_sentence_before'] = df_display_before_noise['content'].apply(extract_last_sentence)

# Menampilkan data sebelum penghapusan noise
print("\nContent Before Removing Noise:")
print(tabulate(df_display_before_noise[['last_sentence_before']], headers="keys", tablefmt="pretty"))

# Menghapus noise setelah kalimat terakhir di kolom 'content'
df_step_5 = df_step_4.copy()
df_step_5['content'] = df_step_5['content'].apply(remove_noise_after_last_sentence)

# Menampilkan 5 data yang sama setelah penghapusan noise
df_display_after_noise = df_step_5.loc[df_with_noise.index].copy()

# Menampilkan kalimat terakhir setelah menghapus noise
df_display_after_noise['last_sentence_after'] = df_display_after_noise['content'].apply(extract_last_sentence)

# Menampilkan data setelah penghapusan noise
print("\nContent After Removing Noise:")
print(tabulate(df_display_after_noise[['last_sentence_after']], headers="keys", tablefmt="pretty"))



Content Before Removing Noise:
+---+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|   |                                                                                                                               last_sentence_before                                                                                                                                |
+---+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| 0 | DOJ mengatakan mereka "mempertimbangkan solusi perilaku dan struktural yang akan mencegah Google menggunakan produk 

6. Remove Punctuation

In [None]:
# Fungsi untuk menghapus tanda baca
def remove_punctuation(text):
    if isinstance(text, str):
        return re.sub(r'[^\w\s]', '', text)  # Menghapus semua tanda baca (selain huruf dan angka)
    return text

# Menampilkan 5 data sebelum penghapusan tanda baca
df_display_before = df_step_5.head(5).copy()  # Menggunakan df_step_5 untuk melanjutkan dari data yang sudah dibersihkan
df_display_before['title'] = df_display_before['title'].apply(lambda x: x[:50] + "..." if len(x) > 50 else x)
df_display_before['content'] = df_display_before['content'].apply(lambda x: x[:100] + "..." if len(x) > 100 else x)

print("\nContent Before Removing Punctuation:")
print(tabulate(df_display_before[['title', 'content', 'label']], headers="keys", tablefmt="pretty"))

# Menghapus tanda baca di kolom 'title' dan 'content'
df_step_6 = df_step_5.copy()
df_step_6['title'] = df_step_6['title'].apply(remove_punctuation)
df_step_6['content'] = df_step_6['content'].apply(remove_punctuation)

# Menampilkan 5 data setelah penghapusan tanda baca
df_display_after = df_step_6.head(5).copy()  # Membatasi hanya 5 data pertama setelah perubahan
df_display_after['title'] = df_display_after['title'].apply(lambda x: x[:50] + "..." if len(x) > 50 else x)
df_display_after['content'] = df_display_after['content'].apply(lambda x: x[:100] + "..." if len(x) > 100 else x)

print("\nContent After Removing Punctuation:")
print(tabulate(df_display_after[['title', 'content', 'label']], headers="keys", tablefmt="pretty"))


Content Before Removing Punctuation:
+---+-------------------------------------------------------+---------------------------------------------------------------------------------------------------------+-----------+
|   |                         title                         |                                                 content                                                 |   label   |
+---+-------------------------------------------------------+---------------------------------------------------------------------------------------------------------+-----------+
| 0 | AS Paksa Google Jual Chrome hingga Lepas Android, ... | Google Chrome Hal ini menyusul keputusan hakim yang menyatakan bahwa Google telah melakukan monopoli... | teknologi |
| 1 | Apple Mau Tambah Investasi Rp1,5 Triliun Demi Jual... | Apple Investasi ini disebut sebagai upaya terbaru raksasa teknologi AS ini untuk membujuk pemerintah... | teknologi |
| 2 | Aturan Pajak Karbon, RI Berpotensi Kehilangan Pasa... | 

7. Cleaning Text

In [None]:
import re

# Fungsi untuk membersihkan teks
def clean_text(text):
    if isinstance(text, str):
        # Hapus mention (@username)
        text = re.sub(r'@[A-Za-z0-9_]+', '', text)

        # Hapus URL (http, https, www)
        text = re.sub(r'http\S+|www\S+|https\S+', '', text)

        # Hapus hashtag (#hashtag)
        text = re.sub(r'#\S+', '', text)

        # Hapus newline (\n) dan menggantinya dengan spasi
        text = re.sub(r'\n', ' ', text)

        # Hapus angka
        text = re.sub(r'\d+', '', text)

        # Hapus emoji dan karakter non-ASCII
        text = text.encode('ascii', 'ignore').decode('ascii')

    return text

# Menampilkan data sebelum cleaning
df_display_before = df_step_6.head(5).copy()  # Membatasi hanya 5 data pertama
df_display_before['title'] = df_display_before['title'].apply(lambda x: x[:50] + "..." if len(x) > 50 else x)
df_display_before['content'] = df_display_before['content'].apply(lambda x: x[:100] + "..." if len(x) > 100 else x)

print("\nContent Before Cleaning:")
print(tabulate(df_display_before[['title', 'content', 'label']], headers="keys", tablefmt="pretty"))

# Mengaplikasikan fungsi clean_text ke kolom 'title' dan 'content'
df_step_7 = df_step_6.copy()
df_step_7['title'] = df_step_7['title'].apply(clean_text)
df_step_7['content'] = df_step_7['content'].apply(clean_text)

# Menampilkan data setelah cleaning
df_display_after = df_step_7.head(5).copy()  # Membatasi hanya 5 data pertama
df_display_after['title'] = df_display_after['title'].apply(lambda x: x[:50] + "..." if len(x) > 50 else x)
df_display_after['content'] = df_display_after['content'].apply(lambda x: x[:100] + "..." if len(x) > 100 else x)

print("\nContent After Cleaning:")
print(tabulate(df_display_after[['title', 'content', 'label']], headers="keys", tablefmt="pretty"))


Content Before Cleaning:
+---+-------------------------------------------------------+---------------------------------------------------------------------------------------------------------+-----------+
|   |                         title                         |                                                 content                                                 |   label   |
+---+-------------------------------------------------------+---------------------------------------------------------------------------------------------------------+-----------+
| 0 | AS Paksa Google Jual Chrome hingga Lepas Android A... | Google Chrome Hal ini menyusul keputusan hakim yang menyatakan bahwa Google telah melakukan monopoli... | teknologi |
| 1 | Apple Mau Tambah Investasi Rp15 Triliun Demi Jual ... | Apple Investasi ini disebut sebagai upaya terbaru raksasa teknologi AS ini untuk membujuk pemerintah... | teknologi |
| 2 | Aturan Pajak Karbon RI Berpotensi Kehilangan Pasar... | pajak karbon

8. Case Folding

In [None]:
import pandas as pd
from tabulate import tabulate

# Fungsi untuk melakukan case folding (mengubah teks menjadi huruf kecil)
def case_folding(text):
    if isinstance(text, str):
        return text.lower()  # Mengubah teks menjadi huruf kecil
    return text

# Menampilkan data sebelum case folding
df_display_before = df_step_7.head(5).copy()
df_display_before['title'] = df_display_before['title'].apply(lambda x: x[:50] + "..." if len(x) > 50 else x)
df_display_before['content'] = df_display_before['content'].apply(lambda x: x[:100] + "..." if len(x) > 100 else x)

print("\nContent Before Case Folding:")
print(tabulate(df_display_before[['title', 'content', 'label']], headers="keys", tablefmt="pretty"))

# Mengaplikasikan case folding ke kolom 'title' dan 'content'
df_step_8 = df_step_7.copy()
df_step_8['title'] = df_step_8['title'].apply(case_folding)
df_step_8['content'] = df_step_8['content'].apply(case_folding)

# Menampilkan data setelah case folding
df_display_after_case_folding = df_step_8.head(5).copy()
df_display_after_case_folding['title'] = df_display_after_case_folding['title'].apply(lambda x: x[:50] + "..." if len(x) > 50 else x)
df_display_after_case_folding['content'] = df_display_after_case_folding['content'].apply(lambda x: x[:100] + "..." if len(x) > 100 else x)

print("\nContent After Case Folding:")
print(tabulate(df_display_after_case_folding[['title', 'content', 'label']], headers="keys", tablefmt="pretty"))


Content Before Case Folding:
+---+-------------------------------------------------------+---------------------------------------------------------------------------------------------------------+-----------+
|   |                         title                         |                                                 content                                                 |   label   |
+---+-------------------------------------------------------+---------------------------------------------------------------------------------------------------------+-----------+
| 0 | AS Paksa Google Jual Chrome hingga Lepas Android A... | Google Chrome Hal ini menyusul keputusan hakim yang menyatakan bahwa Google telah melakukan monopoli... | teknologi |
| 1 | Apple Mau Tambah Investasi Rp Triliun Demi Jual iP... | Apple Investasi ini disebut sebagai upaya terbaru raksasa teknologi AS ini untuk membujuk pemerintah... | teknologi |
| 2 | Aturan Pajak Karbon RI Berpotensi Kehilangan Pasar... | pajak ka

9. Remove Duplicate

In [None]:
import pandas as pd
from tabulate import tabulate

# Menampilkan jumlah data sebelum menghapus duplikat
total_data_before = len(df_step_8)

# Menampilkan data yang duplikat berdasarkan 'title' atau 'content'
duplicates = df_step_8[df_step_8.duplicated(subset=['title'], keep=False) | df_step_8.duplicated(subset=['content'], keep=False)]

# Menampilkan hanya 5 data duplikat
print(f"\nData Duplicates Before Removing:")
print(tabulate(duplicates[['title', 'content']].head(5), headers="keys", tablefmt="pretty"))

# Menghapus duplikat berdasarkan 'title' atau 'content'
df_step_9 = df_step_8.drop_duplicates(subset=['title', 'content'], keep='first')  # Keep first to keep the first occurrence

# Menampilkan jumlah duplikat yang dihapus dan jumlah data yang tersisa
duplicates_removed = len(df_step_8) - len(df_step_9)
remaining_data = len(df_step_9)

print(f"\nNumber of Duplicates Removed: {duplicates_removed}")
print(f"Remaining Data after Removing Duplicates: {remaining_data}")

# Menampilkan data setelah penghapusan duplikat
df_display_after_duplicates = df_step_9.head(5).copy()  # Membatasi hanya 5 data pertama
df_display_after_duplicates['title'] = df_display_after_duplicates['title'].apply(lambda x: x[:50] + "..." if len(x) > 50 else x)
df_display_after_duplicates['content'] = df_display_after_duplicates['content'].apply(lambda x: x[:100] + "..." if len(x) > 100 else x)

print("\nContent After Removing Duplicates:")
print(tabulate(df_display_after_duplicates[['title', 'content', 'label']], headers="keys", tablefmt="pretty"))


Data Duplicates Before Removing:
+------+--------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

10. Stopword Removal

In [None]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
from tabulate import tabulate

# Fungsi untuk menghapus stopword
def stopword_removal(text, stop_words_remover):
    text = stop_words_remover.remove(text)
    return text

# Menampilkan 5 data pertama sebelum stopword untuk 'title' dan 'content'
df_display_before = df_step_9.head(5).copy()
df_display_before['title'] = df_display_before['title'].apply(lambda x: x[:50] + "..." if len(x) > 50 else x)
df_display_before['content'] = df_display_before['content'].apply(lambda x: x[:100] + "..." if len(x) > 100 else x)

print("Before Stopword Removal:")
print(tabulate(df_display_before[['title', 'content', 'label']], headers="keys", tablefmt="pretty"))

# Mendapatkan daftar stopword dari Sastrawi dan menambah daftar stopword custom jika ada
stopword_csv = pd.read_csv('/content/drive/MyDrive/DST/Copy of Salinan stopwordbahasa.csv', header=None)  # CSV stopword tambahan
additional_stopwords = stopword_csv[0].tolist()  # Mengonversi kolom CSV ke list
stop_words = StopWordRemoverFactory().get_stop_words()
new_stop_words = stop_words + additional_stopwords  # Menggabungkan stopword Sastrawi dan stopword tambahan

# Membuat ArrayDictionary untuk stopword
stop_words_dictionary = ArrayDictionary(new_stop_words)
stop_words_remover = StopWordRemover(stop_words_dictionary)

# Menerapkan stopword removal ke kolom 'title' dan 'content' langsung
df_step_9['title'] = df_step_9['title'].apply(lambda x: stopword_removal(str(x), stop_words_remover))
df_step_9['content'] = df_step_9['content'].apply(lambda x: stopword_removal(str(x), stop_words_remover))

# Menampilkan 5 data pertama setelah stopword untuk 'title' dan 'content'
df_display_after_stopword = df_step_9.head(5).copy()
df_display_after_stopword['title'] = df_display_after_stopword['title'].apply(lambda x: x[:50] + "..." if len(x) > 50 else x)
df_display_after_stopword['content'] = df_display_after_stopword['content'].apply(lambda x: x[:100] + "..." if len(x) > 100 else x)

print("\nAfter Stopword Removal:")
print(tabulate(df_display_after_stopword[['title', 'content', 'label']], headers="keys", tablefmt="pretty"))


Before Stopword Removal:
+---+-------------------------------------------------------+---------------------------------------------------------------------------------------------------------+-----------+
|   |                         title                         |                                                 content                                                 |   label   |
+---+-------------------------------------------------------+---------------------------------------------------------------------------------------------------------+-----------+
| 0 | as paksa google jual chrome hingga lepas android a... | google chrome hal ini menyusul keputusan hakim yang menyatakan bahwa google telah melakukan monopoli... | teknologi |
| 1 | apple mau tambah investasi rp triliun demi jual ip... | apple investasi ini disebut sebagai upaya terbaru raksasa teknologi as ini untuk membujuk pemerintah... | teknologi |
| 2 | aturan pajak karbon ri berpotensi kehilangan pasar... | pajak karbon 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_step_9['title'] = df_step_9['title'].apply(lambda x: stopword_removal(str(x), stop_words_remover))



After Stopword Removal:
+---+-------------------------------------------------------+---------------------------------------------------------------------------------------------------------+-----------+
|   |                         title                         |                                                 content                                                 |   label   |
+---+-------------------------------------------------------+---------------------------------------------------------------------------------------------------------+-----------+
| 0 |  as paksa google jual chrome lepas android sebabnya   | google chrome menyusul keputusan hakim menyatakan google melakukan monopoli bisnis ilegal chrome saa... | teknologi |
| 1 |   apple tambah investasi rp triliun jual iphone  ri   | apple investasi upaya terbaru raksasa teknologi as membujuk pemerintah mencabut larangan penjualanip... | teknologi |
| 2 | aturan pajak karbon ri berpotensi kehilangan pasar... | pajak karbon 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_step_9['content'] = df_step_9['content'].apply(lambda x: stopword_removal(str(x), stop_words_remover))


In [None]:
# Menyimpan data setelah stopword removal ke file CSV
output_file = '/content/drive/MyDrive/DST/Hasil_Preprocesing_CNN.csv'
df_step_9.to_csv(output_file, index=False)

print(f"\nProcessed data saved to: {output_file}")


Processed data saved to: /content/drive/MyDrive/DST/Hasil_Preprocesing_CNN.csv


11. Balanced Data

In [None]:
import pandas as pd

# Membaca data dari Google Drive
file_path = '/content/drive/MyDrive/DST/Hasil_Preprocesing_CNN.csv'  # Ganti path dengan lokasi file Anda
df = pd.read_csv(file_path)

# Menampilkan jumlah data sebelum penghapusan
print("Jumlah data sebelum menghapus baris dengan content null:")
print(df.info())

Jumlah data sebelum menghapus baris dengan content null:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11474 entries, 0 to 11473
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    11474 non-null  object
 1   content  11471 non-null  object
 2   label    11474 non-null  object
dtypes: object(3)
memory usage: 269.0+ KB
None


In [None]:
# Menghapus baris yang memiliki nilai null pada kolom 'content'
df_cleaned = df.dropna(subset=['content'])

# Menampilkan jumlah data setelah penghapusan
print("\nJumlah data setelah menghapus baris dengan content null:")
print(df_cleaned.info())


Jumlah data setelah menghapus baris dengan content null:
<class 'pandas.core.frame.DataFrame'>
Index: 11471 entries, 0 to 11473
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    11471 non-null  object
 1   content  11471 non-null  object
 2   label    11471 non-null  object
dtypes: object(3)
memory usage: 358.5+ KB
None


In [None]:
# Fungsi untuk mengambil maksimal 1600 data per label
def sample_per_label(df, label_column, sample_size=1600):
    sampled_df = (
        df.groupby(label_column)
        .apply(lambda x: x.sample(n=min(len(x), sample_size), random_state=42))
    )
    return sampled_df.reset_index(drop=True)

# Mengambil maksimal 1600 data per label
df_sampled = sample_per_label(df_cleaned, label_column='label', sample_size=1600)

# Menampilkan distribusi data per label
print("\nDistribusi data setelah sampling:")
print(df_sampled['label'].value_counts())

# Menyimpan hasil ke file CSV
output_file = '/content/drive/MyDrive/DST/Balanced_Hasil_Preprocesing_CNN.csv'
df_sampled.to_csv(output_file, index=False)

print(f"\nData yang telah di-sampling disimpan di: {output_file}")

  .apply(lambda x: x.sample(n=min(len(x), sample_size), random_state=42))



Distribusi data setelah sampling:
label
ekonomi             1600
hiburan             1600
hukumdankriminal    1600
kesehatan           1600
politik             1600
teknologi           1600
Name: count, dtype: int64

Data yang telah di-sampling disimpan di: /content/drive/MyDrive/DST/Balanced_Hasil_Preprocesing_CNN.csv


In [None]:
import pandas as pd

# Membaca data dari Google Drive
file_path = '/content/drive/MyDrive/DST/Balanced_Hasil_Preprocesing_CNN.csv'  # Ganti path dengan lokasi file Anda
df = pd.read_csv(file_path)

# Menampilkan jumlah data sebelum penghapusan
print("Jumlah data sebelum menghapus baris dengan content null:")
print(df.info())

Jumlah data sebelum menghapus baris dengan content null:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9600 entries, 0 to 9599
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    9600 non-null   object
 1   content  9600 non-null   object
 2   label    9600 non-null   object
dtypes: object(3)
memory usage: 225.1+ KB
None
