# **Tugas 8 - Klasifikasi Berita pada Data yang Sudah Direduksi**

Nama : Isnita Widyur Rahmah
NIM : 220411100048
Kelas : IF 7A

Link Project : https://github.com/nittyaa99/ppw

## Install Library

In [24]:
!pip install Sastrawi requests beautifulsoup4



## Import Library

In [25]:
import pandas as pd
import re
import requests

from tqdm import tqdm

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from urllib.request import urlopen
from bs4 import BeautifulSoup

from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report, confusion_matrix

import matplotlib.pyplot as plt
import seaborn as sns

import pickle

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Crawling Data Artikel Berita Menggunakan Python
Melakukan crawling untuk mengambil judul, isi, tanggal, dan kategori artikel dari URL yang diberikan, lalu menyajikan data tersebut dalam bentuk DataFrame untuk memudahkan analisis lebih lanjut.

In [26]:
def crawl_article(url, timeout=10):
    """Download one news article page and extract its main fields.

    Args:
        url: Address of the article page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely on a stalled
            connection.

    Returns:
        A dict with the keys 'Judul' (title), 'Isi' (body text), 'Tanggal'
        (date string) and 'Kategori' (category). When an element cannot be
        found on the page, its field holds a "... tidak ditemukan"
        placeholder string instead. Returns None when the HTTP request fails
        (connection error, timeout or an error status code); the error is
        printed.
    """
    # Only the network call can raise RequestException, so keep the try
    # block limited to it.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn 4xx/5xx responses into exceptions
    except requests.RequestException as e:
        print(f"Error fetching article: {e}")
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Title: the <h1> carrying the 'text-cnn_black' class.
    title_element = soup.find('h1', class_='text-cnn_black')
    title = title_element.get_text().strip() if title_element else 'Judul tidak ditemukan'

    # Body: every <p> inside the 'detail-text' container, one paragraph per line.
    content_div = soup.find('div', class_='detail-text')
    if content_div:
        content = "\n".join(p.get_text().strip() for p in content_div.find_all('p'))
    else:
        content = 'Isi artikel tidak ditemukan'

    # Publication date.
    date_div = soup.find('div', class_='text-cnn_grey text-sm mb-4')
    date_text = date_div.text.strip() if date_div else 'Tanggal tidak ditemukan'

    # Category, read from the 'dtk:namakanal' <meta> tag.
    category_meta = soup.find("meta", attrs={'name': 'dtk:namakanal'})
    if category_meta and 'content' in category_meta.attrs:
        category = category_meta['content'].strip()
    else:
        category = 'Kategori tidak ditemukan'

    return {'Judul': title, 'Isi': content, 'Tanggal': date_text, 'Kategori': category}

# Ask the user for the address of the article to classify.
article_url = input("Masukkan URL artikel: ")

# Crawl that single article.
article = crawl_article(article_url)

# Build a one-row DataFrame from the returned dictionary.
# NOTE(review): crawl_article returns None when the request fails; a frame
# built from [None] has no 'Isi' column, so the following cells would fail.
df = pd.DataFrame([article])
df

Masukkan URL artikel: https://www.cnnindonesia.com/ekonomi/20241107053635-92-1163815/ihsg-diprediksi-berbalik-menguat-hari-ini


Unnamed: 0,Judul,Isi,Tanggal,Kategori
0,IHSG Diprediksi Berbalik Menguat Hari Ini,Indeks Harga Saham Gabungan (IHSG) diperkiraka...,"Kamis, 07 Nov 2024 06:15 WIB",ekonomi


## Mengonversi Semua Huruf Besar Menjadi Huruf Kecil

In [27]:
def clean_lower(text):
    """Return ``text`` in lower case; non-string values pass through unchanged."""
    return text.lower() if isinstance(text, str) else text

# Case-fold the article body into a new 'lower case' column.
df['lower case'] = df['Isi'].apply(clean_lower)
# One-column DataFrame built from the case-folded text; it is not referenced
# again in the cells visible here.
casefolding = pd.DataFrame(df['lower case'])

# Display the case-folded column.
df['lower case']

Unnamed: 0,lower case
0,indeks harga saham gabungan (ihsg) diperkiraka...


## Menghapus Simbol dan Angka dari Teks

In [28]:
# Any character that is not a lowercase ASCII letter or a space. This single
# class already covers digits, punctuation, newlines and uppercase letters,
# so no separate alternatives are needed for them.
_NON_LETTER = re.compile(r'[^a-z ]')
_WHITESPACE_RUN = re.compile(r'\s+')


def clean_punct(text):
    """Strip everything except lowercase ASCII letters from ``text``.

    Every character outside ``a-z`` and the space is replaced by a space,
    then runs of whitespace are collapsed to a single space and the result is
    trimmed. The input is expected to be lower-cased already, because
    uppercase letters are removed as well.

    Args:
        text: The value to clean.

    Returns:
        The cleaned string, or ``text`` unchanged when it is not a string.
    """
    if not isinstance(text, str):
        return text
    text = _NON_LETTER.sub(' ', text)
    return _WHITESPACE_RUN.sub(' ', text).strip()

# Remove digits and symbols from the case-folded text and store the result
# in the 'tanda baca' column.
df['tanda baca'] = df['lower case'].apply(clean_punct)

# Display the cleaned column.
df['tanda baca']

Unnamed: 0,tanda baca
0,indeks harga saham gabungan ihsg diperkirakan ...


## Menghapus Spasi Berlebih serta Spasi Awal dan Akhir dari Sebuah String

In [29]:
def _normalize_whitespace(text):
    """Collapse each whitespace run to one space and trim both ends.

    Values that are not strings are handed back untouched.
    """
    if not isinstance(text, str):
        return text
    return re.sub(r'\s+', ' ', text).strip()

# Normalise whitespace in the symbol-free text and store it in the 'spasi' column.
df['spasi'] = df['tanda baca'].apply(_normalize_whitespace)

## Mengurangi Jumlah Kata dalam Sebuah Dokumen

In [30]:
def clean_stopwords(text):
    """Remove Indonesian stopwords from a whitespace-separated string.

    The stopword list is NLTK's 'indonesian' corpus. Values that are not
    strings are returned as they are.
    """
    if not isinstance(text, str):
        return text
    indonesian_stopwords = set(stopwords.words('indonesian'))
    kept_tokens = [token for token in text.split() if token not in indonesian_stopwords]
    return ' '.join(kept_tokens).strip()

# Drop stopwords from the whitespace-normalised text and store the result in 'stopwords'.
df['stopwords'] = df['spasi'].apply(clean_stopwords)

## Mereduksi Kata Menjadi Bentuk Dasar

In [31]:
def sastrawistemmer(text):
    """Reduce every word of ``text`` to its base form with the Sastrawi stemmer.

    Args:
        text: Whitespace-separated string to stem.

    Returns:
        The stemmed words joined by single spaces, or ``text`` unchanged when
        it is not a string. This matches the other cleaning helpers, which
        also pass non-string cells through instead of raising.
    """
    if not isinstance(text, str):
        return text
    stemmer = StemmerFactory().create_stemmer()
    # tqdm only adds a progress bar over the words. The former
    # `if word in text` filter was dropped: every word produced by
    # text.split() is a substring of text, so it never excluded anything.
    return ' '.join(stemmer.stem(word) for word in tqdm(text.split()))

# Stem the stopword-free text and store the result in the 'stemming' column.
df['stemming'] = df['stopwords'].apply(sastrawistemmer)

# Display the stemmed column.
df['stemming']

100%|██████████| 121/121 [00:05<00:00, 22.15it/s]


Unnamed: 0,stemming
0,indeks harga saham gabung ihsg kuat dagang kam...


## Memuat Model TF-IDF Vectorizer yang Telah Disimpan

In [32]:
filename_tfidf = 'tfidf_vectorizer2.sav'
# Load the previously saved TF-IDF vectorizer. The context manager closes the
# file handle as soon as unpickling is done.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
with open(filename_tfidf, 'rb') as tfidf_file:
    tfidf_vectorizer = pickle.load(tfidf_file)

## Transformasi Teks ke Representasi TF-IDF

In [33]:
# One document per DataFrame row.
corpus = df['stemming'].tolist()

# Vectorise the text with the loaded vectorizer (transform only, no fitting here).
x_tfidf = tfidf_vectorizer.transform(corpus)
feature_names = tfidf_vectorizer.get_feature_names_out()

# Dense table with one column per vocabulary term.
tfidf_df = pd.DataFrame(x_tfidf.toarray(), columns=feature_names)

# Attach the crawled category and move it to the first column.
cat_df = df["Kategori"]
tfidf_df['Kategori'] = cat_df.values
tfidf_df = tfidf_df[['Kategori'] + [col for col in tfidf_df.columns if col != 'Kategori']]

# Display the TF-IDF table.
tfidf_df

Unnamed: 0,Kategori,abroad,absolut,acara,achmad,adab,adam,adaptif,adb,adi,...,yoppy,yuan,yudha,yuran,yusuf,zayana,zona,zonasi,zulhas,zulkifli
0,ekonomi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Reduksi Dimensi Menggunakan SVD

In [34]:
filename_svd = 'svd_model2.sav'
# Load the previously saved SVD model. The context manager closes the file
# handle as soon as unpickling is done.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
with open(filename_svd, 'rb') as svd_file:
    svd = pickle.load(svd_file)

# Apply the loaded SVD to the new data (transform only, no fitting here).
x_new_lsa = svd.transform(x_tfidf)
# One 'Component_<k>' column per output dimension, numbered from 1.
# NOTE(review): despite the 'train_' prefix, this frame holds the newly
# crawled article, not training data.
train_lsa_df = pd.DataFrame(x_new_lsa, columns=[f'Component_{i+1}' for i in range(x_new_lsa.shape[1])])

# Attach the crawled category and move it to the first column.
train_lsa_df['Kategori'] = cat_df.values
train_lsa_df = train_lsa_df[['Kategori'] + [col for col in train_lsa_df.columns if col != 'Kategori']]

# Display the reduced representation.
train_lsa_df

Unnamed: 0,Kategori,Component_1,Component_2,Component_3,Component_4,Component_5,Component_6,Component_7,Component_8,Component_9,...,Component_71,Component_72,Component_73,Component_74,Component_75,Component_76,Component_77,Component_78,Component_79,Component_80
0,ekonomi,0.056295,0.057171,0.024027,-0.008641,-0.044003,0.048446,0.047858,-0.088722,0.139452,...,-0.010515,-0.003002,0.005995,-0.014249,-0.005825,0.013415,0.02514,-0.005384,0.023052,0.013175


## Konversi Label Kategori ke Bentuk Numerik Menggunakan Label Encoder

In [35]:
# Replace the category name with an integer code.
# NOTE(review): the encoder is fitted here on the crawled article's category
# only, so a single article is always encoded as 0 whatever its category is.
# That code presumably does not follow the label mapping used when
# lr_model2.sav was trained; confirm, and reuse the training-time encoder if
# one was saved.
label_encoder = preprocessing.LabelEncoder()
train_lsa_df['Kategori'] = label_encoder.fit_transform(train_lsa_df['Kategori'])

# Display the frame with the encoded category.
train_lsa_df

Unnamed: 0,Kategori,Component_1,Component_2,Component_3,Component_4,Component_5,Component_6,Component_7,Component_8,Component_9,...,Component_71,Component_72,Component_73,Component_74,Component_75,Component_76,Component_77,Component_78,Component_79,Component_80
0,0,0.056295,0.057171,0.024027,-0.008641,-0.044003,0.048446,0.047858,-0.088722,0.139452,...,-0.010515,-0.003002,0.005995,-0.014249,-0.005825,0.013415,0.02514,-0.005384,0.023052,0.013175


## Memuat Model Logistic Regression yang Telah Disimpan

In [36]:
filename = 'lr_model2.sav'
# Load the previously saved logistic-regression model. The context manager
# closes the file handle as soon as unpickling is done.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
with open(filename, 'rb') as model_file:
    lr_model = pickle.load(model_file)

## Prediksi Kategori Menggunakan Logistic Regression

In [37]:
# Split the frame into the encoded category (target) and the component columns (features).
# NOTE(review): 'Kategori' was encoded by a LabelEncoder fitted on this
# article alone, so y_test may not use the same codes as the model's classes.
y_test = train_lsa_df['Kategori']
x_test = train_lsa_df.drop(['Kategori'], axis=1)
# Predict the class of the article with the loaded model.
y_pred = lr_model.predict(x_test)

# Display the predicted class code.
y_pred



array([0])

## Evaluasi Prediksi: Perbandingan Nilai Aktual dan Prediksi

In [38]:
# Put the encoded actual category next to the predicted class code for comparison.
pred = pd.DataFrame({'Actual value': y_test, 'Predicted value':y_pred})
pred

Unnamed: 0,Actual value,Predicted value
0,0,0
