In [None]:
import re
from collections import Counter

# 1. Define the sample text (contoh_raw)
# ==============================================================================
contoh_raw = """
Python is an interpreted high-level general-purpose programming language. Its design
philosophy emphasizes code readability with its use of significant indentation.
Its language constructs as well as its object-oriented approach aim to
help programmers write clear, logical code for small and large-scale projects
"""

# 2. Sentence segmentation
# ==============================================================================
# Basic regex approach: a sentence boundary is any whitespace run that directly
# follows '.', '!' or '?'.
sentence_boundary = re.compile(r'(?<=[.!?])\s+')

# Trim the surrounding blank lines and turn the hard line breaks of the
# triple-quoted string into spaces before splitting.
flattened_text = contoh_raw.strip().replace('\n', ' ')
sentences = sentence_boundary.split(flattened_text)

print(f"Jumlah Kalimat: {len(sentences)}")
print("-" * 50)

# Accumulator for the per-sentence word-frequency counters
frequency_matrix = []

# Basic set of English stopwords, grouped by word class for readability.
# Membership is all that matters here, so the grouping has no effect on lookups.
english_stopwords = {
    # personal, possessive and reflexive pronouns
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves',
    'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves',
    'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself',
    'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
    # interrogatives and demonstratives
    'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those',
    # auxiliary verbs
    'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing',
    # articles, conjunctions and prepositions
    'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while',
    'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between',
    'into', 'through', 'during', 'before', 'after', 'above', 'below',
    'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under',
    # adverbs, quantifiers and negations
    'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how',
    'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such',
    'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very',
    # modals, contractions and the fragments left over when a contraction is split
    's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now',
    'd', 'll', 'm', 'o', 're', 've', 'y',
    'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't",
    'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't",
    'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't",
    'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't",
    'won', "won't", 'wouldn', "wouldn't",
}


# 3. & 4. Tokenization, cleaning, and frequency counting
# ==============================================================================
# Word tokens are maximal runs of word characters; punctuation (including
# hyphens) acts as a separator.
token_pattern = re.compile(r'\b\w+\b')

for i, sentence in enumerate(sentences):
    # a. Lowercase the sentence and extract its word tokens
    tokens = token_pattern.findall(sentence.lower())

    # b. Cleaning: keep only the tokens that are not stopwords
    cleaned_words = list(filter(lambda token: token not in english_stopwords, tokens))

    # c. Count how often each remaining word occurs in this sentence
    word_freq = Counter(cleaned_words)
    frequency_matrix.append(word_freq)

    # Report the sentence, its cleaned tokens and its counts, then a separator
    print(
        f"Kalimat {i+1} (Original): {sentence}",
        f"Kalimat {i+1} (Bersih): {cleaned_words}",
        f"Matriks Frekuensi Kalimat {i+1}: {dict(word_freq)}",
        "-" * 50,
        sep="\n",
    )

# 5. Final result: word-frequency matrix, one row per sentence
# ==============================================================================
print("\n## Hasil Matriks Frekuensi Kata per Kalimat (setelah Stopword Removal)")
print("Matriks Frekuensi:")
for i, freq in enumerate(frequency_matrix):
    print(f"Kalimat {i+1}: {dict(freq)}")

Jumlah Kalimat: 3
--------------------------------------------------
Kalimat 1 (Original): Python is an interpreted high-level general-purpose programming language.
Kalimat 1 (Bersih): ['python', 'interpreted', 'high', 'level', 'general', 'purpose', 'programming', 'language']
Matriks Frekuensi Kalimat 1: {'python': 1, 'interpreted': 1, 'high': 1, 'level': 1, 'general': 1, 'purpose': 1, 'programming': 1, 'language': 1}
--------------------------------------------------
Kalimat 2 (Original): Its design philosophy emphasizes code readability with its use of significant indentation.
Kalimat 2 (Bersih): ['design', 'philosophy', 'emphasizes', 'code', 'readability', 'use', 'significant', 'indentation']
Matriks Frekuensi Kalimat 2: {'design': 1, 'philosophy': 1, 'emphasizes': 1, 'code': 1, 'readability': 1, 'use': 1, 'significant': 1, 'indentation': 1}
--------------------------------------------------
Kalimat 3 (Original): Its language constructs as well as its object-oriented approach aim to

TUGAS

In [None]:
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Download the NLTK stopwords corpus (it contains the Indonesian list) only when
# it is not already available locally.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    # nltk.data.find reports a missing resource by raising LookupError, so that
    # is the exception that must be caught for the download fallback to run.
    nltk.download('stopwords')

# Input text
teks_raw = """
Dinas Kesehatan Kabupaten/Kota dan Puskesmas juga dapat membuat pos pelayanan
vaksinasi COVID-19. Dianjurkan agar setiap sasaran mencari informasi terlebih dahulu terkait
jadwal layanan masing-masing fasilitas pelayanan kesehatan atau pos pelayanan vaksinasi
"""

# 1. Sentence segmentation (documents)
# Each sentence is treated as one document in the corpus. The corpus is derived
# from teks_raw instead of being retyped by hand, so the two cannot drift apart:
# every whitespace run (including the line breaks of the triple-quoted string) is
# collapsed to a single space, then the text is split on the whitespace that
# follows sentence-ending punctuation. Empty pieces are dropped so an empty
# teks_raw yields an empty corpus.
korpus = [
    kalimat
    for kalimat in re.split(r'(?<=[.!?])\s+', ' '.join(teks_raw.split()))
    if kalimat
]

# Get the Indonesian stopword list from the NLTK stopwords corpus
list_stopwords = stopwords.words('indonesian')

In [None]:
# Bag-of-words vectorizer that ignores the Indonesian stopwords
cv = CountVectorizer(stop_words=list_stopwords)

# Fit and transform in one step: learn the vocabulary from the corpus and count
# how often each term occurs in every document
word_count_matrix = cv.fit_transform(korpus)

# Vocabulary terms, in the column order of the count matrix
feature_names = cv.get_feature_names_out()

# One row label per document in the corpus
count_row_labels = [f"Kalimat {i+1}" for i, _ in enumerate(korpus)]

# Densify the result and wrap it in a labelled DataFrame for a tidier display
df_count = pd.DataFrame(
    data=word_count_matrix.toarray(),
    index=count_row_labels,
    columns=feature_names,
)

print("Matriks Frekuensi Kata (Term Frequency Matrix):")
print(df_count)

Matriks Frekuensi Kata (Term Frequency Matrix):
           19  covid  dianjurkan  dinas  fasilitas  informasi  jadwal  \
Kalimat 1   1      1           0      1          0          0       0   
Kalimat 2   0      0           1      0          1          1       1   

           kabupaten  kesehatan  kota  layanan  mencari  pelayanan  pos  \
Kalimat 1          1          1     1        0        0          1    1   
Kalimat 2          0          1     0        1        1          2    1   

           puskesmas  sasaran  terkait  vaksinasi  
Kalimat 1          1        0        0          1  
Kalimat 2          0        1        1          1  




In [None]:
# TF-IDF vectorizer that ignores the Indonesian stopwords
tfidf_vectorizer = TfidfVectorizer(stop_words=list_stopwords)

# Fit and transform in one step: learn the vocabulary from the corpus and
# compute the TF-IDF weight of each term in every document
tfidf_matrix = tfidf_vectorizer.fit_transform(korpus)

# Vocabulary terms, in the column order of the TF-IDF matrix
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()

# One row label per document in the corpus
tfidf_row_labels = [f"Kalimat {i+1}" for i, _ in enumerate(korpus)]

# Densify the result and wrap it in a labelled DataFrame
df_tfidf = pd.DataFrame(
    data=tfidf_matrix.toarray(),
    index=tfidf_row_labels,
    columns=tfidf_feature_names,
)

print("\nMatriks TF-IDF:")
print(df_tfidf)


Matriks TF-IDF:
                 19     covid  dianjurkan     dinas  fasilitas  informasi  \
Kalimat 1  0.353003  0.353003    0.000000  0.353003   0.000000   0.000000   
Kalimat 2  0.000000  0.000000    0.294325  0.000000   0.294325   0.294325   

             jadwal  kabupaten  kesehatan      kota   layanan   mencari  \
Kalimat 1  0.000000   0.353003   0.251164  0.353003  0.000000  0.000000   
Kalimat 2  0.294325   0.000000   0.209415  0.000000  0.294325  0.294325   

           pelayanan       pos  puskesmas   sasaran   terkait  vaksinasi  
Kalimat 1   0.251164  0.251164   0.353003  0.000000  0.000000   0.251164  
Kalimat 2   0.418830  0.209415   0.000000  0.294325  0.294325   0.209415  


