# Merge File

In [89]:
# Daftar nama file yang akan digabungkan
file_names = [
    'corpus_Nurul_file_6.txt',      # 267 baris
    'corpus_Hafizha_file_7.txt',    # 225 baris
    'corpus_Aida_file_8.txt',       # 236 baris
    'corpus_Maria_file_9.txt',      # 210 baris
    'corpus_Fadhilah_file_10.txt'   # 230 baris
]

# Nama file output
output_file = 'corpus_kelompok_2.txt'

# Menggabungkan file
with open(output_file, 'w', encoding='utf-8') as outfile:
    for file_name in file_names:
        with open(file_name, 'r', encoding='utf-8') as infile:
            # Membaca konten dari file dan menuliskannya ke file output
            outfile.write(infile.read())
            outfile.write('\n')  # Menambahkan newline setelah setiap file

print(f"File berhasil digabungkan menjadi {output_file}")

File berhasil digabungkan menjadi corpus_kelompok_2.txt


# Pre-processing

In [93]:
import re

# Membaca file teks
input_file = '/content/corpus_kelompok_2.txt'
output_file = 'clean_corpus_kelompok_2.txt'

# Fungsi untuk membersihkan tanda baca yang tidak diinginkan dan menambahkan titik jika belum ada
def clean_and_add_period(line):
    # Lowercasing
    line = line.lower()

    # Menghilangkan tanda baca yang tidak diinginkan
    line = re.sub(r'[,\?!’‘()-]', '', line)  # Hilangkan koma, tanda tanya, tanda seru, karakter yang tidak diinginkan, dan tanda kurung
    line = re.sub(r'\.\.+', '.', line)  # Hilangkan titik ganda menjadi satu titik
    line = line.strip() # Hilangkan spasi di awal/akhir

    # Tambahkan titik di akhir kalimat jika tidak ada
    if not line.endswith('.'):
        return line + '.'
    return line

# Baca file, modifikasi, dan tulis kembali ke file baru
with open(input_file, 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Proses setiap baris untuk membersihkan dan menambahkan titik jika belum ada
cleaned_lines = [clean_and_add_period(line) for line in lines]

# Menyimpan hasil ke file baru
with open(output_file, 'w', encoding='utf-8') as file:
    file.write('\n'.join(cleaned_lines))

print("Proses selesai. Hasil perubahan disimpan di 'hasil_perubahan_titik_bersih.txt'.")


Proses selesai. Hasil perubahan disimpan di 'hasil_perubahan_titik_bersih.txt'.


# Tokenisasi

In [94]:
import pandas as pd
import gensim
from gensim.models import Word2Vec

# Convert the cleaned data into a pandas DataFrame
df = pd.DataFrame(cleaned_lines, columns=['Sentence'])

# Show the first few rows of the DataFrame
df.head()

Unnamed: 0,Sentence
0,kèe meunyo na umu kuwoe u gampông lheuh buet nyoe.
1,kèe han kutém lagèe nyan.
2,uroe nyoe di kèe hana kujak saho.
3,lôn tuan han jeuet lonteuka uroe nyoe.
4,kon baroe lôn han jeuet lonjak tapi uroe nyoe.


In [95]:
# Tokenization words in each sentence
def tokenize(sentences):
    tokenized_sentences = []

    for sentence in sentences:
        # Tokenize each sentence into words
        tokens = sentence.split()  # Splitting by whitespace (or use nltk.word_tokenize if preferred)

        # Append the tokens as a cleaned sentence
        tokenized_sentences.append(tokens)

    return tokenized_sentences

# Apply tokenization and remove duplicates
tokenized_sentences = tokenize(cleaned_lines)

# Convert tokenized sentences into a DataFrame for better viewing
df = pd.DataFrame({'Tokenized_Sentence': tokenized_sentences})

# Show the first few rows of the DataFrame
df.head()

Unnamed: 0,Tokenized_Sentence
0,"[kèe, meunyo, na, umu, kuwoe, u, gampông, lheuh, buet, nyoe.]"
1,"[kèe, han, kutém, lagèe, nyan.]"
2,"[uroe, nyoe, di, kèe, hana, kujak, saho.]"
3,"[lôn, tuan, han, jeuet, lonteuka, uroe, nyoe.]"
4,"[kon, baroe, lôn, han, jeuet, lonjak, tapi, uroe, nyoe.]"


In [96]:
# Save the processed data to CSV (optional)
output_path = 'tokenized_aceh_data.csv'
df.to_csv(output_path, index=False)

print(f"Tokenized data has been saved to {output_path}")

Tokenized data has been saved to tokenized_aceh_data.csv


# Embedding

In [97]:
from gensim.models import Word2Vec

# define the model
model_aceh= Word2Vec(
    window=10,
    min_count=1,
    workers=4,
    epochs=10,
)

# Build the vocabulary from the tokenized data
model_aceh.build_vocab(tokenized_sentences, progress_per=1000)

# Train the model
model_aceh.train(tokenized_sentences, total_examples=model_aceh.corpus_count, epochs=model_aceh.epochs)

# Save the model to a file (adjust the path as needed)
model_aceh_path = 'word2vec_aceh.model'
model_aceh.save(model_aceh_path)

print(f"Word2Vec model has been saved to {model_aceh_path}")

# Load the model back (if needed)
model_aceh = Word2Vec.load(model_aceh_path)

Word2Vec model has been saved to word2vec_aceh.model


In [102]:
# Contoh penggunaan model
word = "aneuk"  # Ganti dengan kata yang ingin dicari
try:
    vector = model_aceh.wv[word]
    print(f"Vektor untuk '{word}': {vector}")
except KeyError:
    print(f"Kata '{word}' tidak ditemukan dalam model.")

Vektor untuk 'aneuk': [-0.22806996  0.4579311   0.15495722 -0.02676234  0.16869253 -0.5134049
  0.1983177   0.6475737  -0.41194525 -0.28528365 -0.11295079 -0.6994653
 -0.06088865  0.24311998  0.00823858 -0.2931435   0.06319604 -0.29899505
 -0.08818883 -0.7967335   0.00430636  0.23564915  0.33755338 -0.14866976
 -0.07961657  0.0253758  -0.29232663 -0.16285051 -0.3670021  -0.0307719
  0.43891802 -0.01430646  0.25104216 -0.32166478 -0.14981683  0.6392862
  0.02894215 -0.14810608 -0.14213185 -0.65384686  0.12213247 -0.38939738
 -0.2277098   0.08083359  0.36684433  0.01277732 -0.32687068 -0.17409073
  0.22824557  0.26285988  0.06731458 -0.3689039  -0.02509371 -0.10438187
 -0.24808393  0.15234901  0.10792332 -0.07610061 -0.37145245  0.17609589
  0.11065405  0.17244996  0.1055896   0.02793368 -0.36665702  0.3477891
  0.03798808  0.34615996 -0.3781263   0.4707935  -0.08193389  0.27614945
  0.35015702  0.0164349   0.4623286   0.14087027 -0.07531565  0.02230636
 -0.24017389  0.00442862 -0.238089

In [103]:
# Contoh penggunaan model
word = 'aneuk'  # Ganti dengan kata yang diinginkan

try:
    # Mencari kata-kata yang mirip
    similar_words = model_aceh.wv.most_similar(word, topn=5)  # Mencari 5 kata teratas yang mirip

    # Menampilkan hasil
    print(f"Kata yang mirip dengan '{word}':")
    for similar_word, similarity in similar_words:
        print(f"{similar_word}: {similarity:.4f}")
except KeyError:
    print(f"Kata '{word}' tidak ditemukan dalam model.")

Kata yang mirip dengan 'aneuk':
nyang: 0.9997
nyan: 0.9997
bak: 0.9997
ka: 0.9996
deungon: 0.9996
