In [8]:
import json
import csv
import re
from collections import defaultdict, Counter
import nltk
from nltk.util import ngrams

import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

In [10]:
# Text-cleaning helper: drops non-ASCII characters, trims whitespace, lowercases
def clean_text(text):
    """Normalize a raw title into a lowercase, ASCII-only string.

    Every run of non-ASCII characters is deleted (not transliterated), then
    leading/trailing whitespace is stripped and the result is lowercased.

    Args:
        text: Raw string to clean. ``None`` is accepted and treated as an
            empty string, because ``csv.DictReader`` yields ``None`` for
            fields that are missing from a short row.

    Returns:
        The cleaned string; ``''`` when the input is ``None`` or when
        nothing remains after cleaning.
    """
    if text is None:
        return ''
    # The pattern matches any run of characters outside the 7-bit ASCII range.
    cleaned = re.sub(r'[^\x00-\x7F]+', '', text)
    return cleaned.strip().lower()

In [11]:
# N-gram model builder (bigram by default)
def build_ngram_model(data, n=2):
    """Count how often each token follows every (n-1)-token prefix.

    Args:
        data: Iterable of strings; each one is split on whitespace into tokens.
        n: Size of the n-grams to count (the default, 2, gives bigrams).

    Returns:
        A ``defaultdict(Counter)`` mapping each space-joined prefix to a
        ``Counter`` of the tokens observed immediately after that prefix.
    """
    transitions = defaultdict(Counter)
    for sentence in data:
        # Split every n-gram into its leading context and its final token.
        for *context, next_token in ngrams(sentence.split(), n):
            transitions[" ".join(context)][next_token] += 1
    return transitions

In [12]:
# Lookup table of book details; starts empty and is keyed by cleaned title.
book_metadata = dict()

In [13]:
# Cleaned titles, one entry per CSV row (duplicates are kept so that the
# n-gram counts reflect how often each title occurs).
books = []
# newline='' is what the csv module requires: without it, line breaks embedded
# in quoted fields are not interpreted correctly.
with open('Books.csv', mode='r', encoding='utf-8', newline='') as file:
    csv_reader = csv.DictReader(file)
    for row in csv_reader:
        # Take the title from the 'Book-Title' column and preprocess it.
        # DictReader fills fields missing from a short row with None, so
        # every field falls back to '' before any string method is called.
        cleaned_title = clean_text(row['Book-Title'] or '')
        if cleaned_title:  # Skip rows whose title is empty after cleaning
            books.append(cleaned_title)
            # Store the book's metadata under its cleaned title; a later row
            # with the same cleaned title overwrites the earlier entry.
            book_metadata[cleaned_title] = {
                'title': (row['Book-Title'] or '').strip(),
                'author': (row['Book-Author'] or '').strip(),
                'year': (row['Year-Of-Publication'] or '').strip(),
                'publisher': (row['Publisher'] or '').strip(),
                'image_url': (row['Image-URL-M'] or '').strip()
            }

In [14]:
# Fit the bigram model (n=2) on the cleaned book titles
ngram_model = build_ngram_model(data=books, n=2)

In [6]:
# Persist the n-gram counts and the book metadata as JSON files
for target_path, payload in (
    ('ngram_model.json', ngram_model),
    ('book_metadata.json', book_metadata),
):
    with open(target_path, 'w') as handle:
        json.dump(payload, handle)