In [None]:
# import library
import os
import pickle
import pandas as pd
from data import get_dataset
from app.preprocessing import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Load the tourist-destination dataset through the project's get_dataset() helper
df = get_dataset()

## **Exploratory Data Analysis**

### Check Basic Dataset Information

In [None]:
# Preview the first 5 rows of the dataset
df.head()

In [None]:
# List the column names present in the dataset
df.columns

In [None]:
# Show basic information about the dataset
df.info()

In [None]:
# Count the number of unique values in each column
df.nunique()

### Check Missing Data

In [None]:
# Count the missing values in each column
df.isna().sum()

### Check Duplicate Data

In [None]:
# Show the rows flagged as duplicates.
# df.duplicated() already returns a boolean mask, so it is used directly
# instead of being compared against True.
df[df.duplicated()]

### Check Descriptive Statistics

In [None]:
# Show the descriptive statistics from describe(), transposed so each described column is a row
df.describe().T

In [None]:
# Drop the columns that are not used in the rest of this notebook.
drop_cols = ['Price', 'Rating', 'Time_Minutes', 'Coordinate', 'Lat', 'Long', 'Unnamed: 11', 'Unnamed: 12']

# Rebind `df` instead of mutating it in place, and ignore columns that are
# already absent, so re-running this cell does not raise a KeyError.
df = df.drop(columns=drop_cols, errors='ignore')
df.head()

## **Data Preprocessing**

### Stopword Removal, Stemming, and Lemmatization

In [None]:
# Location of the cached, already-preprocessed dataset. The text pipeline
# only runs when this file does not exist yet.
TAGS_CACHE_PATH = "data/content_based_filtering_data.csv"

if os.path.exists(TAGS_CACHE_PATH):
    # Reuse the dataset that was generated on a previous run.
    df = pd.read_csv(TAGS_CACHE_PATH)
    # read_csv parses empty fields as NaN, so a row whose 'Tags' was written
    # as an empty string comes back as NaN. Restore it to '' so the column
    # stays purely textual for the TF-IDF step below.
    df['Tags'] = df['Tags'].fillna('')
else:
    # Create the 'Tags' column by passing every 'Description' through the
    # project's preprocessing Pipeline.
    # NOTE(review): tfidf_step=False presumably skips the pipeline's own
    # TF-IDF stage — confirm in app.preprocessing.
    df['Tags'] = df['Description'].apply(lambda text: Pipeline(input_text=text, tfidf_step=False))
    # Persist the result so later runs can take the cached branch above.
    df.to_csv(TAGS_CACHE_PATH, index=False)

### TF-IDF (Term Frequency-Inverse Document Frequency)

In [None]:
# Weigh how important each word is in every entry using TF-IDF:
# fit the vectorizer on the 'Tags' column and transform it in one step
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['Tags'])
# Display the dimensions of the resulting matrix
tfidf_matrix.shape

In [None]:
# Show the TF-IDF result as a table indexed by 'Place_Name', with one column
# per vocabulary term, and display 5 randomly sampled rows.
pd.DataFrame(
    # toarray() yields a plain ndarray; todense() returns the legacy np.matrix type
    tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
    index=df['Place_Name']
).sample(5)

## **Save Preprocessing Results**

In [None]:
# Persist the fitted vectorizer and the TF-IDF matrix as pickle files,
# written in this order: vectorizer first, then the matrix.
artifacts = (
    ("data/vectorizer.pkl", vectorizer),
    ("data/tfidf_matrix.pkl", tfidf_matrix),
)
for target_path, payload in artifacts:
    with open(target_path, "wb") as handle:
        pickle.dump(payload, handle)