# In [None]:

import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report
import joblib

# Absolute, machine-specific path to the Excel workbook used for training.
file_path = r"C:\Users\Naomi Natasya\Documents\streamlit-category-product\data\gabungan_data20-24.xlsx"
# Load the workbook into a DataFrame; the code below reads its 'Nama Produk'
# and 'Kategori Produk Baru Final' columns.
data = pd.read_excel(file_path)

def clean_product_name(name):
    """Normalise a product name before it is vectorised.

    Non-string values are returned untouched. For strings, the steps are:

    1. every '-', '/', 'x' and 'X' becomes a space (this also splits words
       that contain the letter x);
    2. all digits are removed;
    3. every character that is not an ASCII letter or whitespace is removed;
    4. the text is lower-cased;
    5. one-letter words are discarded and the rest are joined by single
       spaces.
    """
    if not isinstance(name, str):
        return name

    text = re.sub(r'[-/xX]', ' ', name)
    text = re.sub(r'\d+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()

    kept_tokens = []
    for token in text.split():
        if len(token) > 1:
            kept_tokens.append(token)

    joined = ' '.join(kept_tokens)
    return re.sub(r'\s+', ' ', joined).strip()
# Normalise every product name with the cleaning function defined above.
data['Nama Produk'] = data['Nama Produk'].apply(clean_product_name)

# Keep only rows with a usable name and a category label.
# A name is usable when it is a non-empty string after cleaning: non-string
# cells (e.g. numeric Excel cells) would make TfidfVectorizer raise, and empty
# names would only contribute all-zero feature rows.
has_usable_name = data['Nama Produk'].apply(
    lambda value: isinstance(value, str) and value != ''
).astype(bool)
data_cleaned = data[has_usable_name].dropna(subset=['Nama Produk', 'Kategori Produk Baru Final'])

# Keep only categories that have at least 30 rows.
kategori_counts = data_cleaned['Kategori Produk Baru Final'].value_counts()
kategori_terpilih = kategori_counts[kategori_counts >= 30].index
data_filtered = data_cleaned[data_cleaned['Kategori Produk Baru Final'].isin(kategori_terpilih)].reset_index(drop=True)

# Encode the category labels as integers.
label_encoder = LabelEncoder()
data_filtered['Kategori Produk Encoded'] = label_encoder.fit_transform(data_filtered['Kategori Produk Baru Final'])

# TF-IDF features built from the cleaned product names.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data_filtered['Nama Produk'])
y = data_filtered['Kategori Produk Encoded']

# Stratified 80/20 split so every category keeps the same share in the train
# and test sets (each retained category has >= 30 rows, so this is always
# feasible).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Linear SVM; probability=True additionally fits the probability calibration.
svm = SVC(kernel='linear', probability=True)
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)

# Report, labelled with the original category names instead of the integer
# codes. `labels` is passed explicitly so it always lines up with
# `target_names`.
print("\nClassification Report - SVM Model:")
print(classification_report(
    y_test,
    y_pred_svm,
    labels=list(range(len(label_encoder.classes_))),
    target_names=[str(category) for category in label_encoder.classes_],
))

# Persist the model and the objects needed at prediction time.
joblib.dump(svm, 'svm_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')
joblib.dump(X_train, 'X_train.pkl')
joblib.dump(X_test, 'X_test.pkl')

# Reload the artefacts that the prediction code below relies on.
# NOTE: joblib.load unpickles the file, so only load files from a trusted source.
svm = joblib.load('svm_model.pkl')
vectorizer = joblib.load('vectorizer.pkl')
label_encoder = joblib.load('label_encoder.pkl')
X_train = joblib.load('X_train.pkl')


from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def clean_product_name(name):
    """Normalise a product name the same way the training data was normalised.

    This definition replaces the earlier one in this file and is the one used
    at prediction time. It must produce the same tokens as the cleaning that
    was applied before the TF-IDF vectorizer was fitted; otherwise words such
    as "box" or "500ml" never match the fitted vocabulary and are ignored.

    Non-string values are returned unchanged. For strings:

    * every '-', '/', 'x' and 'X' becomes a space (this also splits words
      containing the letter x, exactly as the training-time cleaning does);
    * digits and every character that is not an ASCII letter or whitespace
      are removed;
    * the text is lower-cased, one-letter words are dropped and the remaining
      words are joined by single spaces.

    The function is idempotent, so cleaning an already-cleaned name is safe.
    """
    if not isinstance(name, str):
        return name
    name = re.sub(r'[-/xX]', ' ', name)
    name = re.sub(r'\d+', '', name)
    name = re.sub(r'[^a-zA-Z\s]', '', name)
    words = [word for word in name.lower().split() if len(word) > 1]
    return ' '.join(words)

def predict_category_with_threshold(product_name, model, vectorizer, label_encoder, X_train, threshold=0.5):
    """Predict a product category, or report that no close training match exists.

    The product name is cleaned and vectorised, then compared with every row
    of ``X_train`` by cosine similarity. If the best similarity is below
    ``threshold``, the name is considered too unlike anything seen in training
    and "Not Available" is returned instead of a model prediction.

    Args:
        product_name: Raw product name. Non-string values (numbers, NaN,
            None) cannot be vectorised and yield "Not Available".
        model: Fitted classifier exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.
        label_encoder: Fitted encoder exposing ``inverse_transform``.
        X_train: Matrix the query vector is compared against; presumably the
            training features produced by the same vectorizer (verify
            against the caller).
        threshold: Minimum best cosine similarity required to trust the
            prediction.

    Returns:
        The decoded category label, or the string "Not Available".
    """
    # Guard first: a non-text cell would pass through cleaning unchanged and
    # make vectorizer.transform raise.
    if not isinstance(product_name, str):
        return "Not Available"

    product_name_cleaned = clean_product_name(product_name)
    product_name_vectorized = vectorizer.transform([product_name_cleaned])

    # Best similarity between the query and any row of X_train.
    cosine_similarities = cosine_similarity(product_name_vectorized, X_train)
    max_similarity = np.max(cosine_similarities)
    if max_similarity < threshold:
        return "Not Available"

    predicted_category_encoded = model.predict(product_name_vectorized)
    predicted_category = label_encoder.inverse_transform(predicted_category_encoded)
    return predicted_category[0]


def process_and_predict(file_path):
    """Predict a category for every product in an Excel file and save the result.

    Reads ``file_path``, drops rows without a 'Nama Produk' value, predicts a
    category for each remaining product name using the module-level ``svm``,
    ``vectorizer``, ``label_encoder`` and ``X_train`` objects, and writes the
    table (original product names plus a 'Kategori Produk' column) to a fixed
    output workbook.

    Args:
        file_path: Path of the Excel file to read; it must contain a
            'Nama Produk' column.

    Returns:
        None. When the 'Nama Produk' column is missing, a message is printed
        and nothing is written.
    """
    data = pd.read_excel(file_path)

    if 'Nama Produk' not in data.columns:
        print("Data tidak memiliki kolom 'Nama Produk'.")
        return

    data = data.dropna(subset=['Nama Produk'])
    # Keep the untouched names for the output; predictions use a cleaned copy.
    data['Nama Produk Asli'] = data['Nama Produk']
    data['Nama Produk Bersih'] = data['Nama Produk'].apply(clean_product_name)

    data['Kategori Produk'] = data['Nama Produk Bersih'].apply(
        lambda name: predict_category_with_threshold(name, svm, vectorizer, label_encoder, X_train, threshold=0.5)
    )

    # Drop the working columns and restore the original names under the
    # original column header.
    data = data.drop(columns=['Nama Produk', 'Nama Produk Bersih'])
    data = data.rename(columns={'Nama Produk Asli': 'Nama Produk'})

    output_file = r"C:\Users\Naomi Natasya\Downloads\predicted_product_categories_svm.xlsx"
    data.to_excel(output_file, index=False)
    print(f"File dengan prediksi kategori telah disimpan sebagai '{output_file}'.")


# The call lives outside the function: placed inside the body it made the
# function call itself unconditionally and never reach the code that writes
# the output.
if __name__ == "__main__":
    file_path_predict = r"C:\Users\Naomi Natasya\Downloads\20-22.xlsx"  # path of the data to predict
    process_and_predict(file_path_predict)
