In [5]:
import pandas as pd
import re # Untuk regular expressions (membersihkan teks)
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer # Atau WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB # Contoh model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import zipfile
import os

# Mungkin perlu mengunduh data NLTK jika belum ada
# nltk.download('stopwords')
# nltk.download('punkt') # Diperlukan untuk tokenisasi

In [6]:
import pandas as pd
import zipfile
import os
from google.colab import drive # Import library untuk mount Google Drive

# --- LANGKAH PENTING: Mount Google Drive (jika belum) ---
# Pastikan Google Drive sudah terhubung sebelum menjalankan ini
try:
    # Cek jika sudah di-mount untuk menghindari error jika dijalankan ulang
    if not os.path.isdir("/content/drive/MyDrive"):
      drive.mount('/content/drive')
      print("Google Drive berhasil di-mount.")
    else:
      print("Google Drive sudah di-mount sebelumnya.")
except Exception as e:
    print(f"Gagal me-mount Google Drive: {e}")
    # Pertimbangkan untuk menghentikan eksekusi jika Drive gagal di-mount
    # raise SystemExit("Mounting Google Drive gagal, tidak dapat melanjutkan.")

# --- Path File Zip dan Folder Tujuan Ekstraksi ---
# Path file zip Anda di Google Drive
zip_file_path = '/content/drive/MyDrive/senpro-ai/resume-dataset.zip'
# Path folder tujuan ekstraksi di Google Drive (folder yang sama)
extract_path = '/content/drive/MyDrive/senpro-ai/'

# --- Proses Ekstraksi Langsung ke Google Drive dan Memuat Data ---
try:
    # Pastikan folder tujuan (extract_path) ada. Jika tidak, script zipfile akan membuatnya.
    # Cek apakah file zip ada sebelum mencoba membuka
    if not os.path.exists(zip_file_path):
         print(f"Error: File zip tidak ditemukan di path: {zip_file_path}")
         print("Pastikan path sudah benar dan Google Drive sudah di-mount.")
    else:
        print(f"Mencoba membuka file zip dari: {zip_file_path}")
        with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
            # Ekstrak semua isi zip ke folder extract_path di Google Drive
            zip_ref.extractall(extract_path)
        print(f"File berhasil diekstrak ke: {extract_path}")

        # --- Mencari dan Memuat File CSV dari Lokasi Ekstraksi ---
        csv_file = None
        print(f"Mencari file CSV di dalam: {extract_path}")

        # List file/folder yang ada di lokasi ekstraksi (langsung di Google Drive)
        # Perlu waktu beberapa saat agar Drive sinkron setelah ekstraksi
        import time
        time.sleep(5) # Beri jeda sedikit agar Drive sync (opsional, bisa disesuaikan)

        extracted_items = os.listdir(extract_path)
        print(f"Item di {extract_path}: {extracted_items}")

        for item in extracted_items:
             full_item_path = os.path.join(extract_path, item)
             # Cari file CSV langsung di folder senpro-ai
             if os.path.isfile(full_item_path) and item.endswith(".csv"):
                 csv_file = full_item_path
                 print(f"File CSV ditemukan: {csv_file}")
                 break
             # Jika CSV ada di dalam subfolder hasil ekstraksi
             elif os.path.isdir(full_item_path) and item != 'resume-dataset.zip': # Jangan masuk ke zip lagi
                print(f"Memeriksa subfolder: {full_item_path}")
                for sub_item in os.listdir(full_item_path):
                     if sub_item.endswith(".csv"):
                         csv_file = os.path.join(full_item_path, sub_item)
                         print(f"File CSV ditemukan di subfolder: {csv_file}")
                         break
                if csv_file: break # Keluar loop luar jika sudah ketemu


        if csv_file:
            print(f"Mencoba memuat file CSV dari: {csv_file}")
            df = pd.read_csv(csv_file)
            print("\nDataset berhasil dimuat:")
            print(df.head())
            print("\nInformasi Dataset:")
            df.info()
            # Anda bisa melanjutkan ke langkah berikutnya dengan DataFrame df ini
        else:
            print(f"Error: File CSV tidak ditemukan di {extract_path} atau subfoldernya setelah ekstraksi.")
            print("Pastikan file zip berisi file CSV dan cek isi folder `senpro-ai` di Google Drive Anda.")

except zipfile.BadZipFile:
    print(f"Error: File di {zip_file_path} bukan file zip yang valid atau rusak.")
    print("Coba unduh ulang atau unggah ulang file tersebut ke Google Drive.")
except FileNotFoundError:
    print(f"Error kritis: Path tidak ditemukan. Pastikan {extract_path} atau {zip_file_path} valid.")
except Exception as e:
     print(f"Terjadi error lain saat ekstraksi atau load data: {e}")

Google Drive sudah di-mount sebelumnya.
Mencoba membuka file zip dari: /content/drive/MyDrive/senpro-ai/resume-dataset.zip
File berhasil diekstrak ke: /content/drive/MyDrive/senpro-ai/
Mencari file CSV di dalam: /content/drive/MyDrive/senpro-ai/
Item di /content/drive/MyDrive/senpro-ai/: ['resume-dataset.zip', 'Resume', 'data']
Memeriksa subfolder: /content/drive/MyDrive/senpro-ai/Resume
File CSV ditemukan di subfolder: /content/drive/MyDrive/senpro-ai/Resume/Resume.csv
Mencoba memuat file CSV dari: /content/drive/MyDrive/senpro-ai/Resume/Resume.csv

Dataset berhasil dimuat:
         ID                                         Resume_str  \
0  16852973           HR ADMINISTRATOR/MARKETING ASSOCIATE\...   
1  22323967           HR SPECIALIST, US HR OPERATIONS      ...   
2  33176873           HR DIRECTOR       Summary      Over 2...   
3  27018550           HR SPECIALIST       Summary    Dedica...   
4  17812897           HR MANAGER         Skill Highlights  ...   

                     

In [12]:
!pip install nltk
import nltk

# --- TAMBAHKAN BARIS INI ---
# Mengunduh paket stopwords, punkt, and punkt_tab (untuk tokenizer) dari NLTK
try:
    nltk.data.find('corpora/stopwords')
    print("Resource 'stopwords' sudah ada.")
except LookupError:
    print("Resource 'stopwords' belum ada, mengunduh...")
    nltk.download('stopwords', quiet=True) # quiet=True agar tidak terlalu verbose
    print("Selesai mengunduh 'stopwords'.")

try:
    nltk.data.find('tokenizers/punkt')
    print("Resource 'punkt' sudah ada.")
except LookupError:
    print("Resource 'punkt' belum ada, mengunduh...")
    nltk.download('punkt', quiet=True)
    print("Selesai mengunduh 'punkt'.")

# Download punkt_tab dataset
try:
    nltk.data.find('tokenizers/punkt_tab')
    print("Resource 'punkt_tab' sudah ada.")
except LookupError:
    print("Resource 'punkt_tab' belum ada, mengunduh...")
    nltk.download('punkt_tab', quiet=True)
    print("Selesai mengunduh 'punkt_tab'.")

# --------------------------

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# --- Sisa kode Langkah 3 Anda (seperti sebelumnya) ---

# Gunakan DataFrame 'df' yang sudah dimuat dari langkah sebelumnya
if 'df' in locals():
    # Ambil kolom yang relevan
    df['Resume_str'] = df['Resume_str'].fillna('')
    df['Category'] = df['Category'].fillna('')

    # Definisikan ulang fungsi pembersihan jika perlu
    stop_words = set(stopwords.words('english')) # Baris ini sekarang seharusnya bekerja
    def clean_resume_text(text):
        text = re.sub(r'http\S+', ' ', text)
        text = re.sub(r'@\S+', ' ', text)
        text = re.sub(r'#\S+', ' ', text)
        text = re.sub(r'RT|cc', ' ', text)
        text = re.sub(r'[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"""), ' ', text)
        text = re.sub(r'[^\x00-\x7f]',r' ', text)
        text = re.sub(r'\s+', ' ', text)
        text = text.lower()
        words = word_tokenize(text) # Membutuhkan 'punkt'
        cleaned_words = [word for word in words if word.isalpha() and word not in stop_words and len(word) > 1]
        return ' '.join(cleaned_words)

    # Terapkan pembersihan ke kolom 'Resume_str'
    print("Membersihkan teks resume...")
    df['Cleaned_Resume'] = df['Resume_str'].apply(clean_resume_text)

    print("\nContoh Resume setelah dibersihkan:")
    print(df[['Resume_str', 'Cleaned_Resume']].head())

    print("\nJumlah data per Kategori:")
    print(df['Category'].value_counts())

else:
    print("Error: DataFrame 'df' tidak ditemukan. Jalankan langkah pemuatan data terlebih dahulu.")

Resource 'stopwords' sudah ada.
Resource 'punkt' sudah ada.
Resource 'punkt_tab' belum ada, mengunduh...
Selesai mengunduh 'punkt_tab'.
Membersihkan teks resume...

Contoh Resume setelah dibersihkan:
                                          Resume_str  \
0           HR ADMINISTRATOR/MARKETING ASSOCIATE\...   
1           HR SPECIALIST, US HR OPERATIONS      ...   
2           HR DIRECTOR       Summary      Over 2...   
3           HR SPECIALIST       Summary    Dedica...   
4           HR MANAGER         Skill Highlights  ...   

                                      Cleaned_Resume  
0  hr administrator marketing associate hr admini...  
1  hr specialist us hr operations summary versati...  
2  hr director summary years experience recruitin...  
3  hr specialist summary dedicated driven dynamic...  
4  hr manager skill highlights hr skills hr depar...  

Jumlah data per Kategori:
Category
INFORMATION-TECHNOLOGY    120
BUSINESS-DEVELOPMENT      120
ADVOCATE                  118
CHEF   

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

if 'df' in locals() and 'Cleaned_Resume' in df.columns:
    # Inisialisasi TF-IDF Vectorizer
    # Anda bisa mengatur parameter seperti max_features, ngram_range, dll.
    tfidf_vectorizer = TfidfVectorizer(max_features=5000, # Batasi jumlah fitur (kata unik)
                                       ngram_range=(1,2)) # Pertimbangkan 1 kata dan 2 kata berurutan

    print("\nMembuat matriks TF-IDF...")
    # Buat matriks TF-IDF dari teks resume yang bersih
    X = tfidf_vectorizer.fit_transform(df['Cleaned_Resume'])

    # Target variabel (kategori pekerjaan)
    y = df['Category']

    print("Matriks TF-IDF (fitur) berhasil dibuat.")
    print("Ukuran matriks X:", X.shape)
    print("Ukuran target y:", y.shape)

    # Simpan vocabulary (kata-kata unik) jika perlu dilihat
    # feature_names = tfidf_vectorizer.get_feature_names_out()
    # print("Contoh fitur (kata):", feature_names[:20])

else:
    print("Error: DataFrame 'df' atau kolom 'Cleaned_Resume' tidak ditemukan.")


Membuat matriks TF-IDF...
Matriks TF-IDF (fitur) berhasil dibuat.
Ukuran matriks X: (2484, 5000)
Ukuran target y: (2484,)


In [14]:
from sklearn.model_selection import train_test_split

if 'X' in locals() and 'y' in locals():
    # Bagi data (80% latih, 20% uji)
    # stratify=y penting untuk menjaga proporsi kelas di kedua set jika data tidak seimbang
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.2,
                                                        random_state=42,
                                                        stratify=y)

    print("\nData berhasil dibagi:")
    print("Ukuran data latih (X_train):", X_train.shape)
    print("Ukuran data uji (X_test):", X_test.shape)
    print("Ukuran label latih (y_train):", y_train.shape)
    print("Ukuran label uji (y_test):", y_test.shape)
else:
    print("Error: Variabel X atau y belum terdefinisi. Jalankan langkah sebelumnya.")


Data berhasil dibagi:
Ukuran data latih (X_train): (1987, 5000)
Ukuran data uji (X_test): (497, 5000)
Ukuran label latih (y_train): (1987,)
Ukuran label uji (y_test): (497,)


In [15]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

if 'X_train' in locals():
    # Pilih model
    model = MultinomialNB()

    print("\nMelatih model Multinomial Naive Bayes...")
    # Latih model
    model.fit(X_train, y_train)
    print("Model berhasil dilatih.")

    # Evaluasi cepat di data latih (opsional, untuk cek overfitting)
    # y_train_pred = model.predict(X_train)
    # print(f"Akurasi di data latih: {accuracy_score(y_train, y_train_pred):.4f}")

else:
    print("Error: Data latih (X_train, y_train) belum terdefinisi.")


Melatih model Multinomial Naive Bayes...
Model berhasil dilatih.


In [16]:
if 'model' in locals() and 'X_test' in locals():
    print("\nMengevaluasi model pada data uji...")
    # Prediksi pada data uji
    y_pred = model.predict(X_test)

    # Evaluasi performa
    accuracy = accuracy_score(y_test, y_pred)
    print(f"\nAkurasi Model pada Data Uji: {accuracy:.4f}")

    print("\nLaporan Klasifikasi Lengkap:")
    # target_names bisa diisi dengan list unik dari df['Category'].unique() jika mau nama kelas di laporan
    print(classification_report(y_test, y_pred)) #, target_names=df['Category'].unique()))

    # Bisa juga tampilkan Confusion Matrix
    # from sklearn.metrics import confusion_matrix
    # import seaborn as sns
    # import matplotlib.pyplot as plt
    # cm = confusion_matrix(y_test, y_pred)
    # plt.figure(figsize=(12, 10))
    # sns.heatmap(cm, annot=True, fmt='d', xticklabels=model.classes_, yticklabels=model.classes_)
    # plt.xlabel('Predicted')
    # plt.ylabel('Actual')
    # plt.show()

else:
     print("Error: Model atau data uji belum siap untuk evaluasi.")


Mengevaluasi model pada data uji...

Akurasi Model pada Data Uji: 0.5714

Laporan Klasifikasi Lengkap:
                        precision    recall  f1-score   support

            ACCOUNTANT       0.53      0.88      0.66        24
              ADVOCATE       0.33      0.42      0.37        24
           AGRICULTURE       1.00      0.08      0.14        13
               APPAREL       0.50      0.05      0.10        19
                  ARTS       0.75      0.14      0.24        21
            AUTOMOBILE       0.00      0.00      0.00         7
              AVIATION       0.83      0.62      0.71        24
               BANKING       0.87      0.57      0.68        23
                   BPO       0.00      0.00      0.00         4
  BUSINESS-DEVELOPMENT       0.41      0.88      0.56        24
                  CHEF       0.81      0.71      0.76        24
          CONSTRUCTION       0.68      0.77      0.72        22
            CONSULTANT       1.00      0.09      0.16        23

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
import joblib
import os

# Pastikan Google Drive masih terhubung
# Tentukan path untuk menyimpan model dan vectorizer di folder senpro-ai
save_dir = '/content/drive/MyDrive/senpro-ai/'
model_path = os.path.join(save_dir, 'resume_category_model.joblib')
vectorizer_path = os.path.join(save_dir, 'tfidf_vectorizer.joblib')

if 'model' in locals() and 'tfidf_vectorizer' in locals():
    try:
        print(f"\nMenyimpan model ke: {model_path}")
        joblib.dump(model, model_path)
        print("Model berhasil disimpan.")

        print(f"Menyimpan TF-IDF vectorizer ke: {vectorizer_path}")
        joblib.dump(tfidf_vectorizer, vectorizer_path)
        print("Vectorizer berhasil disimpan.")

        print("\nKedua file (model dan vectorizer) telah disimpan di Google Drive.")
        print("Anda akan memerlukan kedua file ini untuk aplikasi backend/frontend.")

    except Exception as e:
        print(f"Terjadi error saat menyimpan file: {e}")
else:
    print("Error: Model atau TF-IDF Vectorizer belum dilatih/dibuat.")


Menyimpan model ke: /content/drive/MyDrive/senpro-ai/resume_category_model.joblib
Model berhasil disimpan.
Menyimpan TF-IDF vectorizer ke: /content/drive/MyDrive/senpro-ai/tfidf_vectorizer.joblib
Vectorizer berhasil disimpan.

Kedua file (model dan vectorizer) telah disimpan di Google Drive.
Anda akan memerlukan kedua file ini untuk aplikasi backend/frontend.


In [6]:
!pip install PyPDF2
!pip install nltk
import nltk
import PyPDF2
import joblib
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import os

# Download stopwords if not already present
nltk.download('stopwords', quiet=True)
# Download the 'punkt_tab' dataset
nltk.download('punkt_tab', quiet=True)

# Load model dan vectorizer (sesuaikan path jika perlu)
model_path = '/content/drive/MyDrive/senpro-ai/resume_category_model.joblib'
vectorizer_path = '/content/drive/MyDrive/senpro-ai/tfidf_vectorizer.joblib'
model = joblib.load(model_path)
tfidf_vectorizer = joblib.load(vectorizer_path)

stop_words = set(stopwords.words('english'))

def clean_resume_text(text):
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'@\S+', ' ', text)
    text = re.sub(r'#\S+', ' ', text)
    text = re.sub(r'RT|cc', ' ', text)
    text = re.sub(r'[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', text)
    text = re.sub(r'[^\x00-\x7f]',r' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    words = word_tokenize(text)
    cleaned_words = [word for word in words if word.isalpha() and word not in stop_words and len(word) > 1]
    return ' '.join(cleaned_words)

def extract_text_from_pdf(pdf_path):
    text = ""
    try:
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            for page_num in range(len(reader.pages)):
                text += reader.pages[page_num].extract_text()
    except Exception as e:
        print(f"Error extracting text from PDF: {e}")
    return text

def predict_category(resume_text):
    cleaned_text = clean_resume_text(resume_text)
    text_vectorized = tfidf_vectorizer.transform([cleaned_text])
    predicted_category = model.predict(text_vectorized)[0]
    return predicted_category

# Contoh: Asumsikan PDF ada di subfolder "Resume" di Google Drive Anda
# Ambil daftar file di direktori
pdf_directory = '/content/drive/MyDrive/senpro-ai/'  # Path to the directory containing the PDF
pdf_file_name = 'Resume.pdf'  # Name of the PDF file

# Define pdf_files here by listing the PDF files in the directory
pdf_files = [f for f in os.listdir(pdf_directory) if f.endswith('.pdf')]


if pdf_files:
    for pdf_file in pdf_files:
        pdf_file_path = os.path.join(pdf_directory, pdf_file)
        if os.path.exists(pdf_file_path):
            resume_text = extract_text_from_pdf(pdf_file_path)
            if resume_text:
                predicted_category = predict_category(resume_text)
                print(f"\nFile: {pdf_file}")
                print(f"Kategori yang Diprediksi: {predicted_category}")
            else:
                print(f"Gagal mengekstrak teks dari {pdf_file}")
        else:
            print(f"Error: File tidak ditemukan di {pdf_file_path}")
else:
    print("Tidak ada file PDF ditemukan di direktori yang ditentukan.")


File: Resume.pdf
Kategori yang Diprediksi: SALES

File: CV ATS Muhammad Rendy.pdf
Kategori yang Diprediksi: INFORMATION-TECHNOLOGY

File: Blue Simple Modern Resume.pdf
Kategori yang Diprediksi: INFORMATION-TECHNOLOGY
