In [1]:
!pip install PySastrawi

Collecting PySastrawi
  Downloading PySastrawi-1.2.0-py2.py3-none-any.whl.metadata (892 bytes)
Downloading PySastrawi-1.2.0-py2.py3-none-any.whl (210 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.6/210.6 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PySastrawi
Successfully installed PySastrawi-1.2.0


In [1]:
import pandas as pd
import numpy as np
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import os
import pickle
import re
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import BernoulliNB

#### Import Dataset

In [2]:
data_path = '/content/dataset'
# data_path = '...\Flask-server\dataset'

labels = []
tokens = []
titles = []

for f in os.listdir(data_path):
    full_path = os.path.join(data_path, f)
    if os.path.isfile(full_path):
        titles.append(f.split('.')[0])
        with open(full_path, 'r', encoding='utf8', errors='ignore') as infile:
            for line in infile:
                tokens.append(line.strip())
                labels.append(len(titles) - 1)

df = pd.DataFrame(list(zip(tokens, labels)), columns=['sent', 'label'])

In [3]:
df

Unnamed: 0,sent,label
0,DevOps mengintegrasikan pengembangan perangkat...,0
1,Praktik DevOps berfokus pada otomatisasi prose...,0
2,"Dengan DevOps, tim dapat mempercepat siklus ri...",0
3,DevOps menciptakan lingkungan kerja yang lebih...,0
4,Penggunaan alat-alat seperti Jenkins dan Docke...,0
...,...,...
1002,"Dalam pengembangan aplikasi iOS, Anda harus me...",4
1003,Keterampilan dalam pemrograman iOS memberi And...,4
1004,Pemrograman iOS melibatkan pembuatan aplikasi ...,4
1005,"Dengan keterampilan dalam pemrograman iOS, And...",4


In [4]:
titles

['devops', 'backend', 'frontend', 'android', 'ios']

In [5]:
df.sample(20)

Unnamed: 0,sent,label
775,Pemrograman Android melibatkan penggunaan tekn...,3
780,"Dengan memanfaatkan Android's Room Database, p...",3
390,Penggunaan arsitektur serverless dalam backend...,1
136,Integrasi antara tim pengembang dan operasi da...,0
442,Seorang front-end developer harus memahami pri...,2
205,DevOps berfokus pada pembuatan proses pengemba...,0
736,"Dengan Android's CameraX API, pengembang dapat...",3
351,Pembangunan backend melibatkan pembuatan mekan...,1
338,Pembangunan backend melibatkan pembuatan siste...,1
279,Backend sering menggunakan teknik load balanci...,1


#### Data Cleaning

In [6]:
stemmer_factory = StemmerFactory()
stemmer = stemmer_factory.create_stemmer()

stop_factory = StopWordRemoverFactory()
stop_words = stop_factory.get_stop_words()

In [7]:
def preprocess_text(text):
    cleaned_text = " ".join([stemmer.stem(word) for word in re.sub("[^a-zA-Z]", " ", text).split() if word.lower() not in stop_words])
    return cleaned_text.lower()

df['cleaned'] = df['sent'].apply(preprocess_text)


In [8]:
df

Unnamed: 0,sent,label,cleaned
0,DevOps mengintegrasikan pengembangan perangkat...,0,devops integrasi kembang perangkat lunak opera...
1,Praktik DevOps berfokus pada otomatisasi prose...,0,praktik devops fokus otomatisasi proses kemban...
2,"Dengan DevOps, tim dapat mempercepat siklus ri...",0,devops tim cepat siklus rilis perangkat lunak ...
3,DevOps menciptakan lingkungan kerja yang lebih...,0,devops cipta lingkung kolaboratif kembang tim ...
4,Penggunaan alat-alat seperti Jenkins dan Docke...,0,guna alat alat jenkins docker implementasi pri...
...,...,...,...
1002,"Dalam pengembangan aplikasi iOS, Anda harus me...",4,kembang aplikasi ios paham alat framework bang...
1003,Keterampilan dalam pemrograman iOS memberi And...,4,terampil pemrograman ios mampu teknologi baru ...
1004,Pemrograman iOS melibatkan pembuatan aplikasi ...,4,pemrograman ios libat buat aplikasi manfaat fi...
1005,"Dengan keterampilan dalam pemrograman iOS, And...",4,terampil pemrograman ios kembang aplikasi manf...


In [9]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

tfidf = TfidfVectorizer(max_features=1000)

X_tfidf = tfidf.fit_transform(df['cleaned'])

selector = SelectKBest(chi2, k=800)
X_selected = selector.fit_transform(X_tfidf, df['label'])

In [10]:
print(X_tfidf.shape)

(1007, 872)


In [11]:
print(X_selected.shape)

(1007, 800)


#### Data Training

In [12]:
x_train, x_test, y_train, y_test = train_test_split(X_selected, df['label'], test_size=0.2, random_state=42, stratify=df['label'])

In [23]:
# model = LogisticRegression(max_iter=1000)
model = MultinomialNB()
# model = BernoulliNB()


pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=1000)),
    ('select', SelectKBest(chi2, k=600)),
    ('classifier', MultinomialNB())
])

In [24]:
pipeline.fit(df['cleaned'], df['label'])
model.fit(x_train, y_train)

In [25]:
y_pred = model.predict(x_test)

print(classification_report(y_test, y_pred))

print(confusion_matrix(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.96      1.00      0.98        45
           1       0.98      0.93      0.95        43
           2       1.00      1.00      1.00        39
           3       1.00      0.94      0.97        33
           4       0.93      0.98      0.95        42

    accuracy                           0.97       202
   macro avg       0.97      0.97      0.97       202
weighted avg       0.97      0.97      0.97       202

[[45  0  0  0  0]
 [ 2 40  0  0  1]
 [ 0  0 39  0  0]
 [ 0  0  0 31  2]
 [ 0  1  0  0 41]]


In [26]:
def predict_text(text):
    cleaned_text = preprocess_text(text)
    prediction = pipeline.predict([cleaned_text])
    return titles[prediction[0]]

In [27]:
new_text = "saya mempunyai pengalaman melakukan pemrograman pada produk apple"
predicted_label = predict_text(new_text)
print(predicted_label)

ios


In [28]:
new_text = "bapak jokowi adalah idola saya. saya ingin bisa menjadi pemrogram yang bekerja di bidang server seperti bapak jokowi"
predicted_label = predict_text(new_text)
print(predicted_label)

backend


In [29]:
df['label'].values

array([0, 0, 0, ..., 4, 4, 4])

In [30]:
cl = "mnb"
v = 1
titles = ['devops', 'backend', 'frontend', 'android', 'ios']
labels = df['label'].values

# Menyimpan model dengan format nama yang diberikan
filename = f'{cl}_model_v{v}_c{len(titles)}_e{int(len(labels)/len(titles))}.pickle'
with open(filename, 'wb') as f:
    pickle.dump(model, f)

print(f"Model saved as {filename}")

Model saved as mnb_model_v1_c5_e201.pickle


In [21]:
filename = 'tfidf_vectorizer.pkl'
with open(filename, 'wb') as f:
    pickle.dump(tfidf, f)

print(f"Model saved as {filename}")

Model saved as tfidf_vectorizer.pkl


In [22]:
filename = 'select_kbest.pkl'
with open(filename, 'wb') as f:
    pickle.dump(selector, f)

print(f"Model saved as {filename}")

Model saved as select_kbest.pkl
