In [1]:
# Import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import classification_report, accuracy_score



In [5]:
import chardet

# Baca beberapa byte pertama dari file untuk deteksi
with open('../data/data.csv', 'rb') as f:
    result = chardet.detect(f.read(10000))  # Baca 10KB pertama
    print(result)

# Misalnya hasil: {'encoding': 'ISO-8859-1', 'confidence': 0.73}
df = pd.read_csv('../data/data.csv', encoding=result['encoding'], header=0)
print(df.head())
print(df.columns)
df.head()  # Tampilkan beberapa baris pertama dari DataFrame


{'encoding': 'ISO-8859-1', 'confidence': 0.728467, 'language': ''}
                                               Tweet  HS  Abusive  \
0  - disaat semua cowok berusaha melacak perhatia...   1        1   
1  RT USER: USER siapa yang telat ngasih tau elu?...   0        1   
2  41. Kadang aku berfikir, kenapa aku tetap perc...   0        0   
3  USER USER AKU ITU AKU\n\nKU TAU MATAMU SIPIT T...   0        0   
4  USER USER Kaum cebong kapir udah keliatan dong...   1        1   

   HS_Individual  HS_Group  HS_Religion  HS_Race  HS_Physical  HS_Gender  \
0              1         0            0        0            0          0   
1              0         0            0        0            0          0   
2              0         0            0        0            0          0   
3              0         0            0        0            0          0   
4              0         1            1        0            0          0   

   HS_Other  HS_Weak  HS_Moderate  HS_Strong  
0         1   

Unnamed: 0,Tweet,HS,Abusive,HS_Individual,HS_Group,HS_Religion,HS_Race,HS_Physical,HS_Gender,HS_Other,HS_Weak,HS_Moderate,HS_Strong
0,- disaat semua cowok berusaha melacak perhatia...,1,1,1,0,0,0,0,0,1,1,0,0
1,RT USER: USER siapa yang telat ngasih tau elu?...,0,1,0,0,0,0,0,0,0,0,0,0
2,"41. Kadang aku berfikir, kenapa aku tetap perc...",0,0,0,0,0,0,0,0,0,0,0,0
3,USER USER AKU ITU AKU\n\nKU TAU MATAMU SIPIT T...,0,0,0,0,0,0,0,0,0,0,0,0
4,USER USER Kaum cebong kapir udah keliatan dong...,1,1,0,1,1,0,0,0,0,0,1,0


In [6]:
print(df.columns)


Index(['Tweet', 'HS', 'Abusive', 'HS_Individual', 'HS_Group', 'HS_Religion',
       'HS_Race', 'HS_Physical', 'HS_Gender', 'HS_Other', 'HS_Weak',
       'HS_Moderate', 'HS_Strong'],
      dtype='object')


In [7]:

# Fitur (Tweet) dan target (semua label multilabel)
X = df['Tweet']
y = df[['HS', 'Abusive', 'HS_Individual', 'HS_Group', 'HS_Religion', 'HS_Race', 
        'HS_Physical', 'HS_Gender', 'HS_Other', 'HS_Weak', 'HS_Moderate', 'HS_Strong']]


In [9]:

from sklearn.feature_extraction.text import TfidfVectorizer

# Daftar stopword bahasa Indonesia manual
stopwords_indonesia = [
    'yang', 'dan', 'di', 'ke', 'dari', 'ini', 'itu', 'untuk', 'pada', 'dengan', 
    'adalah', 'atau', 'juga', 'sebagai', 'karena', 'oleh', 'tidak', 'sudah',
    'saja', 'maka', 'lagi', 'bagi', 'namun', 'setelah', 'sebelum', 'hingga', 
    'tersebut', 'setiap', 'kami', 'kita', 'mereka', 'aku', 'saya', 'anda', 
    'engkau', 'dia', 'ia', 'kalian', 'jadi', 'apakah', 'dimana', 'kapan',
    'mengapa', 'bagaimana', 'bahwa', 'tentang'
]

# TF-IDF Vectorizer dengan stopword bahasa Indonesia manual
vectorizer = TfidfVectorizer(stop_words=stopwords_indonesia, max_features=5000)

# Transform X
X_tfidf = vectorizer.fit_transform(X)



In [10]:

# Split data
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)


In [11]:

# Model Naive Bayes multilabel
nb = MultinomialNB()
multi_nb = MultiOutputClassifier(nb)
multi_nb.fit(X_train, y_train)


In [12]:

# Prediksi
y_pred = multi_nb.predict(X_test)


In [13]:

# Evaluasi per label
for i, column in enumerate(y.columns):
    print(f"\n=== Evaluasi untuk label: {column} ===")
    print(classification_report(y_test.iloc[:, i], y_pred[:, i]))
    print(f"Akurasi: {accuracy_score(y_test.iloc[:, i], y_pred[:, i]):.4f}")



=== Evaluasi untuk label: HS ===
              precision    recall  f1-score   support

           0       0.84      0.88      0.86      1516
           1       0.83      0.78      0.80      1118

    accuracy                           0.84      2634
   macro avg       0.84      0.83      0.83      2634
weighted avg       0.84      0.84      0.84      2634

Akurasi: 0.8371

=== Evaluasi untuk label: Abusive ===
              precision    recall  f1-score   support

           0       0.90      0.90      0.90      1646
           1       0.83      0.82      0.83       988

    accuracy                           0.87      2634
   macro avg       0.86      0.86      0.86      2634
weighted avg       0.87      0.87      0.87      2634

Akurasi: 0.8709

=== Evaluasi untuk label: HS_Individual ===
              precision    recall  f1-score   support

           0       0.80      0.97      0.88      1916
           1       0.80      0.36      0.49       718

    accuracy                    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
