In [1]:
import pandas as pd
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score


In [2]:

# Memuat dataset dari file CSV
df = pd.read_csv('dataset_tugas.csv')
df.head()

Unnamed: 0,tweets,class
0,Be aware dirty step to get money #staylight ...,figurative
1,#sarcasm for #people who don't understand #diy...,figurative
2,@IminworkJeremy @medsingle #DailyMail readers ...,figurative
3,@wilw Why do I get the feeling you like games?...,figurative
4,-@TeacherArthurG @rweingarten You probably jus...,figurative


In [4]:

# Noise Removal
# Noise Removal
df['cleaned_text'] = df['tweets'].apply(lambda x: re.sub(r'[^\w\s]', '', x.lower()))

df.head()

AttributeError: 'float' object has no attribute 'lower'

In [None]:
# Tokenisasi
nltk.download('punkt')  # Mengunduh data tokenizer dari NLTK
df['tokenized_text'] = df['cleaned_text'].apply(lambda x: word_tokenize(x))


In [None]:

# Stemming
stemmer = SnowballStemmer('english')  # Membuat objek stemmer
df['stemmed_text'] = df['tokenized_text'].apply(lambda x: [stemmer.stem(word) for word in x])



In [None]:
# Menggabungkan kembali teks yang telah di-stemming menjadi kalimat
df['processed_text'] = df['stemmed_text'].apply(lambda x: ' '.join(x))


In [None]:

# Feature Extraction dengan TF-IDF
corpus = df['processed_text'].tolist()
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)


In [None]:

# Membuat fitur tambahan
df['num_words'] = df['tokenized_text'].apply(lambda x: len(x))
df['avg_word_length'] = df['cleaned_text'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()))


In [None]:

# Menggabungkan fitur tambahan dengan matriks TF-IDF
feature_names = vectorizer.get_feature_names()
extra_features = ['num_words', 'avg_word_length']
feature_names.extend(extra_features)

X_extra = df[extra_features].values
X_combined = pd.concat([pd.DataFrame(X.toarray(), columns=feature_names), pd.DataFrame(X_extra, columns=extra_features)], axis=1)



In [None]:
# Menyiapkan data train dan test
y = df['label_column']  # Kolom label/target
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)



In [None]:
# Melatih model Naive Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Melakukan prediksi pada data test
y_pred = nb_model.predict(X_test)

# Evaluasi performa model
accuracy = accuracy_score(y_test, y_pred)
print("Akurasi: ", accuracy)
