Langkah 1 - Load Data


In [3]:
import pandas as pd

# Location of the raw spam CSV
csv_path = '/content/sample_data/spam.csv'

# Read the file into a DataFrame, decoding it as latin1
data = pd.read_csv(csv_path, encoding='latin1')

# Preview the first rows
data.head()


Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


Langkah 2 - Preprocessing


Langkah 2a - Drop Kolom



In [6]:
# Keep only the first two columns (v1, v2): drop everything from position 2
# onward, i.e. the three trailing "Unnamed" columns.
# The labels are passed explicitly through `columns=`; handing drop() a
# DataFrame slice only works because iterating a DataFrame yields its
# column names.
data = data.drop(columns=data.columns[2:])

# Check the result
data.head()


Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


Langkah 2b - Inspeksi Data



In [10]:
# Print the column labels currently present in the DataFrame
print(data.columns)


Index(['v1', 'v2'], dtype='object')


In [11]:
# Give the raw v1/v2 columns descriptive names, then keep exactly those two
# columns in Label-then-Text order
data = (
    data
    .rename(columns={'v1': 'Label', 'v2': 'Text'})
    .loc[:, ['Label', 'Text']]
)


In [12]:
# Class balance: number of rows per label
print(data['Label'].value_counts())
print('\n')

# Column dtypes and non-null counts.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
data.info()
print('\n')

# Summary statistics for the two columns
print(data.describe())


Label
ham     4825
spam     747
Name: count, dtype: int64


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Label   5572 non-null   object
 1   Text    5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None


       Label                    Text
count   5572                    5572
unique     2                    5169
top      ham  Sorry, I'll call later
freq    4825                      30


Langkah 2c - Encoding Label



In [14]:
# Mapping from the textual class names to integer targets
new_labels = {
    'spam': 1,
    'ham': 0
}

# Encode the labels.
# Values that are not keys of new_labels (including the 0/1 left by a previous
# run of this cell) pass through unchanged, so re-running the cell does not
# turn the whole column into NaN the way a plain .map(new_labels) would.
data['Label'] = data['Label'].map(lambda label: new_labels.get(label, label))

# Fail early if anything other than the encoded classes remains in the column
unexpected = data.loc[~data['Label'].isin(list(new_labels.values())), 'Label']
if not unexpected.empty:
    raise ValueError(f"Unexpected label values: {unexpected.unique().tolist()}")

# Check the result
data.head()


Unnamed: 0,Label,Text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


Langkah 2d - Pisahkan Fitur dengan Label



In [16]:
# Pull the message texts (features) and the labels out of the DataFrame
X, y = data['Text'].values, data['Label'].values

# Sanity check: sample count plus the first feature/label pair
print("Jumlah sampel:", len(X))
print("Contoh fitur:", X[0])
print("Contoh label:", y[0])


Jumlah sampel: 5572
Contoh fitur: Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...
Contoh label: 0


Langkah 3 - Ekstraksi Fitur


In [17]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=50,
)

# Bag-of-words vectorizer
bow = CountVectorizer()

# Fit the vectorizer on the training texts and transform them in one step
X_train = bow.fit_transform(X_train)

# The test texts are only transformed, never fitted (same reasoning as in
# experiment 3): the vectorizer is fitted on the training texts alone, so the
# test split stays unseen data for the model.
X_test = bow.transform(X_test)

Langkah 4 - Training dan Evaluasi Model


In [20]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Create the multinomial Naive Bayes classifier and fit it on the training matrix
mnb = MultinomialNB().fit(X_train, y_train)

# Predict on the training split and on the held-out test split
y_pred_train = mnb.predict(X_train)
y_pred_test = mnb.predict(X_test)

# Accuracy on each split
acc_train = accuracy_score(y_train, y_pred_train)
acc_test = accuracy_score(y_test, y_pred_test)

# Report both scores
print(f'Hasil akurasi data train: {acc_train}')
print(f'Hasil akurasi data test: {acc_test}')

Hasil akurasi data train: 0.9946152120260264
Hasil akurasi data test: 0.9775784753363229
