# Mathematics Question Text Classification with Support Vector Machine (SVM)

## Load Dataset

In [None]:
# Mount Google Drive inside the Colab runtime and read the question dataset
import pandas as pd
from google.colab import drive

# Locations are named once here so they are easy to spot and change
DRIVE_MOUNT_POINT = '/content/drive'
DATASET_CSV = '/content/drive/MyDrive/Dataset/Matematika-SD-SMP.csv'

drive.mount(DRIVE_MOUNT_POINT, force_remount=True)
df = pd.read_csv(DATASET_CSV)
df.head()

Mounted at /content/drive


Unnamed: 0,NO,id,question,choices,key,key_text,label_1,indikator,label_2
0,1,8e4cb4de-bf0c-49ba-87bb-fed51419d294,Seseorang mempunyai sejumlah uang yang akan di...,"A. Rp6.750.000,00\n B. Rp7.050.000,00\n C. Rp7...",B,"Rp7.050.000,00\n",C3,mengaitkan,sedang
1,2,80ed6b49-0e35-4677-978a-a551523d2aaf,Rumus jumlah n suku pertama deret aritmetika a...,A. 39\n ...,E,87,C3,menghitung,mudah
2,3,1fe60e13-2323-4d5a-bfab-a004a23abd88,Bakteri jenis A berkembang biak menjadi dua ka...,A. 640Â Â Â Â \n\t\t\t\t\t\t\t B. 6.400\n\t\...,B,6.400\n\t\t\t\t\t\t\t,C3,menghitung,sulit
3,4,a0235e0f-8a6b-42f8-95eb-d8be56a8517b,Seorang ayah membagikan uang sebesar Rp 100.00...,"A. Rp 15.000,00Â Â Â Â \n ...",C,"Rp 17.500,00\n\t\t\t\t\t\t\t",C3,menghitung,sulit
4,5,5604840d-4e0f-4194-90b4-e8e89161eca1,Suatu perusahaan pakaian dapat menghasilkan 4....,A. 45.500 Â Â Â Â \n\t\t\t\t\t\t\t B. 50.500Â...,E,51.300,C3,menghitung,sedang


## Preprocessing Data

In [None]:
import re
import string

#Removing character
def hapus_karakter_selain_huruf(text):
  """Strip punctuation and digits from ``text`` and collapse the gaps left behind.

  Every character in ``string.punctuation`` is replaced by a space, every run
  of digits is deleted, and each run of two or more whitespace characters is
  then squeezed into a single space.

  Args:
    text: The raw question string.

  Returns:
    The cleaned string. Leading or trailing whitespace is not stripped.
  """
  for sp in string.punctuation:
    text = text.replace(sp, " ")
  text = re.sub(r"\d+", "", text)
  # The previous code called str.replace with the JavaScript regex literal
  # '/\s\s+/g'. In Python that is a plain substring which never occurs in the
  # text, so nothing was collapsed; re.sub performs the intended squeeze.
  return re.sub(r"\s\s+", " ", text)

# Clean every question element-wise (punctuation and digits removed), then preview
df['question'] = df['question'].map(hapus_karakter_selain_huruf)
df.head()

Unnamed: 0,NO,id,question,choices,key,key_text,label_1,indikator,label_2
0,1,8e4cb4de-bf0c-49ba-87bb-fed51419d294,Seseorang mempunyai sejumlah uang yang akan di...,"A. Rp6.750.000,00\n B. Rp7.050.000,00\n C. Rp7...",B,"Rp7.050.000,00\n",C3,mengaitkan,sedang
1,2,80ed6b49-0e35-4677-978a-a551523d2aaf,Rumus jumlah n suku pertama deret aritmetika a...,A. 39\n ...,E,87,C3,menghitung,mudah
2,3,1fe60e13-2323-4d5a-bfab-a004a23abd88,Bakteri jenis A berkembang biak menjadi dua ka...,A. 640Â Â Â Â \n\t\t\t\t\t\t\t B. 6.400\n\t\...,B,6.400\n\t\t\t\t\t\t\t,C3,menghitung,sulit
3,4,a0235e0f-8a6b-42f8-95eb-d8be56a8517b,Seorang ayah membagikan uang sebesar Rp kep...,"A. Rp 15.000,00Â Â Â Â \n ...",C,"Rp 17.500,00\n\t\t\t\t\t\t\t",C3,menghitung,sulit
4,5,5604840d-4e0f-4194-90b4-e8e89161eca1,Suatu perusahaan pakaian dapat menghasilkan ...,A. 45.500 Â Â Â Â \n\t\t\t\t\t\t\t B. 50.500Â...,E,51.300,C3,menghitung,sedang


In [None]:
#Stopwords array
sw = ["aja", "agan", "barang", "beli", "baru", "ada",
      "adalah", "ajaa", "akan", "aku", "atas",
      "buat", "boss", "agak", "agar", "apa",
      "biar", "bsa", "alhmdulillah", "alhamdulilah", "akhir",
      "allah", "ane", "arn", "atau", "audah",
      "bang", "aaaa", "about", "adik", "ah",
      "admin", "and", "as", "bakalan", "barank", "amin",
      "begini","begitu","beberapa"]

#Removing stopword
def stopwords(text):
  """Lower-case ``text``, drop every token listed in ``sw`` and squeeze repeats.

  Args:
    text: Question string; it is tokenised on whitespace.

  Returns:
    The surviving lower-cased tokens joined by single spaces, after the
    repeated-substring regex below has been applied to the joined string.
  """
  # Built per call so that later edits to the module-level list are honoured.
  stop_set = set(sw)
  # Build a new list instead of calling list.remove() while iterating: removing
  # during iteration shifts the remaining items, so the loop skipped the token
  # right after each removed one and consecutive stopwords survived.
  kept = [word for word in (raw.lower() for raw in text.split()) if word not in stop_set]
  text = " ".join(kept)
  # NOTE(review): this collapses ANY immediately repeated substring, not only
  # elongated letters, so ordinary words are altered as well
  # (e.g. "perusahaan" -> "perusahan"). Kept unchanged; confirm it is intended.
  text = re.sub(r'(.+?)\1+', r'\1',text)
  return text

# Case-fold each question and remove stopwords element-wise, then preview
df['question'] = df['question'].map(stopwords)
df.head()

Unnamed: 0,NO,id,question,choices,key,key_text,label_1,indikator,label_2
0,1,8e4cb4de-bf0c-49ba-87bb-fed51419d294,seorang mempunyai sejumlah uang yang diambil t...,"A. Rp6.750.000,00\n B. Rp7.050.000,00\n C. Rp7...",B,"Rp7.050.000,00\n",C3,mengaitkan,sedang
1,2,80ed6b49-0e35-4677-978a-a551523d2aaf,rumus jumlah n suku pertama deret aritmetika s...,A. 39\n ...,E,87,C3,menghitung,mudah
2,3,1fe60e13-2323-4d5a-bfab-a004a23abd88,bakteri jenis a berkembang biak menjadi dua ka...,A. 640Â Â Â Â \n\t\t\t\t\t\t\t B. 6.400\n\t\...,B,6.400\n\t\t\t\t\t\t\t,C3,menghitung,sulit
3,4,a0235e0f-8a6b-42f8-95eb-d8be56a8517b,seorang ayah membagikan uang sebesar rp kepada...,"A. Rp 15.000,00Â Â Â Â \n ...",C,"Rp 17.500,00\n\t\t\t\t\t\t\t",C3,menghitung,sulit
4,5,5604840d-4e0f-4194-90b4-e8e89161eca1,suatu perusahan pakaian dapat menghasilkan bua...,A. 45.500 Â Â Â Â \n\t\t\t\t\t\t\t B. 50.500Â...,E,51.300,C3,menghitung,sedang


In [None]:
import sys
!{sys.executable} -m pip install Sastrawi
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

#Build stemmer
# StemmerFactory is imported from Sastrawi.Stemmer.StemmerFactory; the object
# returned by create_stemmer() is bound to the module-level name `stemmer`,
# which the stemming() helper calls for every token.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

#Stemming
def stemming(text):
  """Stem ``text`` token by token with the module-level ``stemmer``.

  The string is split on whitespace, each token is passed to
  ``stemmer.stem`` and the results are joined back with single spaces.
  """
  stemmed_tokens = map(stemmer.stem, text.split())
  return " ".join(stemmed_tokens)
    
# Stem every question element-wise, then preview the result
df['question'] = df['question'].map(stemming)
df.head()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Unnamed: 0,NO,id,question,choices,key,key_text,label_1,indikator,label_2
0,1,8e4cb4de-bf0c-49ba-87bb-fed51419d294,orang punya jumlah uang yang ambil tiap bulan ...,"A. Rp6.750.000,00\n B. Rp7.050.000,00\n C. Rp7...",B,"Rp7.050.000,00\n",C3,mengaitkan,sedang
1,2,80ed6b49-0e35-4677-978a-a551523d2aaf,rumus jumlah n suku pertama deret aritmetika s...,A. 39\n ...,E,87,C3,menghitung,mudah
2,3,1fe60e13-2323-4d5a-bfab-a004a23abd88,bakteri jenis a kembang biak jadi dua kali lip...,A. 640Â Â Â Â \n\t\t\t\t\t\t\t B. 6.400\n\t\...,B,6.400\n\t\t\t\t\t\t\t,C3,menghitung,sulit
3,4,a0235e0f-8a6b-42f8-95eb-d8be56a8517b,orang ayah bagi uang besar rp kepada orang ana...,"A. Rp 15.000,00Â Â Â Â \n ...",C,"Rp 17.500,00\n\t\t\t\t\t\t\t",C3,menghitung,sulit
4,5,5604840d-4e0f-4194-90b4-e8e89161eca1,suatu usah pakai dapat hasil buah pada awal pr...,A. 45.500 Â Â Â Â \n\t\t\t\t\t\t\t B. 50.500Â...,E,51.300,C3,menghitung,sedang


In [None]:
# Drop the columns that are not used for modelling; the preview shows that
# the question text and the label_2 column (mudah / sedang / sulit) remain
unused_columns = ['NO', 'id', 'choices', 'key', 'key_text', 'label_1', 'indikator']
df_drop = df.drop(columns=unused_columns)
df_drop.head()

Unnamed: 0,question,label_2
0,orang punya jumlah uang yang ambil tiap bulan ...,sedang
1,rumus jumlah n suku pertama deret aritmetika s...,mudah
2,bakteri jenis a kembang biak jadi dua kali lip...,sulit
3,orang ayah bagi uang besar rp kepada orang ana...,sulit
4,suatu usah pakai dapat hasil buah pada awal pr...,sedang


## Splitting Dataset

In [None]:
from sklearn import model_selection
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs
train_x, test_x, train_y, test_y = model_selection.train_test_split(
    df['question'],
    df['label_2'],
    test_size=0.2,
    random_state=13,
)

## Feature Extraction

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer 

In [None]:
#TF-IDF
# Learn the vocabulary and IDF weights from the training split only. Fitting
# on the full corpus (as before) lets statistics of the held-out questions
# leak into the features and makes the test scores optimistic.
TFIDF_vect = TfidfVectorizer(max_features=5000)
train_x_tfidf = TFIDF_vect.fit_transform(train_x)
# Terms that occur only in the test split are ignored by transform()
test_x_tfidf = TFIDF_vect.transform(test_x)
print(TFIDF_vect.vocabulary_)

{'orang': 441, 'punya': 504, 'jumlah': 257, 'uang': 657, 'yang': 684, 'ambil': 22, 'tiap': 633, 'bulan': 118, 'besar': 94, 'ikut': 226, 'atur': 41, 'baris': 65, 'aritmetika': 34, 'pada': 444, 'pertama': 479, 'rp': 525, 'dua': 168, 'tiga': 636, 'demikian': 149, 'terus': 629, 'seluruh': 563, 'telah': 614, 'lama': 337, 'rumus': 530, 'suku': 595, 'deret': 152, 'tujuh': 649, 'dari': 141, 'sebut': 547, 'bakteri': 57, 'jenis': 253, 'kembang': 292, 'biak': 96, 'jadi': 238, 'kali': 265, 'lipat': 360, 'lima': 355, 'menit': 399, 'waktu': 679, 'belas': 78, 'banyak': 64, 'puluh': 502, 'ayah': 44, 'bagi': 49, 'kepada': 299, 'anak': 24, 'makin': 376, 'muda': 416, 'usia': 674, 'kecil': 281, 'terima': 626, 'jika': 255, 'selisih': 562, 'oleh': 438, 'dekat': 146, 'dan': 139, 'si': 573, 'sulung': 596, 'paling': 453, 'maka': 374, 'bungsu': 122, 'suatu': 591, 'usah': 672, 'pakai': 450, 'dapat': 140, 'hasil': 210, 'buah': 113, 'awal': 43, 'produksi': 498, 'tingkat': 640, 'bila': 100, 'maju': 373, 'tetap': 63

In [None]:
#BoW
# Learn the bag-of-words vocabulary from the training split only. Fitting on
# the full corpus (as before) lets the held-out questions shape the feature
# space the classifier is trained on.
Count_vect = CountVectorizer(max_features=5000)
train_x_count = Count_vect.fit_transform(train_x)
# Terms that occur only in the test split are ignored by transform()
test_x_count = Count_vect.transform(test_x)
print(Count_vect.vocabulary_)

{'orang': 441, 'punya': 504, 'jumlah': 257, 'uang': 657, 'yang': 684, 'ambil': 22, 'tiap': 633, 'bulan': 118, 'besar': 94, 'ikut': 226, 'atur': 41, 'baris': 65, 'aritmetika': 34, 'pada': 444, 'pertama': 479, 'rp': 525, 'dua': 168, 'tiga': 636, 'demikian': 149, 'terus': 629, 'seluruh': 563, 'telah': 614, 'lama': 337, 'rumus': 530, 'suku': 595, 'deret': 152, 'tujuh': 649, 'dari': 141, 'sebut': 547, 'bakteri': 57, 'jenis': 253, 'kembang': 292, 'biak': 96, 'jadi': 238, 'kali': 265, 'lipat': 360, 'lima': 355, 'menit': 399, 'waktu': 679, 'belas': 78, 'banyak': 64, 'puluh': 502, 'ayah': 44, 'bagi': 49, 'kepada': 299, 'anak': 24, 'makin': 376, 'muda': 416, 'usia': 674, 'kecil': 281, 'terima': 626, 'jika': 255, 'selisih': 562, 'oleh': 438, 'dekat': 146, 'dan': 139, 'si': 573, 'sulung': 596, 'paling': 453, 'maka': 374, 'bungsu': 122, 'suatu': 591, 'usah': 672, 'pakai': 450, 'dapat': 140, 'hasil': 210, 'buah': 113, 'awal': 43, 'produksi': 498, 'tingkat': 640, 'bila': 100, 'maju': 373, 'tetap': 63

## SVM Model

In [None]:
from sklearn import naive_bayes
from sklearn.metrics import classification_report

In [None]:
# Multinomial Naive Bayes on the TF-IDF features
# NOTE(review): the notebook title and section heading say SVM, but the
# estimator trained here is naive_bayes.MultinomialNB -- confirm which is intended.

MNB_TFIDF = naive_bayes.MultinomialNB().fit(train_x_tfidf, train_y)
predictions_MNB_TFIDF = MNB_TFIDF.predict(test_x_tfidf)
print(classification_report(y_true=test_y, y_pred=predictions_MNB_TFIDF))

              precision    recall  f1-score   support

       mudah       0.88      0.33      0.48        21
      sedang       0.57      0.97      0.72        35
       sulit       0.00      0.00      0.00        12

    accuracy                           0.60        68
   macro avg       0.48      0.43      0.40        68
weighted avg       0.56      0.60      0.52        68



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Multinomial Naive Bayes on the bag-of-words (CountVectorizer) features
# NOTE(review): the notebook title and section heading say SVM, but the
# estimator trained here is naive_bayes.MultinomialNB -- confirm which is intended.

MNB_BoW = naive_bayes.MultinomialNB().fit(train_x_count, train_y)
predictions_MNB_BoW = MNB_BoW.predict(test_x_count)
print(classification_report(y_true=test_y, y_pred=predictions_MNB_BoW))

              precision    recall  f1-score   support

       mudah       0.75      0.71      0.73        21
      sedang       0.71      0.83      0.76        35
       sulit       0.71      0.42      0.53        12

    accuracy                           0.72        68
   macro avg       0.72      0.65      0.67        68
weighted avg       0.72      0.72      0.71        68

