In [None]:
import re

import joblib
import nltk
import pandas as pd
from imblearn.over_sampling import SMOTE
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

In [None]:
# Fetch the NLTK resources used by the preprocessing step (the stop-word list,
# the WordNet data for the lemmatizer and the punkt tokenizer models).
for resource in ('stopwords', 'wordnet', 'punkt'):
    downloaded = nltk.download(resource)
# Trailing expression so the cell still displays the result of the last download.
downloaded

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Load the data set from its CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer='/content/sample_data/mental_health_sentiment.csv')

In [None]:
# Remove rows whose 'text' value repeats an earlier row, keeping the first occurrence
data = data.drop_duplicates(subset=['text'], keep='first')

In [None]:
# Drop rows with a missing value in the columns this notebook reads.
# Restricting the check to 'text' and 'label' avoids discarding usable rows just
# because some other column happens to be empty, while still guaranteeing that
# both the model input and its target are present.
data = data.dropna(subset=['text', 'label'])

In [None]:
# Text preprocessing function
def _get_preprocessing_tools():
    """Return the shared (stop-word set, lemmatizer, stemmer) triple.

    The triple is built on the first call and stored on the function object, so
    the stop-word list is read and the NLTK objects are created once instead of
    once per document.
    """
    tools = getattr(_get_preprocessing_tools, '_cached', None)
    if tools is None:
        tools = (
            frozenset(stopwords.words('english')),
            WordNetLemmatizer(),
            PorterStemmer(),
        )
        _get_preprocessing_tools._cached = tools
    return tools


def preprocess_text(text):
    """Normalize one raw document into a space-joined string of stemmed tokens.

    Steps, in order:
      1. delete every character that is neither an ASCII letter nor whitespace;
      2. lower-case the text;
      3. tokenize with ``nltk.word_tokenize``;
      4. drop English stop words;
      5. lemmatize each remaining token with WordNet, then stem it with Porter.

    Args:
        text: The raw document. ``None`` and float NaN are treated as an empty
            document; any other non-string value is converted with ``str()``.

    Returns:
        The processed tokens joined by single spaces ('' when nothing is left).
    """
    # Missing values would otherwise make re.sub raise a TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)

    # Characters are deleted (not replaced by a space), so "don't" becomes "dont".
    text = re.sub(r'[^a-zA-Z\s]', '', text).lower()

    stop_words, lemmatizer, stemmer = _get_preprocessing_tools()
    tokens = [
        stemmer.stem(lemmatizer.lemmatize(word))
        for word in nltk.word_tokenize(text)
        if word not in stop_words
    ]

    return ' '.join(tokens)

In [None]:
# Run the preprocessing function over every entry of the 'text' column and
# store the cleaned strings back in that column
cleaned_text = data['text'].apply(preprocess_text)
data['text'] = cleaned_text

In [None]:
# Separate the text (model input) from the label (target)
X, y = data['text'], data['label']

In [None]:
# Split the data into training (80%) and test (20%) sets.
# The labels are heavily imbalanced, so stratify=y keeps the class proportions
# the same in both splits instead of leaving the share of the rare classes in
# the test set to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [None]:
# Term weighting (feature extraction) with TF-IDF over unigrams and bigrams.
# The vocabulary and IDF weights are learned from the training texts only; the
# test texts are merely transformed with them.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train_vectorized = vectorizer.fit_transform(raw_documents=X_train)
X_test_vectorized = vectorizer.transform(raw_documents=X_test)

In [None]:
# Address class imbalance by oversampling the training features with SMOTE
smote = SMOTE(random_state=42)
resampled = smote.fit_resample(X_train_vectorized, y_train)
X_train_resampled, y_train_resampled = resampled


In [None]:
# Build the LinearSVC model and tune its regularisation parameter C with 5-fold CV.
# NOTE(review): the training data was oversampled with SMOTE before this
# cross-validation, so synthetic points interpolated from a validation fold's
# samples can end up in the training folds and the CV scores are likely
# optimistic. Consider running SMOTE inside an imblearn Pipeline so that
# resampling happens per fold.
param_grid = {'C': [0.1, 1, 10, 100]}
# random_state fixes LinearSVC's internal data shuffling so that re-running the
# notebook selects and fits the same model; 42 matches the seed already used
# for the train/test split and for SMOTE.
grid_search = GridSearchCV(LinearSVC(random_state=42), param_grid, cv=5)
grid_search.fit(X_train_resampled, y_train_resampled)



In [None]:
# Take the best model found by the grid search.
# GridSearchCV was created with its default refit=True, so best_estimator_ has
# already been re-trained on the whole resampled training set using the winning
# C; calling fit on it again would only repeat that training.
best_model = grid_search.best_estimator_

In [None]:
# Evaluate the model on the held-out test split and print per-class metrics
y_pred = best_model.predict(X_test_vectorized)
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.29      0.09      0.13       126
           1       0.82      0.88      0.85      3174
           2       0.77      0.77      0.77      2116
           3       0.28      0.06      0.10       180

    accuracy                           0.80      5596
   macro avg       0.54      0.45      0.46      5596
weighted avg       0.77      0.80      0.78      5596



In [None]:
# Save the trained model and the fitted vectorizer to files.
# The vectorizer was fitted on text that had already been passed through
# preprocess_text, so code that loads these files must apply the same
# preprocessing to new text before calling vectorizer.transform.
joblib.dump(best_model, 'svm_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')