In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# Handle Emoji
import nltk
import demoji
import unicodedata as uni
import re
import pandas as pd
import numpy as np

from textblob import TextBlob
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

from tqdm import tqdm

# Inisialisasi progress bar dengan jumlah total data
tqdm.pandas()

In [30]:
df = pd.read_csv('dataset/b.csv', encoding="latin-1")
reviews = df.Review

In [35]:
def preprocess(text):
    # Text normalization
    contractions = {
        "isn't": "is not",
        "aren't": "are not",
        "wasn't": "was not",
        "weren't": "were not",
        "haven't": "have not",
        "hasn't": "has not",
        "hadn't": "had not",
        "won't": "will not",
        "wouldn't": "would not",
        "don't": "do not",
        "doesn't": "does not",
        "didn't": "did not",
        "can't": "can not",
        "couldn't": "could not",
        "shouldn't": "should not",
        "mightn't": "might not",
        "mustn't": "must not",
        "I'm": "I am",
        "You're": "you are",
        "I've": "I have",
        "UI": "user interface",
        "UX": "user experience",
        "u": "you",
    }
    pattern = re.compile(r'\b(' + '|'.join(contractions.keys()) + r')\b')
    expanded_text = pattern.sub(lambda match: contractions[match.group(0)], text)

    normalized_text = uni.normalize('NFKD', expanded_text)
    normalized_text = ''.join([c for c in normalized_text if not uni.combining(c)])

    # emoji encoding
    emojis = demoji.findall(text)

    for emoji in emojis:
        text = text.replace(emoji, " " + emojis[emoji].split(":")[0])

    # text preprocessing
    teks = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    teks = teks.lower()
    stop_words = set(stopwords.words('english'))
    stop_words.update(['also', 'app', 'apps', 'application', 'applications', 'good'])
    stop_words.remove('not')
    tokens = word_tokenize(teks)
    filtered_tokens = [word for word in tokens if word.isalnum() and not any(char.isdigit() for char in word) and word not in stop_words]
    lemmatizer = nltk.WordNetLemmatizer()
    lemma = [lemmatizer.lemmatize(word) for word in filtered_tokens]
    clean_reviews = ' '.join(lemma)

    return clean_reviews

In [39]:
preprocess_rev = reviews.progress_apply(preprocess)
preprocess_rev = pd.DataFrame(preprocess_rev)

100%|██████████| 20491/20491 [04:03<00:00, 84.28it/s] 


In [40]:
# preprocess_rev.to_csv('dataset/preproces_rev.csv', index=False)

In [3]:
preprocess_rev

Unnamed: 0,Review
0,nice hotel expensive parking got deal stay hot...
1,ok nothing special charge diamond member hilto...
2,nice room not experience hotel monaco seattle ...
3,unique great stay wonderful time hotel monaco ...
4,great stay great stay went seahawk game awesom...
...,...
20486,best kept secret time staying charm not ca nt ...
20487,great location price view hotel great quick pl...
20488,ok look nice modern outside desk staff nt part...
20489,hotel theft ruined vacation hotel opened sept ...


In [4]:
# Sentiment Analysis using textblob
def analyze_sentiment(text):
    blob = TextBlob(text)
    sentiment_score = blob.sentiment.polarity
    # Sentiment Label
    if sentiment_score > 0:
        label = "Positive"
    else:
        label = "Negative"
    # elif sentiment_score < 0:
    #     label = "Negative"
    # else:
    #     label = "Neutral"
    return sentiment_score, label
# Apply sentiment analysis textblob
preprocess_rev[['Sentiment Score', 'Label']] = preprocess_rev['Review'].progress_apply(lambda x: pd.Series(analyze_sentiment(x)))
preprocess_rev

100%|██████████| 20491/20491 [00:29<00:00, 693.04it/s] 


Unnamed: 0,Review,Sentiment Score,Label
0,nice hotel expensive parking got deal stay hot...,0.182888,Positive
1,ok nothing special charge diamond member hilto...,0.166198,Positive
2,nice room not experience hotel monaco seattle ...,0.281336,Positive
3,unique great stay wonderful time hotel monaco ...,0.504825,Positive
4,great stay great stay went seahawk game awesom...,0.377778,Positive
...,...,...,...
20486,best kept secret time staying charm not ca nt ...,0.232287,Positive
20487,great location price view hotel great quick pl...,0.503704,Positive
20488,ok look nice modern outside desk staff nt part...,0.130544,Positive
20489,hotel theft ruined vacation hotel opened sept ...,0.118633,Positive


In [5]:
# Inisialisasi TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=1000)

# Vektorisasi teks latih dan uji
X = tfidf_vectorizer.fit_transform(preprocess_rev['Review'])
y = preprocess_rev['Label']
# Membagi data menjadi data latih, validasi, dan pengujian
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # 80% train, 20% test
# Latih model SVM
svm_classifier = SVC(kernel='linear')

In [9]:
# Validasi silang dengan 5 lipatan (k-fold cross-validation)
cv_scores = cross_val_score(svm_classifier, X_train, y_train, cv=5)
print("Cross-validation scores:", cv_scores)

In [None]:
svm_classifier.fit(X_train, y_train)
# Menguji model pada data pengujian
y_pred = svm_classifier.predict(X_test)

In [14]:
test_accuracy = classification_report(y_test, y_pred)
print("Accuracy on test set:", test_accuracy)

Accuracy on test set:               precision    recall  f1-score   support

    Negative       0.80      0.53      0.64       317
    Positive       0.96      0.99      0.98      3782

    accuracy                           0.95      4099
   macro avg       0.88      0.76      0.81      4099
weighted avg       0.95      0.95      0.95      4099



In [22]:
import ray
ray.init()

2024-05-30 07:47:53,735	INFO worker.py:1749 -- Started a local Ray instance.


0,1
Python version:,3.11.9
Ray version:,2.20.0


In [23]:
@ray.remote
def train_and_predict(X_train, y_train, X_test):
    svm_classifier = SVC(kernel='linear')
    svm_classifier.fit(X_train, y_train)
    y_pred = svm_classifier.predict(X_test)
    return y_pred

In [29]:
import ray
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Inisialisasi Ray
ray.init()

# Fungsi untuk pelatihan dan prediksi
@ray.remote
def train_and_predict(X_train, y_train, X_test):
    svm_classifier = SVC(kernel='linear')
    svm_classifier.fit(X_train, y_train)
    y_pred = svm_classifier.predict(X_test)
    return y_pred

# Jalankan pelatihan dan prediksi secara paralel
future = train_and_predict.remote(X_train, y_train, X_test)

# Ambil hasilnya
y_pred = ray.get(future)

# Hitung akurasi
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Tutup Ray
ray.shutdown()


2024-05-30 07:50:13,442	INFO worker.py:1749 -- Started a local Ray instance.


Accuracy: 0.953891192973896
