In [4]:
import re
import pandas as pd
import csv
import nltk
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import string

# Silence every warning category for the remainder of the session.
import warnings
warnings.filterwarnings("ignore")

# Fetch the NLTK 'stopwords' corpus (a no-op when it is already up to date).
# NOTE(review): no code visible in this notebook reads that corpus; the
# stopword list used later is loaded from 'stopwords-id.csv'. Confirm whether
# this download is still needed.
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to C:\Users\HP
[nltk_data]     PAVILION\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Load the semicolon-delimited source table into `data`.
data = pd.read_csv('dataset/belum.csv', delimiter=';')


In [6]:
def cleaning(text):
    """Normalise a raw text entry into lowercase words separated by spaces.

    Steps, in order:
      1. drop the literal fragments '-ness' and '-jualness';
      2. drop URLs (``www.…`` and ``scheme://…``) and @mentions while their
         punctuation is still intact;
      3. replace every character other than ASCII letters, digits, space
         and tab with a space (this covers newlines and all punctuation);
      4. drop a leading ``RT`` marker, then all digits;
      5. squeeze runs of spaces, collapse a repeated character at the end of
         a word that is followed by whitespace, trim spaces and lowercase.

    Args:
        text: The raw string. Non-string values (e.g. NaN produced by an
            empty CSV cell) are treated as empty text.

    Returns:
        The cleaned, lowercased string.
    """
    if not isinstance(text, str):
        return ''

    text = text.replace('-ness', '').replace('-jualness', '')

    # URLs must go before punctuation is stripped: once '.', ':' and '/' are
    # gone a 'www.' address can no longer be recognised and its pieces would
    # survive as ordinary words.
    text = re.sub(r'www\.\S+|\w+://\S+', ' ', text)
    text = re.sub(r'@[A-Za-z0-9]+', ' ', text)

    # Everything that is not an ASCII letter, digit, space or tab becomes a
    # space. Newlines and punctuation are handled here, so no separate
    # newline or punctuation pass is needed afterwards.
    text = re.sub(r'[^0-9A-Za-z \t]', ' ', text)

    text = re.sub(r'^RT[\s]+', '', text)
    text = re.sub(r'[0-9]+', '', text)

    # Squeeze spaces only after digits are removed, otherwise 'a 1 b' would
    # keep a double space.
    text = re.sub(r' +', ' ', text)

    # Collapse a repeated trailing character ('baguss ' -> 'bagus '). The
    # lookbehind skips a run that starts right after a word-initial 'unnes'.
    # The punctuation alternatives in the lookahead cannot match any more at
    # this point; the pattern is kept unchanged.
    text = re.sub(r'(?<!\bunnes)(\w)(\1+)(?=\s|[\.,!])', r'\1', text)

    return text.strip(' ').lower()

# Run cleaning() over the 'word' column and keep the result in a new
# 'text_processed' column; the bare `data` renders the frame as cell output.
data['text_processed'] = data['word'].apply(cleaning)
data

Unnamed: 0,word,weight,text_processed
0,putus tali gantung,-2,putus tali gantung
1,gelebah,-2,gelebah
2,gobar hati,-2,gobar hati
3,tersentuh (perasaan),-1,tersentuh perasaan
4,isak,-5,isak
...,...,...,...
10213,melarikan,3,melarikan
10214,vakansi,3,vakansi
10215,spesial,4,spesial
10216,asrama,3,asrama


In [7]:
def replace_word_elongation(text):
    """Collapse elongated spellings such as 'baguuuus' into 'bagus'.

    Any letter repeated three or more times in a row is reduced to a single
    occurrence. Double letters ('maaf') and digit runs ('1000') are left
    untouched.

    Args:
        text: The string to normalise.

    Returns:
        ``text`` with every run of three or more identical letters replaced
        by one copy of that letter.
    """
    # [^\W\d_] is "a word character that is neither a digit nor underscore",
    # i.e. a letter; \1{2,} requires at least two further copies of it.
    return re.sub(r'([^\W\d_])\1{2,}', r'\1', text)

# Overwrite 'text_processed' with the output of replace_word_elongation();
# the column is both input and output, so re-running this cell applies the
# step again to its own result.
data['text_processed'] = data['text_processed'].apply(replace_word_elongation)
data

Unnamed: 0,word,weight,text_processed
0,putus tali gantung,-2,putus tali gantung
1,gelebah,-2,gelebah
2,gobar hati,-2,gobar hati
3,tersentuh (perasaan),-1,tersentuh perasaan
4,isak,-5,isak
...,...,...,...
10213,melarikan,3,melarikan
10214,vakansi,3,vakansi
10215,spesial,4,spesial
10216,asrama,3,asrama


In [8]:
def tokenize(text):
    """Return the list of tokens NLTK's ``word_tokenize`` produces for ``text``."""
    return word_tokenize(text)

# Replace each string in 'text_processed' with its token list; from here on
# the column holds lists rather than strings.
data['text_processed'] = data['text_processed'].apply(tokenize)
data

Unnamed: 0,word,weight,text_processed
0,putus tali gantung,-2,"[putus, tali, gantung]"
1,gelebah,-2,[gelebah]
2,gobar hati,-2,"[gobar, hati]"
3,tersentuh (perasaan),-1,"[tersentuh, perasaan]"
4,isak,-5,[isak]
...,...,...,...
10213,melarikan,3,[melarikan]
10214,vakansi,3,[vakansi]
10215,spesial,4,[spesial]
10216,asrama,3,[asrama]


In [9]:
# Abbreviation table: a semicolon-delimited file read without a header row;
# its columns are named 'singkatan' (abbreviation) and 'Full Word' here.
slang_df = pd.read_csv('kamus-singkatan.csv', delimiter=';', names=['singkatan', 'Full Word'])

# Define the translate_slang_list function
def translate_slang_list(text_list, slang_df):
    """Expand abbreviations in each string of ``text_list``.

    Args:
        text_list: Iterable of strings; each one is split on whitespace and
            looked up word by word.
        slang_df: DataFrame with a 'singkatan' column (the abbreviation) and
            a 'Full Word' column (its expansion).

    Returns:
        A list with one string per input element. Every word found in
        'singkatan' is replaced by the matching 'Full Word' value and the
        words are re-joined with single spaces, so an expansion that itself
        contains spaces stays inside a single list element.
    """
    # Build the lookup once per call instead of rescanning the DataFrame for
    # every word. setdefault keeps the first row of a duplicated
    # abbreviation, which is what a top-down search of the table returns.
    expansions = {}
    for short, full in zip(slang_df['singkatan'], slang_df['Full Word']):
        expansions.setdefault(short, full)

    return [
        ' '.join(expansions.get(word, word) for word in text.split())
        for text in text_list
    ]

# Expand abbreviations row by row: each row's token list is passed as
# text_list, so every token is looked up in slang_df.
data['text_processed'] = data['text_processed'].apply(lambda x: translate_slang_list(x, slang_df))
data

Unnamed: 0,word,weight,text_processed
0,putus tali gantung,-2,"[putus, tali, gantung]"
1,gelebah,-2,[gelebah]
2,gobar hati,-2,"[gobar, hati]"
3,tersentuh (perasaan),-1,"[tersentuh, perasaan]"
4,isak,-5,[isak]
...,...,...,...
10213,melarikan,3,[melarikan]
10214,vakansi,3,[vakansi]
10215,spesial,4,[spesial]
10216,asrama,3,[asrama]


In [10]:
# Stopword list: read without a header row into a single column named
# 'stopword'.
df_stopwords = pd.read_csv('stopwords-id.csv', header=None, names=['stopword'])

def remove_stopwords(text, stopwords=None):
    """Drop stopwords from a token list (case-insensitive).

    Args:
        text: A list of word strings. Any other type is returned unchanged.
        stopwords: Optional iterable of stopwords. When omitted, the
            'stopword' column of the module-level ``df_stopwords`` is used.

    Returns:
        A new list holding the tokens whose lowercase form is not a
        stopword, or ``text`` itself when it is not a list.
    """
    if not isinstance(text, list):
        return text

    if stopwords is None:
        stopwords = df_stopwords['stopword']

    # Lowercase the stopwords once per call rather than once per token, and
    # use a set for constant-time membership tests. Non-string entries
    # (e.g. NaN from a blank line) can never equal a token and are skipped.
    stopword_set = {entry.lower() for entry in stopwords if isinstance(entry, str)}

    return [word for word in text if word.lower() not in stopword_set]

# Filter stopwords out of every row's token list.
data['text_processed'] = data['text_processed'].apply(remove_stopwords)
data

Unnamed: 0,word,weight,text_processed
0,putus tali gantung,-2,"[putus, tali, gantung]"
1,gelebah,-2,[gelebah]
2,gobar hati,-2,"[gobar, hati]"
3,tersentuh (perasaan),-1,"[tersentuh, perasaan]"
4,isak,-5,[isak]
...,...,...,...
10213,melarikan,3,[melarikan]
10214,vakansi,3,[vakansi]
10215,spesial,4,[spesial]
10216,asrama,3,[asrama]


In [11]:
# Module-level Sastrawi stemmer instance.
# NOTE(review): no other code visible in this notebook reads this global
# (lemmatization() binds its own local `stemmer`); confirm it is still needed.
stemmer = StemmerFactory().create_stemmer()


In [12]:
def lemmatization(tokens, stemmer=None):
    """Stem every token and join the results into one space-separated string.

    Despite its name, this function applies a stemmer (Sastrawi by default),
    not a lemmatizer.

    Args:
        tokens: Iterable of word strings.
        stemmer: Optional object exposing ``stem(str) -> str``. When omitted,
            a Sastrawi stemmer is created on first use and reused by every
            later call instead of being rebuilt for each row.

    Returns:
        The stemmed tokens joined by single spaces ('' when there are no
        tokens).
    """
    if stemmer is None:
        # Cache the default stemmer on the function object so the factory
        # runs only once per session.
        stemmer = getattr(lemmatization, '_cached_stemmer', None)
        if stemmer is None:
            stemmer = StemmerFactory().create_stemmer()
            lemmatization._cached_stemmer = stemmer

    return ' '.join(stemmer.stem(token) for token in tokens)

# Stem every row's token list and store the space-joined result in a new
# 'kata' column.
data['kata'] = data['text_processed'].apply(lemmatization)
data


Unnamed: 0,word,weight,text_processed,kata
0,putus tali gantung,-2,"[putus, tali, gantung]",putus tali gantung
1,gelebah,-2,[gelebah],gelebah
2,gobar hati,-2,"[gobar, hati]",gobar hati
3,tersentuh (perasaan),-1,"[tersentuh, perasaan]",sentuh asa
4,isak,-5,[isak],isak
...,...,...,...,...
10213,melarikan,3,[melarikan],lari
10214,vakansi,3,[vakansi],vakansi
10215,spesial,4,[spesial],spesial
10216,asrama,3,[asrama],asrama


In [13]:
# Save the 'kata' column to a UTF-8 encoded CSV file, without the index.
data['kata'].to_csv('nama_file.csv', encoding='utf-8', index=False)


In [14]:
# lexicon_combined = pd.read_csv('lexicon.csv')

# def analisis_sentimen(text, lexicon):
#     words = text.lower().split()
#     score = 0
#     for word in words:
#         if word in lexicon['word'].values:
#             weight = lexicon.loc[lexicon['word'] == word, 'weight'].values[0]
#             score += weight
#     if score > 0:
#         return 'positif'
#     elif score < 0:
#         return 'negatif'
#     else:
#         return 'netral'


# lexicon_combined = pd.read_csv('lexicon.csv')
# corpus = pd.read_csv('corpus.csv')

# def analisis_sentimen(text, lexicon):
#     words = text.lower().split()
#     score = 0
#     for word in words:
#         if word in lexicon['word'].values:
#             weight = lexicon.loc[lexicon['word'] == word, 'weight'].values[0]
#             score += weight

#     if score > 0:
#         return 'positif'
#     elif score < 0:
#         service_words = corpus['kata'].values
#         for service_word in service_words:
#             if service_word in text:
#                 return 'negatif'
#         return 'netral'
#     else:
#         return 'netral'

# data['sentimen'] = data['text_processed'].apply(lambda x: analisis_sentimen(x, lexicon_combined))
# data

In [15]:
# selected_columns = data[['full_text', 'sentimen']]
# selected_columns.to_csv('hasiltanpacorpus.csv', index=False, sep=';', encoding='utf-8')

# selected_columns = data[data['sentimen'] == 'negatif'][['full_text', 'sentimen']]
# selected_columns.to_csv('dataset-negatif.csv', index=False, sep=';', encoding='utf-8')


In [16]:
# negative_words = []
# with open('corpus-negatif.csv', 'r', encoding='utf-8') as file:
#     csv_reader = csv.reader(file)
#     for row in csv_reader:
#         negative_words.append(row[0])

# # Create a function to check if any negative word is in the text
# def label_negative(text):
#     for word in text:
#         if word in negative_words:
#             return 'negatif'
#     return 'positif'

# # Apply the labeling function to your dataset
# data['label'] = data['text_processed'].apply(label_negative)
# data

In [17]:
# new_text = ["pelayanan lambat bgt"]
# new_text_tfidf = tfidf_vectorizer.transform(new_text)

# predicted_sentiment = model.predict(new_text_tfidf)

# predicted_sentiment_label = label_encoder.inverse_transform(predicted_sentiment)

# print("Prediksi:", predicted_sentiment_label[0])
