In [50]:
import pandas as pd
from posgres_conn import get_engine_from_settings
import re
import emoji
from nltk import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.corpus import stopwords
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [51]:
spark = SparkSession.builder \
    .master("local[*]") \
    .appName("Sentiment Analysis") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

22/03/08 08:38:47 WARN Utils: Your hostname, MacBook-Air-Mufida.local resolves to a loopback address: 127.0.0.1; using 192.168.1.31 instead (on interface en0)
22/03/08 08:38:47 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/03/08 08:38:49 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [53]:
engine = get_engine_from_settings()

#extract from postgresql
query = """SELECT review_id, content, score
            FROM reviews r
            WHERE language='id'
        """
pdf = pd.read_sql_query(query, engine)

In [54]:
df = spark.createDataFrame(pdf)
df = df.repartition(24)
df.write.mode('overwrite').parquet('reviews/lang_id/')

22/03/08 08:39:57 WARN TaskSetManager: Stage 0 contains a task of very large size (8598 KiB). The maximum recommended task size is 1000 KiB.
22/03/08 08:40:04 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
22/03/08 08:40:04 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
22/03/08 08:40:06 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
22/03/08 08:40:07 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 96.54% for 7 writers
22/03/08 08:40:07 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of heap memory
Scaling row group sizes to 84.47% for 8 writers
22/03/08 08:40:07 WARN MemoryManager: Total allocation exceeds 95.00% (906,992,014 bytes) of h

In [55]:
def remove_special_char(text):
    new_text = re.sub(r"[^a-zA-Z\s]","", text)
    new_text = re.sub(emoji.get_emoji_regexp(), r"", new_text)
    new_text = re.sub(r'^\s*$', "", new_text)
    new_text = re.sub(r'^([0-9]*)$', "", new_text)
    return new_text

def remove_multiple_char(text):
    text_token = word_tokenize(text)

    words = []
    for word in text_token:
        if re.search(r"(.)\1{2,}", word):
            new_word = re.split(r"(.)\1{2,}", word)
            new_word = ''.join(new_word)
        else:
            new_word = word
        words.append(new_word)
    new_text = ' '.join(words)
    return new_text

def cleaning(df):
    #case folding
    df['content'] = df['content'].str.lower()

    #remove special character
    df['content'] = df['content'].apply(remove_special_char)

    #remove multiple character
    df['content'] = df['content'].apply(remove_multiple_char)

    #remove blank row
    df.dropna(inplace=True)

    return df

def stemming(stemmer, words):
    # stemming process
    stem_words = [stemmer.stem(word) for word in words]
    return stem_words

def filter_stopwords(words):
    stop_words = set(stopwords.words('indonesian'))
    filtered_words = [word for word in words if word not in stop_words]
    return filtered_words


In [56]:
df = df.withColumn('sentiment_label', when(df.score <= 3,"Negative")
                                        .otherwise("Positive"))

In [59]:
df = df.na.drop(subset=["content"])
df = df.withColumn("content", lower(col("content"))) \
        .withColumn("content", regexp_replace("content", r"[^a-zA-Z\s]", "")) \
        .withColumn("content", regexp_replace("content", r'^\s*$', "")) \
        .withColumn("content", regexp_replace("content", r'^([0-9]*)$', ""))

In [60]:
df.show()

22/03/08 08:50:41 WARN TaskSetManager: Stage 6 contains a task of very large size (8598 KiB). The maximum recommended task size is 1000 KiB.
[Stage 6:>                                                          (0 + 8) / 8]

+--------------------+--------------------+-----+---------------+
|           review_id|             content|score|sentiment_label|
+--------------------+--------------------+-----+---------------+
|AOqpTOEdQu5ZIW0W-...|                bgus|    5|       Positive|
|AOqpTOEo9ZINuFZul...|apk nya bagus ban...|    5|       Positive|
|AOqpTOF3WQQgiYAH3...|                  ok|    5|       Positive|
|AOqpTOE-4nLRpnepl...|                  ok|    5|       Positive|
|AOqpTOGnNvjdUW-vQ...|tolong diplomasi ...|    3|       Negative|
|AOqpTOFw_b_KXzZHT...|       aku akan film|    5|       Positive|
|AOqpTOH64h0C8rLwX...|aku kasih bintang...|    4|       Positive|
|AOqpTOEwBMQwXXSgf...|kalau bagus nanti...|    3|       Negative|
|AOqpTOEOHoDUGJFW4...|tiba gk bisa di b...|    1|       Negative|
|AOqpTOHNKDdvrliO9...|gk bisa d donwloa...|    1|       Negative|
|AOqpTOGhCUwKDfSCy...|aplikasi ga jelas...|    1|       Negative|
|AOqpTOHt-4_XhUqZq...|      aplikasi bagus|    5|       Positive|
|AOqpTOFvy

                                                                                

In [34]:
df['sentiment_label'] = pd.cut(df['score'], bins=[0, 3, 5], labels=['negative', 'positive'])

In [35]:
df[df['content'].isnull()]

Unnamed: 0,review_id,content,score,sentiment_label
112047,AOqpTOGvymhmgsy3dQgClxcdafp37NaCFsxsEePDDQhoxY...,,5,positive
112981,AOqpTOEwAbQtLs58TQzmjorZwvd_tMVrO0mGVvZzETLOvI...,,5,positive
251326,AOqpTOGxRWoq-DWg1z1fBrwxTIFS6K7PfbdMX4SqCco1kH...,,1,negative
262790,AOqpTOEfUDJkq2ra5QiwWuCpIICNN6RBQAUwJMPhLijOMf...,,5,positive
264809,AOqpTOH2ZHo3FTq-Mi7ypQV5KwbWRs5FB58ud8kjGTGifY...,,5,positive
273009,AOqpTOGPe3ogfAJDC8_ZPKnSjfrVpA3p1aDXYWRxDJlc38...,,1,negative
277192,AOqpTOFzf5AdoiPRceyDOO-mgG_qNIOZzr403iw0AOW5-a...,,2,negative
280031,AOqpTOFeluFkTEeEe6eaJC-uVwq04Qj8sm7aF0lN-zLHQ3...,,3,negative
284050,AOqpTOESK76gz5Vwmthdl2vSzeo5COSFBbzEEM4yu30hUz...,,5,positive


In [36]:
df.dropna(inplace=True)


In [37]:
df['content'] = df['content'].str.lower()
df['content'] = df['content'].str.replace(r"[^a-zA-Z\s]","")
df['content'] = df['content'].str.replace(emoji.get_emoji_regexp(), r"")
df['content'] = df['content'].str.replace(r'^\s*$', "")
df['content'] = df['content'].str.replace(r'^([0-9]*)$', "")

  df['content'] = df['content'].str.replace(r"[^a-zA-Z\s]","")
  df['content'] = df['content'].str.replace(r'^\s*$', "")
  df['content'] = df['content'].str.replace(r'^([0-9]*)$', "")


In [38]:
df

Unnamed: 0,review_id,content,score,sentiment_label
0,AOqpTOF71nVO9Y_vub3RBq-zMdOQ4nbhCanCrv26sERHQX...,ok bangettt,5,positive
1,AOqpTOF4Z7XTt2JkuwdLuM2JHhAYvqR13uab0LncdZMIBk...,susah banget daftar vip nya tolong dong admin ...,1,negative
2,AOqpTOHuiIw15rBr1w2BC_uPaaETR2e-HOpm1eJh4rvq8y...,aplikasi paling buruk jangan di download pulsa...,1,negative
3,AOqpTOH_pL0SFUs-x-1qrEVSh_M7ItAXzNd649NpGa8t7X...,gout,5,positive
4,AOqpTOGumLn4cjSZEC6GN9lE0tMaGnVAh7sO9_o-O1l1FZ...,aku menyukai film wetv,5,positive
...,...,...,...,...
471153,b81a2a4e9ba311eca8221eb0cae22a66,kode otp kode otpnya tu manaaaaaaaaaa lama baa...,1,negative
471154,b81a2a9e9ba311eca8221eb0cae22a66,learn from competitors we need dark mode,3,negative
471155,b81a2aee9ba311eca8221eb0cae22a66,chromecast chromecast,1,negative
471156,b81a2b3e9ba311eca8221eb0cae22a66,george lucasd be crying somewhere great conten...,3,negative


In [39]:
df['words'] = df['content'].apply(word_tokenize)
df

Unnamed: 0,review_id,content,score,sentiment_label,words
0,AOqpTOF71nVO9Y_vub3RBq-zMdOQ4nbhCanCrv26sERHQX...,ok bangettt,5,positive,"[ok, bangettt]"
1,AOqpTOF4Z7XTt2JkuwdLuM2JHhAYvqR13uab0LncdZMIBk...,susah banget daftar vip nya tolong dong admin ...,1,negative,"[susah, banget, daftar, vip, nya, tolong, dong..."
2,AOqpTOHuiIw15rBr1w2BC_uPaaETR2e-HOpm1eJh4rvq8y...,aplikasi paling buruk jangan di download pulsa...,1,negative,"[aplikasi, paling, buruk, jangan, di, download..."
3,AOqpTOH_pL0SFUs-x-1qrEVSh_M7ItAXzNd649NpGa8t7X...,gout,5,positive,[gout]
4,AOqpTOGumLn4cjSZEC6GN9lE0tMaGnVAh7sO9_o-O1l1FZ...,aku menyukai film wetv,5,positive,"[aku, menyukai, film, wetv]"
...,...,...,...,...,...
471153,b81a2a4e9ba311eca8221eb0cae22a66,kode otp kode otpnya tu manaaaaaaaaaa lama baa...,1,negative,"[kode, otp, kode, otpnya, tu, manaaaaaaaaaa, l..."
471154,b81a2a9e9ba311eca8221eb0cae22a66,learn from competitors we need dark mode,3,negative,"[learn, from, competitors, we, need, dark, mode]"
471155,b81a2aee9ba311eca8221eb0cae22a66,chromecast chromecast,1,negative,"[chromecast, chromecast]"
471156,b81a2b3e9ba311eca8221eb0cae22a66,george lucasd be crying somewhere great conten...,3,negative,"[george, lucasd, be, crying, somewhere, great,..."


In [42]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()

In [49]:
df['words'] = df['words'].apply(lambda x: stemming(stemmer, x))

KeyboardInterrupt: 

In [46]:
df

['ok', 'bangettt']