In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pandas as pd
from posgres_conn import get_engine_from_settings
from textblob import TextBlob
from pyspark.ml.feature import Tokenizer
from googletrans import Translator
import time
import six
from google.cloud import translate_v2 as translate
import re
import emoji

In [14]:
engine = get_engine_from_settings()

#extract from postgresql
query = """SELECT review_id, content 
            FROM reviews r
            WHERE language='id'
        """
df = pd.read_sql_query(query, engine)

In [15]:
def remove_special_char(text):
    return re.sub(r"[^a-zA-Z0-9\s]","", text)

def remove_emoji(text):
    new_text = re.sub(emoji.get_emoji_regexp(), r"", text)
    return new_text

def remove_multiple_char(text):
    text_token = word_tokenize(text)

    words = []
    for word in text_token:
        if re.search(r"(.)\1{2,}", word):
            new_word = re.split(r"(.)\1{2,}", word)
            new_word = ''.join(new_word)
        else:
            new_word = word
        words.append(new_word)
    new_text = ' '.join(words)
    return new_text

def translator(text):
  new_text = GoogleTranslator(source='id', target='en').translate(text)
  return new_text

def sentiment_score(text):
    testimonial = TextBlob(text)
    score = testimonial.sentiment[0]
    return score

def cleaning(df):
    #case folding
    df['content'] = df['content'].str.lower()

    #remove special character
    df['content'] = df['content'].apply(remove_special_char)

    #remove emoji
    df['content'] = df['content'].apply(remove_emoji)

    #remove multiple character
    df['content'] = df['content'].apply(remove_multiple_char)

    #remove blank row
    df = df.replace(r'^\s*$', float("NaN"), regex=True)
    df = df.replace(r'^([0-9]*)$', float("NaN"), regex=True)
    df.dropna(inplace=True)

    return df

In [17]:
df

Unnamed: 0,review_id,content
0,AOqpTOEEvmhhiDg_knuxA7AsmfSt68YJedz-QOEDPuUupo...,koke
1,AOqpTOFNx1S0eNypUrqtIU6GGzz5cAuCNCHXegYgrnX8Te...,ok
2,AOqpTOEiNmTybyDX3F4pgCNP1ZacpUMPhFeJx_lCPdMCh6...,ok
3,AOqpTOG4Um8QgqXa13FnN_JjRpHfyNCCps-f56nNjEt0Wb...,hencet
4,AOqpTOEelhrdZPFt8N7EoNDWNQX11tiXFqQtyMtfhM__ZI...,ok
...,...,...
471153,AOqpTOEdvSYqEIkRlgi2dzdo4dyi0-ZexXgmdGDer3JcS5...,mantap
471154,AOqpTOHDctQaHJONgw9xc8KPqJWbo2nVzNcC2aviF0sBHc...,aku senang sekali dengan adanya ini aku bisa m...
471155,AOqpTOHI_hQcW4QwjCwcfvGUdSAZDXso-PyRcz54vvIHfJ...,good
471156,AOqpTOFZKR59WD04gUgg7V7iS-MgIxktC4DhuG2xqh5fy6...,i ask for an indonesian translation and i will...


In [16]:
df = cleaning(df)

TypeError: expected string or bytes-like object