<a href="https://colab.research.google.com/github/nikenaml/google-play-apps-reviews/blob/master/text_cleaning_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Ask the inline backend for high-resolution ('retina') figures.
%config InlineBackend.figure_format='retina'

# Global seaborn theme; the 'muted' palette set here is replaced two lines
# below by the custom palette.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
# Default figure size as (width, height).
rcParams['figure.figsize'] = 12, 8

# Seed the NumPy and PyTorch random number generators with one shared constant.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Notebook display settings: show every column, never truncate cell text,
# and allow up to 2000 rows to be rendered.
pd.set_option('display.max_columns', None)
# `None` is the supported way to ask for unlimited column width; the old
# sentinel -1 is deprecated and rejected by newer pandas releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 2000)

In [None]:
# Download three files from Google Drive by file ID using the gdown CLI.
# NOTE(review): only 'apps_review-trialerror.csv' is read later in this
# notebook; which ID corresponds to which file is not visible here — TODO label.
!gdown --id 1rVa7buu3F_Vtq1-c7u9Qauqc679jlcfj
!gdown --id 1tMXP9mBk98PCmI0ltmHzNHPrdr46TJkZ
!gdown --id 1qJCIoc3RaP4qB_5gODor_VdK48qI9Ijo

In [None]:
# Load the review dataset from the working directory
# (presumably one of the files fetched with gdown — confirm).
df = pd.read_csv('apps_review-trialerror.csv')
# Preview the last 10 rows.
df.tail(10)

In [None]:
# Dataset dimensions as (rows, columns).
df.shape

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Distribution of the raw review scores.
# The column is passed as `x=`: newer seaborn releases treat the first
# positional argument as `data`, so a bare Series no longer means "x".
sns.countplot(x=df.score)
plt.xlabel('review score');

In [None]:
def to_sentiment(rating):
    """Map a review score to a sentiment class code.

    The score is converted with ``int()`` first. Scores of 2 or lower give 0,
    a score of exactly 3 gives 1, and anything higher gives 2.
    """
    stars = int(rating)
    if stars == 3:
        return 1
    return 0 if stars <= 2 else 2

In [None]:
# Derive the sentiment class code (0, 1 or 2) for every review from its score.
df['sentiment'] = df['score'].map(to_sentiment)

In [None]:
# Display labels for the sentiment codes produced by to_sentiment:
# index 0 -> 'negative', 1 -> 'neutral', 2 -> 'positive'.
class_names = ['negative','neutral','positive']

In [None]:
# Distribution of the derived sentiment classes.
# The column is passed as `x=` because newer seaborn releases treat the first
# positional argument as `data`. `order` pins the bars to codes 0, 1, 2 so the
# tick labels below always line up with the right bar, even if a class is
# missing from the data.
ax = sns.countplot(x=df.sentiment, order=list(range(len(class_names))))
plt.xlabel('review sentiment')
ax.set_xticklabels(class_names);

In [None]:
# Keep only the review text and its sentiment label. The explicit copy makes
# this an independent frame, so the column assignments in later cells
# (e.g. df['text'] = ...) do not write into a slice of the original frame
# and do not trigger SettingWithCopyWarning.
df = df[['content', 'sentiment']].copy()

In [None]:
# Preview the first 20 rows (content and sentiment columns).
df.head(20)

# Data Cleaning

In [None]:
!pip install emot
!pip install emoji
!pip install emoji --upgrade

In [None]:
import re
import string
import emoji

In [None]:
# Emoticon removal (text smileys), driven by emot's EMOTICONS table.
from emot.emo_unicode import UNICODE_EMO, EMOTICONS

# The EMOTICONS keys are joined with '|' without escaping, so each key is used
# as a regex alternative. The pattern is compiled once here instead of being
# rebuilt for every row.
_EMOTICON_PATTERN = re.compile(u'(' + u'|'.join(k for k in EMOTICONS) + u')')


def remove_emoticons(text):
    """Return `text` with every match of an EMOTICONS pattern deleted."""
    return _EMOTICON_PATTERN.sub(r'', text)


# Start the working 'text' column from the raw review content.
df['text'] = df['content'].apply(remove_emoticons)

In [None]:
# Emoji removal, e.g. ✍ 🌷 📌 👈🏻 🖥
# The character class is compiled once here rather than on every call.
# NOTE: the ranges overlap heavily. U+24C2-U+1F251 together with
# U+10000-U+10FFFF cover every code point from U+24C2 upward, so this class
# also strips CJK text and other non-emoji characters in that span.
_EMOJI_PATTERN = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    u"\U00002500-\U00002BEF"  # box drawing, shapes, misc symbols, dingbats, arrows
    u"\U00002702-\U000027B0"  # dingbats
    u"\U000024C2-\U0001F251"  # circled M through enclosed ideographic supplement
    u"\U0001f926-\U0001f937"  # face palm through shrug
    u"\U00010000-\U0010ffff"  # all supplementary planes
    u"\u2640-\u2642"          # gender signs
    u"\u2600-\u2B55"          # misc symbols through heavy large circle
    u"\u200d"                 # zero width joiner
    u"\u23cf"                 # eject symbol
    u"\u23e9"                 # fast-forward
    u"\u231a"                 # watch
    u"\ufe0f"                 # variation selector-16
    u"\u3030"                 # wavy dash
    "]+",
    flags=re.UNICODE,
)


def remove_emoji(text):
    """Return `text` with every run of characters in the emoji class deleted.

    Characters below U+24C2 are kept, apart from the four listed individually
    in the pattern (U+200D, U+231A, U+23CF, U+23E9).
    """
    return _EMOJI_PATTERN.sub(r'', text)

# Strip emoji from the working text column, one review at a time.
df['text'] = df['text'].map(remove_emoji)

In [None]:
# Inspect the first 20 rows after emoticon and emoji removal.
df.head(20)

In [None]:
# Pre-processing functions
def text_cleaning(text):
    """Lower-case `text` and blank out mentions, links and hashtags.

    Each "@..." mention, "http..." link and "#..." hashtag (up to the next
    whitespace) is replaced by a single space, in that order.
    """
    cleaned = text.lower()
    for token_pattern in (r"@[^\s]+", r"http\S+", r"#\S+"):
        cleaned = re.sub(token_pattern, " ", cleaned)
    return cleaned

def removePunctuation(strs):
    """Replace each punctuation character in `strs` with a single space.

    The character class is built from ``string.punctuation`` without escaping.
    Inside the class the ``\\]`` pair is read as an escaped ``]``, so a
    backslash itself is NOT replaced by this function.
    """
    punctuation_class = "[" + string.punctuation + "]"
    return re.sub(punctuation_class, " ", strs)

def replace(strs):
    """Strip escape sequences, the standalone token "rt" and non-ASCII characters.

    Steps, in order:
      1. Literal backslash sequences ``\\t``, ``\\n``, ``\\u`` and any remaining
         backslash become a space.
      2. Real newlines become a space; real tabs are deleted.
      3. The standalone word "rt" is deleted.
      4. Any character that cannot be encoded as ASCII becomes "?".
    """
    strs = strs.replace('\\t',' ').replace('\\n',' ').replace('\\u',' ').replace('\\',' ')
    strs = strs.replace('\n',' ')
    strs = strs.replace('\t','')
    # Only remove "rt" as a whole word. A plain substring replace would also
    # cut it out of ordinary words ("start" -> "sta", "party" -> "pay").
    strs = re.sub(r'\brt\b', '', strs)
    strs = strs.encode('ascii', 'replace').decode('ascii')
    return strs

def remove_non_ascii(text):
    """Return `text` with every character outside the ASCII range dropped."""
    return text.encode('ascii', 'ignore').decode('ascii')

def remove_spasi(strs):
    """Drop single-letter words and normalise whitespace in `strs`.

    Standalone single ASCII letters are blanked out first. Whitespace is
    collapsed and trimmed afterwards, so the gaps left by the removed letters
    do not survive as space runs or as leading/trailing spaces.
    """
    # Blank out one-letter words before normalising, not after.
    strs = re.sub(r"\b[a-zA-Z]\b", " ", strs)
    # Collapse every whitespace run (spaces, tabs, newlines) to one space.
    strs = re.sub(r"\s+", " ", strs)
    return strs.strip()

def remove_single_char(strs):
    """Delete every standalone single ASCII letter from `strs`.

    The surrounding whitespace is left untouched.
    """
    lone_letter = re.compile(r"\b[a-zA-Z]\b")
    return lone_letter.sub("", strs)

In [None]:
### Preprocessing
# `data_reviews` has to be bound before it is used, otherwise a fresh kernel
# fails here with NameError. It is an alias of `df` (not a copy), so the
# cleaned text is visible through either name.
data_reviews = df

# Cleaning steps, applied to every review in exactly this order.
cleaning_steps = [
    str.lower,                               # lower case
    text_cleaning,                           # mentions, links, hashtags
    remove_non_ascii,
    lambda text: re.sub(r'http.*', '', text),  # remove any leftover link
    removePunctuation,                       # remove punctuation
    replace,                                 # remove \n, \t and "rt"
    remove_single_char,
    remove_spasi,                            # normalise whitespace
]

cleaned_text = data_reviews['text']
for step in cleaning_steps:
    cleaned_text = cleaned_text.map(step)
data_reviews['text'] = cleaned_text

In [None]:
# Inspect the first 20 rows after the cleaning steps.
data_reviews.head(20)