In [3]:
import re

import nltk
from nltk.tokenize.treebank import TreebankWordDetokenizer
from nltk.corpus import stopwords

import gensim
from gensim.utils import simple_preprocess

import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Requires pyspellchecker and textblob. Uncomment the next line to install them:
# %pip install pyspellchecker textblob

# Import dataset

In [9]:
# Load the labelled review file; it is decoded as latin1.
df = pd.read_csv(
    './df_english_labelled.csv',
    encoding='latin1',
)
# Preview the first rows.
df.head()

Unnamed: 0.1,Unnamed: 0,reviewId,userName,userImage,content,score,thumbsUpCount,reviewCreatedVersion,at,replyContent,repliedAt,is_english,label
0,1,3577f7a1-3394-4e77-813d-095a82cf8bcf,Kemar Richardson,https://play-lh.googleusercontent.com/a-/ACNPE...,Great,5,0,26.3.4,10/4/2022 20:32,,,True,POSITIVE
1,2,7c8c56d9-d8ad-47d4-b24b-5289aa4529ff,Tracy Dunn,https://play-lh.googleusercontent.com/a/ALm5wu...,good,5,0,26.4.3,10/4/2022 20:31,,,True,POSITIVE
2,3,80db804f-cccd-4b09-b690-abc12cbf0612,SG. Mugo. (Mugoz:),https://play-lh.googleusercontent.com/a-/ACNPE...,Good app,5,0,26.3.4,10/4/2022 20:30,,,True,POSITIVE
3,4,4ed35e90-0f45-4865-81c4-b3a6f2ea49f7,Mwansa Judy,https://play-lh.googleusercontent.com/a-/ACNPE...,Most amazing app,5,0,26.3.4,10/4/2022 20:29,,,True,POSITIVE
4,5,bd35bbe9-73c9-4e17-acaf-7aa1a71caed3,Muhammad Sajid,https://play-lh.googleusercontent.com/a-/ACNPE...,full Entertainment,4,0,25.9.4,10/4/2022 20:28,,,True,POSITIVE


In [10]:
# Cast every column to str so the regex-based cleaning always receives text.
# NOTE(review): missing values presumably become the literal string "nan"
# here -- confirm that is acceptable for the review text.
df = df.astype(str)

# Clean dataset

In [4]:
def depure_data(data):
    """Strip URLs, e-mail-like tokens, extra whitespace and single quotes.

    Args:
        data: Raw review text as a ``str``.

    Returns:
        The cleaned text. Whitespace runs are collapsed to a single space,
        but a leading or trailing space left behind by a removed URL is
        not stripped.
    """
    # All patterns are raw strings so regex escapes such as \S and \s are
    # passed to `re` verbatim instead of being parsed as (invalid) Python
    # string escapes.

    # Drop http(s):// links and bare www. links.
    data = re.sub(r'https?://\S+|www\.\S+', '', data)

    # Drop every whitespace-delimited token containing '@' (e-mail
    # addresses, and @mentions as well), plus one trailing whitespace char.
    data = re.sub(r'\S*@\S*\s?', '', data)

    # Collapse each whitespace run (newlines and tabs included) to one space.
    data = re.sub(r'\s+', ' ', data)

    # Remove single quotes / apostrophes ("don't" -> "dont").
    return data.replace("'", "")


def detokenize(text):
    """Join a list of word tokens back into a single string.

    Delegates to NLTK's ``TreebankWordDetokenizer``; a fresh detokenizer
    instance is created on every call.
    """
    detokenizer = TreebankWordDetokenizer()
    sentence = detokenizer.detokenize(text)
    return sentence


def sent_to_words(sentences):
    """Lazily tokenize each item with gensim's ``simple_preprocess``.

    Every item is passed through ``str()`` first and tokenized with
    ``deacc=True``; one token list is yielded per input item, in order.
    """
    tokenized = (
        gensim.utils.simple_preprocess(str(sentence), deacc=True)
        for sentence in sentences
    )
    yield from tokenized
        
def correct_word_spelling(word):
    """Return TextBlob's spelling correction for a single word.

    Args:
        word: The token to correct, as a ``str``.

    Returns:
        The result of ``textblob.Word(word).correct()``.
    """
    # Imported locally because the notebook only imports textblob in later
    # cells; without this the function raises NameError unless one of those
    # cells has already been executed.
    from textblob import Word

    return Word(word).correct()

In [11]:
# Keep only the review text and its label.
train = df.loc[:, ['content', 'label']]
train.head()

Unnamed: 0,content,label
0,Great,POSITIVE
1,good,POSITIVE
2,Good app,POSITIVE
3,Most amazing app,POSITIVE
4,full Entertainment,POSITIVE


### Remove URLs, email addresses, new lines & single quotes

In [12]:
# Pull the review text out as a plain Python list, then clean every entry
# with depure_data.
data_to_list = train['content'].values.tolist()
temp = [depure_data(review) for review in data_to_list]
# Show the first five cleaned reviews.
temp[:5]

['Great', 'good', 'Good app', 'Most amazing app', 'full Entertainment']

### Tokenize and remove punctuation & accent marks

In [13]:
# Tokenize the cleaned reviews; sent_to_words is a generator, so its
# results are materialized into a list of token lists.
data_words = [tokens for tokens in sent_to_words(temp)]

print(data_words[:10])
print(len(data_words))

99829


# Spelling Corrector

In [100]:
from textblob import TextBlob, Word
from spellchecker import SpellChecker
import re

spell = SpellChecker()

# Matches any run of a repeated character so it can be collapsed to exactly
# two occurrences ("gooood" -> "good", "appp" -> "app").
repeated_char_run = re.compile(r'(.)\1+')

# Words that must never be sent to the spelling corrector.
protected_words = {"tiktok"}

# Memo of reduced word -> final word, so the dictionary lookup and the
# TextBlob correction run once per distinct word instead of once per
# occurrence in the corpus.
corrections = {}

# data_words is updated in place: each token is replaced by its
# reduced / corrected form.
for sentence in data_words:
    for j, word in enumerate(sentence):
        reduced_word = repeated_char_run.sub(r'\1\1', word)

        if reduced_word not in corrections:
            # Only correct words the spell checker does not recognise.
            if reduced_word not in protected_words and spell.unknown([reduced_word]):
                corrections[reduced_word] = correct_word_spelling(reduced_word)
            else:
                corrections[reduced_word] = reduced_word

        sentence[j] = corrections[reduced_word]

### Detokenize each word list back into a sentence and collect the sentences in one list

In [101]:
# Rebuild one string per review from its token list.
data = [detokenize(tokens) for tokens in data_words]
print(data[:10])



## Small corpus to test spelling autocorrection function

In [14]:
from textblob import TextBlob, Word
from spellchecker import SpellChecker
import re

spell = SpellChecker()

# Hand-written token lists with deliberate misspellings and stretched words.
text = [
    ['great'],
    ['gooood'],
    ['good', 'appp'],
    ['most', 'emazing', 'app'],
    ['full', 'enterteinment'],
    ['nicee', 'app'],
    ['superbbb', 'and', 'good', 'app'],
    ['tiktok', 'warnin', 'and', 'video', 'views', 'problem', 'in', 'my',
     'id', 'manimeraj', 'server', 'nepal', 'tiktok', 'down'],
    ['tiktok', 'is', 'nicee', 'app'],
    ['need', 'my', 'tiktok', 'open', 'note'],
]

# Each sentence is corrected in place and printed once all its tokens
# have been processed.
for sentence in text:
    for j, word in enumerate(sentence):
        # Collapse any repeated-character run to two occurrences.
        candidate = re.sub(r'(.)\1+', r'\1\1', word)
        # "tiktok" is exempt; everything else is corrected only when the
        # spell checker reports it as unknown.
        needs_fix = candidate != "tiktok" and len(spell.unknown([candidate])) > 0
        sentence[j] = correct_word_spelling(candidate) if needs_fix else candidate
    print(sentence)

['great']
['good']
['good', 'app']
['most', 'amazing', 'app']
['full', 'entertainment']
['nice', 'app']
['superb', 'and', 'good', 'app']
['tiktok', 'is', 'nice', 'app']
['need', 'my', 'tiktok', 'open', 'note']


In [15]:
# Rebuild one string per test sentence from its token list.
# NOTE(review): this rebinds `data`, the same name the full-corpus cell
# assigns -- confirm that overwriting it here is intended.
data = [detokenize(tokens) for tokens in text]
print(data[:10])

