In [76]:
import re

def apply_rules(sentence, min_char):
    """
    Run the full cleaning pipeline over one transcript sentence.

    The sentence is rejected up front when other_languages_rule fails.
    Otherwise every cleaning rule is applied in a fixed order and the
    result is kept only if it passes minimum_char_rule.

    Returns the cleaned sentence, or the string "REJECT".
    """
    if not other_languages_rule(sentence):
        return "REJECT"

    # Order matters only in that it mirrors the original pipeline.
    cleaning_steps = (
        filler_word_rule,
        singlish_filler_word_rule,
        singlish_exclamation_rule,
        singlish_lingo_rule,
        acronyms_rule,
        shortforms_rule,
    )
    for step in cleaning_steps:
        sentence = step(sentence)

    if not minimum_char_rule(sentence, min_char):
        return "REJECT"
    return sentence
    

def filler_word_rule(sentence):
    """
    Delete parenthesised filler annotations such as (uh), (mm), (pbo).

    Every "(...)" span is removed together with its brackets. The text
    around a span is not touched, so neighbouring spaces stay in place.
    """
    return re.sub(r'\((.*?)\)', r'', sentence)

def singlish_filler_word_rule(sentence):
    """
    Unwrap Singlish filler annotations such as [ah], [leh], [orh].

    Only the square brackets are dropped; the word inside is kept.
    """
    bracketed = r'\[(.*?)\]'
    return re.sub(bracketed, r'\1', sentence)

def other_languages_rule(sentence):
    """
    Check for other-language spans such as <mandarin>...</mandarin>.

    Nothing is removed from the sentence here: the function only
    reports whether a <tag>...</tag> pair with matching tag names
    occurs in it.

    Returns True when no such pair is found, False otherwise.
    """
    tagged_span = r'\<(\w+)\>(.*?)\<\/\1\>'
    return re.search(tagged_span, sentence) is None
    

def singlish_exclamation_rule(sentence):
    """
    Unwrap Singlish exclamations such as !aiyo!, !haiya!, !huh!.

    The pair of "!" delimiters is dropped; the word between them stays.
    """
    delimited = r'\!(.*?)\!'
    return re.sub(delimited, r'\1', sentence)

def singlish_lingo_rule(sentence):
    """
    Unwrap Singlish lingo markers such as #kor#, #pa#, #Hougang#.

    The pair of "#" delimiters is dropped; the word between them stays.
    """
    return re.sub(r'\#(.*?)\#', r'\1', sentence)

def acronyms_rule(sentence):
    """
    Spell out acronyms written with underscores, e.g. U_E.

    Every "_" in the sentence is replaced by a single space.
    """
    return re.sub(r'_', r' ', sentence)

def shortforms_rule(sentence):
    """
    Clean short-form markers, e.g. poly~.

    Every "~" in the sentence is removed; nothing is put in its place.
    """
    return re.sub(r'~', r'', sentence)

def minimum_char_rule(sentence, minimum_char):
    """
    Tell whether the sentence is long enough to keep.

    Returns True when the sentence has at least minimum_char
    characters, False otherwise.
    """
    length = len(sentence)
    return length >= minimum_char

In [77]:
# Demo: both parenthesised fillers are deleted; the spaces around "ya" stay.
sentence_filler = '(hmm) ya (uh huh)'

filler_word_rule(sentence_filler)

' ya '

In [78]:
# Demo: the square brackets around [lah] and [lor] are dropped, the words are kept.
sentence_singlish_filler = "there's there's many levels [lah] then the whole thing took like just took so long [lor] ya we die a lot of times"
singlish_filler_word_rule(sentence_singlish_filler)

"there's there's many levels lah then the whole thing took like just took so long lor ya we die a lot of times"

In [79]:
# Demo: a sentence containing <mandarin>...</mandarin> spans must fail the rule.
sentence_chinese = "ya ya it really it really I just see one minecraft zombie just somersaulting towards me actually an army of them then then because <mandarin>\u4ed6\u4eec\u4e00\u76f4\u8f6c:ta men yi zhi zhuan</mandarin> then end up like like <mandarin>\u5f88\u96be\u6253\u7684\u5230:hen nan da di dao</mandarin>"

# Pass the sentence defined in this cell, not the one left over from the previous cell.
other_languages_rule(sentence_chinese)

False

In [80]:
# Demo: every underscore in an acronym becomes a single space.
sentence_acronym = "U_E A_N_E"
acronyms_rule(sentence_acronym)

'U E A N E'

In [81]:
# Demo: each "~" short-form marker is removed.
sentence_shortform = "poly~ poly~ poly~ poly~"
shortforms_rule(sentence_shortform)

'poly poly poly poly'