## Prepare our dataset

In [1]:
import os

# The chdir below is relative, so running this cell a second time in the same
# kernel fails (the working directory has already moved). Either restart the
# kernel before re-running, or replace the path with the absolute path of
# your dataset directory.
os.chdir("dataset/txt")
base = "prastyo-sentiment_posneg.txt"

# Read the whole input file into a list of lines (line endings are kept).
# The context manager closes the file even if reading raises, e.g. on a
# decoding error.
with open(base, "r", encoding="utf8") as input_stream:
    input_stream_lines = input_stream.readlines()

In [2]:
# Each input line is "<text>\t<label>": the first tab-separated field is the
# text, the second is the label. The label is the last field of the line, so
# it still carries the line's trailing newline.
text = [row.split("\t")[0] for row in input_stream_lines]
label = [row.split("\t")[1] for row in input_stream_lines]

In [3]:
# Sanity check: the raw lines and the extracted texts should have equal counts
raw_count, text_count = len(input_stream_lines), len(text)
print(raw_count, text_count)

# Show the final sample of each list, each under its own heading
sections = (
    ("\ninput_stream_lines: \n", input_stream_lines),
    ("\n\ntext: \n", text),
    ("\n\nlabel: \n", label),
)
report = []
for heading, samples in sections:
    report.extend((heading, samples[-1:]))
print(*report)

1918 1918

input_stream_lines: 
 ['Apapun agama dan kepercayaanmu, sblum tidur, yuk doakan mereka yg dalam prwatan COVID-19, nakes yg menangani, pemerintah dan warga Indonesia agar bersatu visi, bebas dari pandemi ini dalam waktu sesingkat-singkatnya.\tpos\n'] 

text: 
 ['Apapun agama dan kepercayaanmu, sblum tidur, yuk doakan mereka yg dalam prwatan COVID-19, nakes yg menangani, pemerintah dan warga Indonesia agar bersatu visi, bebas dari pandemi ini dalam waktu sesingkat-singkatnya.'] 

label: 
 ['pos\n']


In [4]:
# Move to the output directory (relative to the dataset directory entered
# earlier) and name the output after the input file: "<input stem>-clean.txt"
os.chdir("../../output")
output = "{}{}".format(os.path.splitext(base)[0], '-clean.txt')

## Cleaning text data with **RegEx**

In [5]:
import re

# Step-4 switch -- choose one:
#   True  -> alt-2: retain hashtags (split words at capital letters)
#   False -> alt-1: remove hashtags entirely
KEEP_HASHTAGS = True


def clean_text(line, keep_hashtags=True):
    """Clean one raw text sample into lowercase, single-spaced ASCII words.

    Args:
        line: Raw text of one sample (without its label).
        keep_hashtags: Selects the Step-4 alternative. If True, a space is
            inserted before every capital letter that follows a lowercase
            letter or precedes one, so "#StayAtHome" ends up as
            "stay at home" (this applies to the whole text, not only to
            hashtags). If False, "#word" tokens are removed instead and no
            capital-letter splitting is done.

    Returns:
        The cleaned, lowercased string without leading/trailing spaces.
    """
    # Step-1: Non-ascii
    res = re.sub(r'[^\x00-\x7F]+', ' ', line)
    # Step-2: URLs
    res = re.sub(r'http[s]?\:\/\/.[a-zA-Z0-9\.\/\_?=%&#\-\+!]+', ' ', res)
    res = re.sub(r'pic.twitter.com?.[a-zA-Z0-9\.\/\_?=%&#\-\+!]+', ' ', res)
    # Step-3: mentions
    res = re.sub(r'\@([\w]+)', ' ', res)

    # Step-4: hashtags
    if keep_hashtags:
        # alt-2: retain hashtags (split string by capital letter)
        res = re.sub(r'((?<=[a-z])[A-Z]|[A-Z](?=[a-z]))', ' \\1', res)
    else:
        # alt-1: remove hashtags
        res = re.sub(r'\#([\w]+)', ' ', res)

    # Step-5: symbols
    res = re.sub(r'[!$%^&*@#()_+|~=`{}\[\]%\-:";\'<>?,.\/]', ' ', res)
    # Step-6: numbers
    res = re.sub(r'[0-9]+', '', res)
    # Step-7: collapse a letter repeated three or more times in a row into a
    # single letter (eg. yukkk -> yuk). Matching the whole run, rather than
    # exactly three letters, keeps longer runs such as "yukkkkk" from leaving
    # repeated letters behind.
    res = re.sub(r'([a-zA-Z])\1{2,}', '\\1', res)

    # Step-8: double/multiple spaces to single space
    res = re.sub(' +', ' ', res)
    # Step-9: space at the beginning and the end of a sentence
    res = res.strip(' ')

    # Step-10: lowercase
    return res.lower()


# Texts and labels are paired by position, so the two lists must line up.
assert len(text) == len(label), "text and label must have the same length"

# count ends up holding the number of lines written (0 if there were none).
count = 0
with open(output, 'w', encoding="utf8") as f:
    for count, (line, line_label) in enumerate(zip(text, label), start=1):
        # Write each cleaned line followed by its label. The label is written
        # as-is; no newline is appended here.
        f.write(clean_text(line, KEEP_HASHTAGS) + "\t" + line_label)

## Note:
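For Step-4, choose either removing hashtags (alt-1) or retaining them by splitting their words at capital letters (alt-2); apply only one of the two alternatives.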
<blockquote><b>ref:</b> <i>https://stackoverflow.com/questions/1097901/regular-expression-split-string-by-capital-letter-but-ignore-tla</i></blockquote>