In [None]:
#Import necessary library
import pandas as pd
import os
import requests
import urllib
import xlrd
import re

In [None]:
# Folder locations used by the notebook, relative to the working directory.
# Each ends with '/' because paths are built by plain string concatenation.
BASE = './scrapedData/'   # individual scraped .xlsx workbooks
TEMP = './tempData/'      # intermediate batch files
FINAL = './finalData/'    # destination folder for the final corpus

## Combining the individual .xlsx files into larger batch .xlsx files

In [None]:
# Collect the path of every scraped workbook in the scrapedData sub-folder.
# os.listdir returns names in arbitrary order and also lists stray files, so:
#   * keep only .xlsx workbooks and skip Excel's '~$' lock files, and
#   * sort case-insensitively so the index-based batches built later always
#     contain the same files, whatever the operating system or filesystem.
fileList = sorted(
    (BASE + name
     for name in os.listdir(BASE)
     if name.lower().endswith('.xlsx') and not name.startswith('~$')),
    key=str.lower,
)

# display individual .xlsx paths found in the scrapedData sub-folder
fileList

['./scrapedData/scrapingData_ACJC-Confessions-365341810240423_FULL.xlsx',
 './scrapedData/scrapingData_andiechen_FULL.xlsx',
 './scrapedData/scrapingData_asrjcconfessions_FULL.xlsx',
 './scrapedData/scrapingData_bellywellyjelly_FULL.xlsx',
 './scrapedData/scrapingData_benjamin.kheng_FULL.xlsx',
 './scrapedData/scrapingData_bossyflossie_FULL.xlsx',
 './scrapedData/scrapingData_cjcroxx_FULL.xlsx',
 './scrapedData/scrapingData_DanielFoodDiary_FULL.xlsx',
 './scrapedData/scrapingData_DHS-Confessions-103690209814932_FULL.xlsx',
 './scrapedData/scrapingData_DollarsAndSenseSG_FULL.xlsx',
 './scrapedData/scrapingData_dreachongofficial_FULL.xlsx',
 './scrapedData/scrapingData_HwaChongConfessions_FULL.xlsx',
 './scrapedData/scrapingData_ieatishootipost_FULL.xlsx',
 './scrapedData/scrapingData_InnovaConfessions_FULL.xlsx',
 './scrapedData/scrapingData_ITE-College-Central-Confessions-102332676616681_FULL.xlsx',
 './scrapedData/scrapingData_ITE-College-West-Confessions-123845157793064_FULL.xlsx',
 

In [None]:
# Show how many workbook paths were collected in fileList
len(fileList)

45

In [None]:
# Combining excel files in bigger batches
def excelCombiner(idxLow,idxHigh):
    """Stack the first sheet of the workbooks in fileList[idxLow:idxHigh].

    Every workbook is parsed without a header row, so each file's header
    arrives as its row 0. That row is kept for the first workbook of the
    batch only and dropped for the others (this assumes the header is the
    first row of every file).

    Parameters
    ----------
    idxLow, idxHigh : int
        Slice bounds into the module-level ``fileList`` (``idxHigh`` is
        exclusive, as in any Python slice).

    Returns
    -------
    pandas.DataFrame
        The concatenated rows, or an empty DataFrame when the slice selects
        no files (``pd.concat`` would raise on an empty list).
    """
    frames = []
    for position, path in enumerate(fileList[idxLow:idxHigh]):
        # The context manager closes the workbook handle as soon as it is parsed.
        with pd.ExcelFile(path) as workbook:
            frame = workbook.parse(workbook.sheet_names[0], header=None, index_col=None)
        # Drop the repeated header row for every workbook after the first.
        frames.append(frame if position == 0 else frame[1:])

    if not frames:
        return pd.DataFrame()

    return pd.concat(frames)

## Save the combined data as larger Excel batch files

In [None]:
# Batch 1: combine the workbooks at fileList[0:34] and write the result,
# without header or index, to the temp folder
excelBatch1 = excelCombiner(0,34)
excelBatch1.to_excel(TEMP+"excelBatch1.xlsx", header=False, index=False)

In [None]:
# Batch 2: combine the remaining workbooks, fileList[34:], and write the
# result, without header or index, to the temp folder.
# The slice end is exclusive, so len(fileList) already includes the last file;
# the former len(fileList)+1 only worked because slicing clamps the bound.
excelBatch2 = excelCombiner(34,len(fileList))
excelBatch2.to_excel(TEMP+"excelBatch2.xlsx", header=False, index=False)

## Cleaning of Data

In [None]:
# details of individual batches of .xlsx
def xlsxBatchInfo(subFolder,xlsxBatchFile):
    """Return the path of a batch workbook and the row count of its first sheet.

    The path is the plain concatenation ``subFolder + xlsxBatchFile``; the
    row count is ``nrows`` of sheet index 0 as reported by xlrd.

    NOTE(review): xlrd 2.0 and later read only .xls files -- confirm the
    installed xlrd version still opens .xlsx workbooks.
    """
    path = subFolder + xlsxBatchFile
    first_sheet = xlrd.open_workbook(path).sheet_by_index(0)
    return path, first_sheet.nrows

In [None]:
# Look up the path and first-sheet row count of the batch 1 workbook;
# batch1 (the path) is the input to cleanedData further down
batch1, noOfRows1 = xlsxBatchInfo(TEMP,"excelBatch1.xlsx")
print(batch1)
print(noOfRows1)

./tempData/excelBatch1.xlsx
64003


In [None]:
# Look up the path and first-sheet row count of the batch 2 workbook;
# batch2 (the path) is the input to cleanedData further down
batch2, noOfRows2 = xlsxBatchInfo(TEMP,"excelBatch2.xlsx")
print(batch2)
print(noOfRows2)

./tempData/excelBatch2.xlsx
18923


In [None]:
# Function to clean the .xlsx data and export as .txt files by batches 
# because printing the text log has a limit in jupyter notebook

# URL matcher, kept exactly as in the original cleaning step.
_URL_PATTERN = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"

# Emoticon alternatives (basic), kept exactly as in the original cleaning step.
_EMOTICON_ALTERNATIVES = r":-\)|:\)|:-\]|:\]|:-3|:3|:->|:>|8-\)|8\)|:-}|:}|:o\)|:c\)|:\^\)|=\]|=\)|:\(|=\(|:-D|:D|8-D|8D|x-D|xD|X-D|XD|=D|=3|B\^D|:-\)\)|:-\(|:-c|:c|:-<|:<|:-\[|:\[|>:\[|:{|:@|;\(|:'‑\(|:'\(|:'-\)|:\'\)|D-':|D:<|D:|D8|D;|D=|DX|:‑O|:O|:‑o|:o|:-0|8-0|>:O|:-\|:\|:×|;-\)|;\)|;-]|;]|;\^\)|;D|:-P|:-P|:-P|:P|X-P|XP|x-p|xp|:-p|:p|:-b|:b|d:|=p|>:P|:-\/|:\/|>:\\|>:\/|:\\|=\/|=\\|:L|=L|:S|:‑\||:\||:\$|:-X|:X|:‑#|:#|:&|:-&|>:‑\)|>;\)|>:3|<\/3|<3|\\o\/|\\\0\/\|v.v|>.<|[Oo][.|][oO]|[uU]w[uU]|;:\)|:-[Pp]|\'[o|-]\'|x[0|o|O]|\-\\-|:\||\^[|.]\^|\*\\*|:\*"

# Zero-width guard: succeeds unless the characters on BOTH sides of the current
# position are alphanumeric. Placed before and after the emoticon alternatives
# it stops letter/digit emoticons from matching inside words ("xp" in
# "expect", "d:" in "said:") while still stripping ":)" glued to a word.
_NOT_INSIDE_WORD = r"(?:(?<![A-Za-z0-9])|(?![A-Za-z0-9]))"

# Month names and abbreviations; the original "Feburary" misspelling is still
# accepted alongside the correct "February".
_MONTH_NAMES = (
    "January|Jan|February|Feburary|Feb|March|Mar|April|Apr|May|June|Jun|July|Jul|"
    "August|Aug|September|Sep|October|Oct|November|Nov|December|Dec"
)

# Ordered (pattern, replacement) pairs. The order matters: e.g. newlines are
# turned into '.' before runs of dots are collapsed, and whitespace is
# compressed last so gaps left by earlier removals are closed.
_CLEANING_STEPS = [(re.compile(_pattern), _replacement) for _pattern, _replacement in (
    # hashtags
    (r"#(\S+)?", ""),
    # websites
    (_URL_PATTERN, ""),
    # tags [ ]
    (r"\[.*?(\S*)?\]", ""),
    # angular brackets < >
    (r"\<.*?(\S*)?\>", ""),
    # newline -> fullstop
    (r"\n", "."),
    # date-time such as 12/05/2017, 10:30 or 1.2.2018. Raw string so \b is a
    # word boundary (in a normal string it is a backspace and never matches);
    # digits are required on both sides of a separator so plain punctuation
    # between words ("said.be") is left alone. Decimals like 2.61 match too,
    # because '.' is one of the separators.
    (r"\b[0-9]+(?:[-/.:][0-9]+)+\b", ""),
    # month names with an optional day before and an optional day/year after.
    # Word boundaries keep "May" from being cut out of "Maybe"; requiring
    # digits before an ordinal suffix keeps "with May" from losing its "th".
    # Still case-sensitive, so a capitalised modal "May" is removed as well.
    (r"(?:\b[0-9]+(?:st|nd|rd|th)?\s+)?\b(?:" + _MONTH_NAMES + r")\b(?:\s+[0-9]+(?:st|nd|rd|th)?\b)?", ""),
    # parentheses ( )
    (r"\(.*?(\S*)?\)", ""),
    # mentions @
    (r"@(\S+)?", ""),
    # equal signs
    (r"\=+", ""),
    # runs of dots -> single dot
    (r"\.\.+", "."),
    # forward slashes -> space
    (r"\/+", " "),
    # underscores
    (r"\_+", ""),
    # runs of 3+ identical characters -> 2 ("sooooo" -> "soo"); digits are
    # excluded so numbers such as 1000 keep their value
    (r"([^\d\n])\1*(?=\1\1)", ""),
    # emoticons (basic), not matched in the middle of a word
    (_NOT_INSIDE_WORD + "(?:" + _EMOTICON_ALTERNATIVES + ")" + _NOT_INSIDE_WORD, ""),
    # contractions; \b keeps them from firing inside longer words
    # ("parent" contains "arent", "Cantonese" contains "Cant")
    (r"'ll\b", " will"),
    (r"'ve\b", " have"),
    (r"\b[Dd]on'?t\b", " do not"),
    (r"\b[Aa]ren'?t\b", " are not"),
    (r"\b[wW]on'?t\b", " will not"),
    (r"\b[cC]an'?t\b", " can not"),
    (r"\b[sS]han'?t\b", " shall not"),
    (r"'m\b", " am"),
    (r"\b[dD]oesn'?t\b", " does not"),
    (r"\b[dD]idn'?t\b", " did not"),
    (r"\b[hH]asn'?t\b", " has not"),
    (r"\b[hH]aven'?t\b", " have not"),
    (r"\b[wW]ouldn'?t\b", " would not"),
    (r"\b[iI]t's\b", " it is"),
    # was ' would not' (copy-paste slip)
    (r"\b[tT]hat's\b", " that is"),
    (r"\b[wW]eren'?t\b", " were not"),
    # double quotes
    (r"\"", ""),
    # compress whitespace runs into one space
    (r"\s\s+", " "),
)]


def _cleanText(raw):
    """Apply every cleaning step, in order, to one cell value and return the text."""
    # Dropping everything outside ASCII removes chinese characters and emojis.
    text = str(raw).encode("ascii", errors = "ignore").decode()
    for pattern, replacement in _CLEANING_STEPS:
        text = pattern.sub(replacement, text)
    return text


def cleanedData(file, verbose=True):
    """Clean column 0 of the first sheet of an Excel workbook.

    Each cell is converted to ASCII text, stripped of hashtags, URLs,
    bracketed/parenthesised spans, dates, month names, mentions, emoticons
    and stray symbols, has common contractions expanded, and is lower-cased.
    Rows that contain no letter after cleaning are discarded.

    Parameters
    ----------
    file : str
        Path of the workbook, opened with ``xlrd.open_workbook``.
    verbose : bool, default True
        When True, print the same progress log as before (a blank line per
        input row, every kept row, and the final count). Pass False to
        suppress the log.

    Returns
    -------
    tuple(list of str, int)
        The cleaned, lower-cased rows and how many there are.
    """
    someList = list()

    workbook = xlrd.open_workbook(file)
    sheet = workbook.sheet_by_index(0)
    for x in range(sheet.nrows):
        if verbose:
            print()

        cleaned_item = _cleanText(sheet.cell_value(x, 0)).lower()

        #append to cleaned list if it contains at least an alphabet
        # (the text is ASCII and lower-cased here, so [a-z] covers every letter)
        if re.search('[a-z]', cleaned_item):
            someList.append(cleaned_item)
            if verbose:
                print("---After encoding/decoding at row "+str(len(someList)))#Print line
                print(cleaned_item)#Print appended result

    if verbose:
        print()
        print("Total len of List: "+str(len(someList)))
    return someList, len(someList)

In [None]:
### Save batch data as .txt file
def saveTxt(inputFile,outputFile,destFolder):
    """Write every item of ``inputFile`` on its own line of a text file.

    ``inputFile`` is an iterable of items (despite its name, not a file);
    the target path is the plain concatenation ``destFolder + outputFile``,
    and an existing file at that path is overwritten.
    """
    target = destFolder + outputFile
    with open(target, 'w') as handle:
        handle.writelines("%s\n" % entry for entry in inputFile)

In [None]:
# Start an empty list that collects the number of cleaned lines per batch.
# Re-run this cell before re-running the batch cells, otherwise their counts
# are appended a second time.
noOfLinesList = list()

In [None]:
# Clean the text corpus in batches
# batch 1: cleanedData returns the cleaned rows and how many there are
dataBatch1, noOfLines1 = cleanedData(batch1)
# append number of lines into the list
# NOTE: not idempotent -- running this cell twice appends the count twice and
# inflates sum(noOfLinesList)
noOfLinesList.append(noOfLines1)

iend of mine. i think about it everyday, what i should have said.be if i pushed him off earlier, maybe if i had said no louder, it would not have happened. coping was even harder. the only thought in my mind was \i am not a virgin anymore.confess at: 

---After encoding/decoding at row 63429
.that rude idiot who tries to challenge the prof. try that in ethics class where majority of the profs are law profs and see whether u can get out 'alive'. disrespectful. think so smart already is it.be you are but when you do not even have basic manners, you are lower than those who are uneducated.confess at: 

---After encoding/decoding at row 63430
.i am currently with a married man 2 decades my senior.confess at: 

---After encoding/decoding at row 63431
.why are the so many crybabies lamenting about how low is their gpa when it is still above 3? i am in my 3-2 with cgpa 2.61 and i couldn't care less. most companies will not ask for your gpa anyway.confess at: 

---After encoding/decoding at ro

In [None]:
# Save the cleaned batch 1 rows, one per line, as dataBatch1.txt in the temp folder
saveTxt(dataBatch1,"dataBatch1.txt",TEMP)

In [None]:
## NEXT BATCH

In [None]:
# Clean the text corpus in batches
# batch 2: cleanedData returns the cleaned rows and how many there are
dataBatch2, noOfLines2 = cleanedData(batch2)
# append number of lines into the list
# NOTE: not idempotent -- running this cell twice appends the count twice and
# inflates sum(noOfLinesList)
noOfLinesList.append(noOfLines2)

hacks and tips to help you ace your exams! 

---After encoding/decoding at row 18620
hahaha chicken nut bread 

---After encoding/decoding at row 18621
heres a quick guide on how to recoup your chinese new year losses!! use this special deal here confirm can one!!

---After encoding/decoding at row 18622
imagine this happening to you 

---After encoding/decoding at row 18623
the best cny excuse generator for all those annoying questions by your relatives.- by the sibeh cute wuba from monster hunt 2 thats in cinemas !!.pause the video to generate your excuse!

---After encoding/decoding at row 18624
hahaha these chinese new year lame jokes confirm will make you go. huat did i just hear?

---After encoding/decoding at row 18625
you guys asked for it, so here it is: how to be an insurance agent in singapore 101!! 

---After encoding/decoding at row 18626
when someone tries to start a diet during chinese new year! .its always a lost cause with all the pineapple tarts. bak kwas. love letter

In [None]:
# Save the cleaned batch 2 rows, one per line, as dataBatch2.txt in the temp folder
saveTxt(dataBatch2,"dataBatch2.txt",TEMP)

In [None]:
# Placeholder for a third batch -- disabled, since only two batches are produced above
#saveTxt(dataBatch3,"dataBatch3.txt",TEMP)

In [None]:
# Total number of cleaned lines across all batches appended to noOfLinesList
sum(noOfLinesList)

82225

## Merge all .txt batches into one big corpus

In [None]:
import glob

# Merge every cleaned .txt batch in the temp folder into one corpus file.
# glob returns matches in an order that depends on the filesystem, so sort the
# paths to make the line order of the corpus reproducible across runs.
read_files = sorted(glob.glob(TEMP + "*.txt"))

# Create the destination folder on first run instead of failing inside open().
os.makedirs(FINAL, exist_ok=True)

# Binary mode copies the bytes of each batch unchanged, one file after another.
with open(FINAL + "SgCorpus.txt", "wb") as outfile:
    for f in read_files:
        with open(f, "rb") as infile:
            outfile.write(infile.read())