In [3]:
import pandas as pd
from tabulate import tabulate
import pickle

from numpy import nan

import csv
import re

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from collections import Counter

from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

### 1.0 Import dataset

In [4]:
#Load data into data frame
# The file's own header row is replaced by `names`; only the sentiment label
# and the tweet text are kept.
data = pd.read_csv(
    'Resources/dataset.csv',
    header=0,
    names=["tweet_id", "sentiment", "author", "content"],
    usecols=["sentiment", "content"],
)

#Clean the missing data
# Drop rows whose text is NaN or the literal placeholder '0' in a single
# vectorized pass. isna() recognises every NaN value, whereas a membership
# test against [nan] only succeeds when the very same NaN object is stored.
missing_content = data.content.isna() | data.content.eq('0')
data = data.loc[~missing_content].reset_index(drop=True)

# print(tabulate(data.head(10), showindex=True, headers=data.columns))

### 1.1 Save the original data

In [5]:
#Save the processed data into csv
# Forward slashes are accepted on every OS; a backslash inside a normal string
# literal is parsed as an escape sequence and is not a path separator on Linux/macOS.
data.to_csv('Outputs/csv/preprocessed_data_1.csv', index=False)

#Save the file in pickle format
# The context manager closes the file even if pickling fails.
with open('Outputs/pkl/preprocessed_data_1.pkl', 'wb') as outputs:
    pickle.dump(data, outputs)

### 1.2 Replace the emoticons with words

In [6]:
#Replace the emoticons
# (emoticon, replacement) pairs, applied in the listed order. Every emoticon is
# matched as literal text (regex=False), so characters such as ')' or '$' need
# no escaping and the result does not depend on the pandas default for `regex`.
EMOTICON_WORDS = [
    #Smile
    (':)', 'happy'),
    (':-)', 'happy'),
    ('=)', 'happy'),
    (':]', 'happy'),
    ('=]', 'happy'),
    #Sad
    (':(', 'sad'),
    (':-(', 'sad'),
    ('=(', 'sad'),
    (':[', 'sad'),
    ('=[', 'sad'),
    #Surprise
    (':-O', 'surprise'),
    #Angry
    (':-@', 'angry'),
    #Confused
    # As a regex, ':-$' would anchor at the end of the text instead of matching
    # the dollar sign; literal matching targets the emoticon itself.
    (':-$', 'confused'),
    #Secret
    (':-#', 'secret'),
    #Rolling eyes
    ('@@', 'rolling eyes'),
    #Laughing
    (':-D', 'laughing'),
    #Winking smile
    # NOTE(review): winks are removed rather than replaced by a word - confirm this is intended.
    (';)', ''),
    (';-)', ''),
    #Happy crying
    (":')", 'happy crying'),
    (":'-)", 'happy crying'),
    #Smile with tongue hanging out
    # NOTE(review): only the lower-case ':p' is handled; ':P' is left untouched.
    (':p', 'smile with tongue hanging out'),
]

content = data.content
for emoticon, word in EMOTICON_WORDS:
    content = content.str.replace(emoticon, word, regex=False)
data['content'] = content

#Save the processed data into csv
# Forward slashes are accepted on every OS; a backslash inside a normal string
# literal is parsed as an escape sequence and is not a path separator on Linux/macOS.
data.to_csv('Outputs/csv/preprocessed_data_2.csv', index=False)

#Save the file in pickle format
# The context manager closes the file even if pickling fails.
with open('Outputs/pkl/preprocessed_data_2.pkl', 'wb') as outputs:
    pickle.dump(data, outputs)


# print(tabulate(data.head(26), showindex=True, headers=data.columns))

### 1.3 Text processing
1. Remove the hashtag symbol but keep the word
2. Remove Twitter username mentions
3. Replace URLs with the placeholder word `website`
4. Remove `&amp` HTML entities
5. Convert the text to lower case

In [7]:
#Remove hashtag but remain the words
# '#' is a plain character, so a literal replacement is enough.
data['content'] = data.content.str.replace('#', '', regex=False)

#Remove Twitter username mention
# regex=True is passed explicitly: without it, recent pandas versions treat the
# pattern as literal text and nothing would be removed.
data['content'] = data.content.str.replace(r'@\S+', '', regex=True)

#Remove URL
# Each URL is replaced by the placeholder word 'website' rather than deleted.
data['content'] = data.content.str.replace(r'http\S+', 'website', regex=True)

#Remove &amp
# NOTE(review): '\S+' also consumes any characters attached to the entity
# (e.g. the 'D' in 'R&amp;D') and needs at least one character after '&amp' - confirm this is intended.
data['content'] = data.content.str.replace(r'&amp\S+', '', regex=True)

#Convert the letter to lower case
data['content'] = data.content.str.lower()

#Clean the missing data
# Drop rows whose text is NaN, the placeholder '0', or empty after the previous
# replacements. A boolean mask filters all rows at once instead of dropping
# them one by one, and isna() does not depend on NaN object identity.
missing_content = data.content.isna() | data.content.isin(['0', ''])
data = data.loc[~missing_content].reset_index(drop=True)


#Save the processed data into csv
# Forward slashes are accepted on every OS; a backslash inside a normal string
# literal is parsed as an escape sequence and is not a path separator on Linux/macOS.
data.to_csv('Outputs/csv/preprocessed_data_3.csv', index=False)

#Save the file in pickle format
# The context manager closes the file even if pickling fails.
with open('Outputs/pkl/preprocessed_data_3.pkl', 'wb') as outputs:
    pickle.dump(data, outputs)

# print(tabulate(data.head(10), showindex=True, headers=data.columns))

### 1.4 Remove the punctuation

In [8]:
#Limit the punctuation
# Delete every character that is neither a word character nor whitespace.
# regex=True is required: treated as literal text, the character class would never match.
data['content'] = data.content.str.replace(r'[^\w\s]', '', regex=True)

#Clean the missing data
# Drop rows whose text is NaN, the placeholder '0', or empty once punctuation
# is gone. A boolean mask filters all rows at once instead of dropping them
# one by one, and isna() does not depend on NaN object identity.
missing_content = data.content.isna() | data.content.isin(['0', ''])
data = data.loc[~missing_content].reset_index(drop=True)

#Save the processed data into csv
# Forward slashes are accepted on every OS; a backslash inside a normal string
# literal is parsed as an escape sequence and is not a path separator on Linux/macOS.
data.to_csv('Outputs/csv/preprocessed_data_4.csv', index=False)

#Save the file in pickle format
# The context manager closes the file even if pickling fails.
with open('Outputs/pkl/preprocessed_data_4.pkl', 'wb') as outputs:
    pickle.dump(data, outputs)

# print(tabulate(data.head(10), showindex=True, headers=data.columns))

### 1.5 Reconstruct the abbreviations

In [9]:
#Reconstruct the abbreviations
# Build the lookup table {abbreviation: expansion} from the first two columns
# of the CSV. Every line is lower-cased before parsing, so both keys and
# values are lower case.
# newline='' lets the csv module handle line endings itself, as its documentation recommends.
with open('Resources/Abbreviations.csv', mode='r', newline='') as infile:
    lowered_lines = (line.lower() for line in infile)
    # Rows with fewer than two columns (e.g. blank lines) carry no mapping and are skipped.
    mydict = {row[0]: row[1] for row in csv.reader(lowered_lines) if len(row) >= 2}
    
def process(dat, abbreviations=None, frame=None):
    """Expand abbreviations in a text column and store the result in a frame.

    Each entry of `dat` is lower-cased and split into alternating word and
    non-word runs. Every piece that is a key of the abbreviation table is
    replaced by its expansion; all other pieces (including the separators)
    are kept, so the rest of the text is reproduced unchanged.

    Parameters
    ----------
    dat : pandas.Series
        Text entries to process.
    abbreviations : dict, optional
        Mapping from abbreviation to expansion. Defaults to the module-level
        `mydict`.
    frame : pandas.DataFrame, optional
        Frame whose 'content' column receives the expanded text at the labels
        of `dat`. Defaults to the module-level `data`.

    Returns
    -------
    pandas.Series
        The expanded text, indexed like `dat`.
    """
    lookup = mydict if abbreviations is None else abbreviations
    target = data if frame is None else frame

    def expand(text):
        # The capturing group makes re.split keep the separators, so joining
        # the pieces restores the original spacing.
        pieces = re.split(r'(\W+)', str(text))
        return ''.join(lookup.get(piece, piece) for piece in pieces)

    expanded = dat.str.lower().map(expand)
    # One .loc assignment on the frame itself; element-wise writes through
    # `frame.content[i] = ...` are chained assignments and may not reach the frame.
    target.loc[expanded.index, 'content'] = expanded
    return expanded

# Expand the abbreviations in the tweet text; process() writes the result into data.content.
process(data.content)

#Save the processed data into csv
# Forward slashes are accepted on every OS; a backslash inside a normal string
# literal is parsed as an escape sequence and is not a path separator on Linux/macOS.
data.to_csv('Outputs/csv/preprocessed_data_5.csv', index=False)

#Save the file in pickle format
# The context manager closes the file even if pickling fails.
with open('Outputs/pkl/preprocessed_data_5.pkl', 'wb') as outputs:
    pickle.dump(data, outputs)

# print(tabulate(data.head(10), showindex=True, headers=data.columns))

### 1.6 Stop words removal

In [10]:
#Remove stop words
# A set of stop words to filter the filler words
stop_words = set(stopwords.words('english'))

# Tokenize each tweet and keep only the tokens that are not stop words.
# The whole column is assigned in one statement: writing list values element
# by element through `data.content[i] = ...` is a chained assignment and may
# not update the frame.
data['content'] = data.content.apply(
    lambda text: [token for token in word_tokenize(text) if token not in stop_words]
)

    
#Clean the missing data
# After stop-word removal every entry is a list of tokens, so a comparison
# with '0' or NaN can never match. "Missing" now means a tweet with no tokens
# left; those rows are dropped. Entries without a length are kept, because
# .str.len() yields NaN for them.
no_tokens = data.content.str.len().eq(0)
data = data.loc[~no_tokens].reset_index(drop=True)


#Save the processed data into csv
# Forward slashes are accepted on every OS; a backslash inside a normal string
# literal is parsed as an escape sequence and is not a path separator on Linux/macOS.
data.to_csv('Outputs/csv/preprocessed_data_6.csv', index=False)

#Save the file in pickle format
# The context manager closes the file even if pickling fails.
with open('Outputs/pkl/preprocessed_data_6.pkl', 'wb') as outputs:
    pickle.dump(data, outputs)

# print(tabulate(data.head(10), showindex=True, headers=data.columns))

### 1.7 Word Correction

In [11]:
#Correct the words
def words(text):
    """Return every run of word characters in `text`, lower-cased, in order of appearance."""
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

# Word-frequency table built from the corpus file. The context manager closes
# the file as soon as its text has been read.
with open('big.txt') as corpus:
    WORDS = Counter(words(corpus.read()))

def P(word, N=sum(WORDS.values())):
    """Relative frequency of `word` in WORDS.

    `N` defaults to the total of all counts in WORDS, evaluated once when the
    function is defined. A word that is absent from WORDS has a count of 0.
    """
    occurrences = WORDS[word]
    return occurrences / N

def correction(word):
    """Return the candidate spelling of `word` with the highest probability P."""
    options = candidates(word)
    return max(options, key=P)

def candidates(word):
    """Return the possible spelling corrections for `word`.

    The first non-empty group wins, in this order: the word itself if known,
    known words one edit away, known words two edits away, and finally the
    unchanged word as a one-element list.
    """
    exact = known([word])
    if exact:
        return exact

    one_edit_away = known(edits1(word))
    if one_edit_away:
        return one_edit_away

    two_edits_away = known(edits2(word))
    if two_edits_away:
        return two_edits_away

    return [word]

def known(words):
    """Return the set of entries of `words` that are keys of WORDS."""
    return {entry for entry in words if entry in WORDS}

def edits1(word):
    """Return the set of strings exactly one edit away from `word`.

    An edit is deleting one character, swapping two adjacent characters,
    replacing one character with a lower-case ASCII letter, or inserting one
    such letter at any position.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    # Every way of cutting the word into a head and a tail (either may be empty).
    pairs = [(word[:cut], word[cut:]) for cut in range(len(word) + 1)]
    # Deleting and replacing act on the first character of the tail, so they need a non-empty tail.
    with_tail = [(head, tail) for head, tail in pairs if tail]

    dropped = [head + tail[1:] for head, tail in with_tail]
    swapped = [head + tail[1] + tail[0] + tail[2:] for head, tail in with_tail if len(tail) > 1]
    substituted = [head + letter + tail[1:] for head, tail in with_tail for letter in alphabet]
    added = [head + letter + tail for head, tail in pairs for letter in alphabet]

    return set(dropped + swapped + substituted + added)

def edits2(word):
    """Lazily yield every string two edits away from `word` (duplicates are possible)."""
    first_pass = edits1(word)
    return (twice for once in first_pass for twice in edits1(once))
    

In [12]:
#Run the function
# Correct every distinct token once and reuse the result: correction() is
# costly for unknown words, and tweets repeat the same tokens many times.
distinct_words = {word for tokens in data.content for word in tokens}
corrected = {word: correction(word) for word in distinct_words}

# Assign the whole column in one statement; per-element writes through
# `data.content[i] = ...` are chained assignments and may not update the frame.
data['content'] = data.content.apply(lambda tokens: [corrected[word] for word in tokens])

#Save the processed data into csv
# Forward slashes are accepted on every OS; a backslash inside a normal string
# literal is parsed as an escape sequence and is not a path separator on Linux/macOS.
data.to_csv('Outputs/csv/preprocessed_data_7.csv', index=False)

#Save the file in pickle format
# The context manager closes the file even if pickling fails.
with open('Outputs/pkl/preprocessed_data_7.pkl', 'wb') as outputs:
    pickle.dump(data, outputs)

### 1.8 Stem the words

In [13]:
#Stemming the words
ps = PorterStemmer()

# Stem every token of every tweet. The column is assigned in one statement;
# per-element writes through `data.content[i] = ...` are chained assignments
# and may not update the frame.
data['content'] = data.content.apply(lambda tokens: [ps.stem(word) for word in tokens])

#Save the processed data into csv
# Forward slashes are accepted on every OS; a backslash inside a normal string
# literal is parsed as an escape sequence and is not a path separator on Linux/macOS.
data.to_csv('Outputs/csv/preprocessed_data_8.csv', index=False)

#Save the file in pickle format
# The context manager closes the file even if pickling fails.
with open('Outputs/pkl/preprocessed_data_8.pkl', 'wb') as outputs:
    pickle.dump(data, outputs)