# __Part II: Data Preprocessing__

## __Import Libraries and Dataset__

## 1 Import Libraries

In [1]:
import numpy as np
import pandas as pd
from collections import Counter
import html
import re
from nltk.sentiment.util import mark_negation
from nltk.corpus import stopwords
from stop_words import get_stop_words
import string
import contractions
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import utils

Using TensorFlow backend.


In [2]:
# Set dataframe display
pd.set_option('display.max_columns', None)
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', -1)

In [3]:
# Define path of data
data_folder = "C:\\Users\\DoZoYo\\OneDrive - Mahidol University\\611 Advanced ML\\Sentiment Analysis\\data\\"

## 2 Import dataset to dataframe

> For score = 1 or 2 means Negative(-1) <br>
> For score = 3 means Neutral(0) <br>
> For score = 4 or 5 means Positive(1)

In [4]:
# Import csv file to dataframe and label classes
def import_data(data_folder, file_name):
    df = pd.read_csv(data_folder + file_name + ".csv", 
                     delimiter='|', 
                     encoding='utf-8', 
                     engine='python')
    y = {1:-1, 2:-1, 3:0, 4:1, 5:1}
    df['class'] = df['score'].map(y)
    df['hospital'] = file_name
    return df

file_names = ['corpus_rama','corpus_siriraj','corpus_chula']

df_tmp = {}
for file_name in file_names:
    df_tmp[file_name]= import_data(data_folder, file_name)

In [5]:
# Pooling all comments of Ramathibodi, Siriraj, and Chula hospital
df = pd.concat(df_tmp.values(), ignore_index=True)
df.head()

Unnamed: 0,comment_th,comment_eng,score,class,hospital
0,เคยได้ไปลองใช้บริการคลีนิคนอกเวลาของรามาเมื่อต้นปีที่แล้ว ได้เข้ารักษาเลเซอร์บนในหน้า พยสบสลกับคุณหมอใจดีมากๆ เป็นกันเอง ดูแลอย่างทั่วถึง รอคิวไม่นานเลย แต่ต้องโทรไปจองล่วงหน้าก่อนน่ะค่ะ การรักศึกษาของคุณหมอมไ่เจ็บเลยค่ะ ไม่ถึง5นาทีการรักษา ผลลัพที่ออกมารอยใบบนหน้าหายหมดเลยค่ะ ค่ารักรักษาพยนบาลไม่แพงด้วย,"Used to try Ramadan part-time clinic earlier this year Got onto the top laser treatment in the face The doctor is very kind and friendly. Care thoroughly. Waiting for a long time But have to call to reserve in advance The love of the doctor&#39;s education hurts me. Less than 5 minutes treatment The result came out, the leaves on the face disappeared. Love fee is not expensive as well.",5,1,corpus_rama
1,ผมเคยไปทำเลสิกที่นี้ ดีมากครับตอนนี้ผ่ามาจะ2ปีแล้วยังไม่มีผลค้างเคียงอะไรและราคาก็ถูกดีด้วย ตอนผ่าอาจจะดูน่ากลัวไปซะหน่อย แต่พอถึงเวลาผ่าจริงๆแล้วไม่รู้สึกเจ็บเลยครับ เพราะมีการหยอดยาชาก่อนผ่า,"I used to go to LASIK here. Very good. Now, it has been 2 years since it has no effect. And the price is also good. The cut may look scary. But when it was really time to do it, it didn&#39;t hurt. Because there was a drop of anesthetic before dissection",4,1,corpus_rama
2,รักษามาหลายปีแล้วตั้งแต่อายุ14 ตอนนี้จะจบปริญญาตรีแล้ว ตั้งแต่นอนแอดมิดที่ รพ จนหายดีออกมาใช้ชีวิตได้เป็นปกติ โดยเฉพาะคลินิกพรีเมี่ยม ไม่ต้องรอนาน มีความสะดวกรวดเร็วทุกขั้นตอนตั้งแต่จ่ายเงินไปถึงจ่ายยา ค่ารักษาไม่ได้แพงเกินไปเมื่อเทียบเท่ากับเอกชนหลายๆที่ บริการดีมากๆ คุณหมอดูแลเอาใจใส่ดี อยากให้ทุกคนมารักษาที่นี่,"Maintained for many years since the age of 14 Will now have a bachelor&#39;s degree From sleeping in the midst of the arthropod until he recovered, he could live a normal life. In particular, premium clinics do not have to wait for a long time. There are convenient, fast, every step from paying to dispensing. The treatment fee is not too expensive when compared to many private services that are very good. Want everyone to come here",5,1,corpus_rama
3,ที่ตึกพระเทพชั้น 2 สถานที่เครื่องมือดูทันสมัย แต่ที่ห้องวัดความดัน มีคนที่วัด(คิดว่าไม่ใช่พยาบาลเพราะใส่ชุดสีม่วง) ชอบดุคนที่มาวัด พูดเสียงแข็งใส่คนที่มาวัดความดัน ทั้งๆที่พยาบาลอีกคนที่ใส่ชุดพยาบาลพูดเพราะ อยากให้ปรับปรุง มาหาหมอที่แผนกศัลยกรรม อาจารย์หมอตรวตแล้วบอกต้องผ่าตัด บอกแค่ว่าต้องผ่าตัดถึงจะหายตอนนั้นช็อคเรื่องผ่าตัดอยู่เลยไม่ได้ถามอะไรเลย แล้วหมอกก็ไปเลยไม่ได้บอกว่าจริงๆเป็นอะไร ระยะไหน (ริดสีดวง) แล้วก็นัดวันผ่า พยาบาลที่นัดคิวก็ไม่สนเวลาเราทวนสิ่งที่บอกมาว่าถูกต้องไหม บอกแค่ว่าตามนั้นแหละ สรุปแล้วเหมือนจะดีแต่รอดูช่วงผ่าตัดก่อน,"At the 2nd floor of Phra Thep building, the place looks modern. But at the pressure measuring room There are people at the temple (thinking that it&#39;s not a nurse because wearing a purple dress) likes to scold people who come to the temple. Speak stiffly to people who come to measure pressure. In spite of the other nurses who put on a nursing gown, said because Want to improve Come to the doctor at the surgery department Dr. Tawut, then told to have surgery. Just saying that he had to surgery to be healed at that time, he was shocked about the surgery. And the fog went, not really telling what was the distance (Roi-Laem) and then meeting The nurse at Nuek Kyu doesn&#39;t care when we repeat what he says is correct. Just saying that In conclusion, it seems to be good, but waiting to see the surgery before",3,0,corpus_rama
4,"Its a public hospital so service is bad (nurses are completely unfriendly and doctors have god complex plus horrifying bed side manner) but they will keep you alive (nothing to do with quality of life) with decent drug collection. Young doctors are generally very friendly but they are busy all the time due to the amount of patients (like canned tuna) so don't expect them to spend too much time diagnosing you. You have to come early in the morning to ensure you're in queue and the wait 1-2 (bad days) hours behind schedule to get to see the doctor for < 15 min or less (all together door to door 8-9hrs). Additionally you can call all the 8 numbers on the websites from normal clinic to Premium and wait 15 minutes on each number just for them to hang up on you or divert you elsewhere then hang up on you..... Worst are the guys doing the schedule because they are very consistent in demonstrating the inability to manage schedules (or not) because you will ""always"" have to wait 1-2 hours ""after"" your appointment schedule to see the doctor but they also cant tell you that so you cant go anywhere and have to hang around the front door. In the better days you will get to sit inside the doctor's room to wait because the doctor is no where to be found and nobody will know when he/she will come back. If you ask the nurse they will give you the ""i dont know and I couldnt care less"" look and turn away in annoyance. But yes, they have great ability (the doctors definitely know what they are doing) to use ur tax fund to keep you alive, two stars for that!! Updated review for ER: The best ER u can probably find in Bangkok as they are 24hrs fully staffed with specialty doctors (not just general doctors like other hospitals during off business hours). They are jammed packed but they are fully equipped and by that I mean they have CT scanner in the ER room - this is not something we find everywhere. Doctors know what they are doing and fully equipped with the tools to keep u alive. Even better update: Just decided to go for full blown annual health check at Rama (Feb 2016) and here are the pros and cons: Cons:As a foreigner who pays tax in Thailand ur sort of half screwed. They are not equipped to handle foreign language so if u cant google medical terms properly u might go home thinking everything is perfect when its not...A less life threatening case is that if u pay tax to Thailand ur entitled to Thai citizen price 30% less or so which is surprisingly logical BUT not without flaws...u have to bring the ""physical"" ""original"" tax receipt from revenue dept to show then which u DONT get if u file electronically....so....who does NOT file electronically? I dont know one....maybe my parents but they're 75.... Pros:Omg its a very pleasant surprise that this clinic operates with the speed, manner, and professionalism that you'd expect in that of top 5 high end private hospital. The ambiance, the friendliness, politeness (which usually does not happen in public hospital), and the facility is great. I am very impressed with the speed at which they operate so thats 8.30-12:00 easy. Just like u would get at private hospital. 4th floor Prathep bldg zone Q - they did an exceptional job of scanning our x ray and ultrasound result 2-3 times just to make sure its accurate and they call u of the pending result (in case there is a need for special radiologist to check the films) in less than 4 hours after doctor visit. Overall - hell yeah I will do my Health check here going forward for 1/3 the price i would pay otherwise...:","Its a public hospital so service is bad (nurses are completely unfriendly and doctors have god complex plus horrifying bed side manner) but they will keep you alive (nothing to do with quality of life) with decent drug collection. Young doctors are generally very friendly but they are busy all the time due to the amount of patients (like canned tuna) so don't expect them to spend too much time diagnosing you. You have to come early in the morning to ensure you're in queue and the wait 1-2 (bad days) hours behind schedule to get to see the doctor for < 15 min or less (all together door to door 8-9hrs). Additionally you can call all the 8 numbers on the websites from normal clinic to Premium and wait 15 minutes on each number just for them to hang up on you or divert you elsewhere then hang up on you..... Worst are the guys doing the schedule because they are very consistent in demonstrating the inability to manage schedules (or not) because you will ""always"" have to wait 1-2 hours ""after"" your appointment schedule to see the doctor but they also cant tell you that so you cant go anywhere and have to hang around the front door. In the better days you will get to sit inside the doctor's room to wait because the doctor is no where to be found and nobody will know when he/she will come back. If you ask the nurse they will give you the ""i dont know and I couldnt care less"" look and turn away in annoyance. But yes, they have great ability (the doctors definitely know what they are doing) to use ur tax fund to keep you alive, two stars for that!! Updated review for ER: The best ER u can probably find in Bangkok as they are 24hrs fully staffed with specialty doctors (not just general doctors like other hospitals during off business hours). They are jammed packed but they are fully equipped and by that I mean they have CT scanner in the ER room - this is not something we find everywhere. Doctors know what they are doing and fully equipped with the tools to keep u alive. Even better update: Just decided to go for full blown annual health check at Rama (Feb 2016) and here are the pros and cons: Cons:As a foreigner who pays tax in Thailand ur sort of half screwed. They are not equipped to handle foreign language so if u cant google medical terms properly u might go home thinking everything is perfect when its not...A less life threatening case is that if u pay tax to Thailand ur entitled to Thai citizen price 30% less or so which is surprisingly logical BUT not without flaws...u have to bring the ""physical"" ""original"" tax receipt from revenue dept to show then which u DONT get if u file electronically....so....who does NOT file electronically? I dont know one....maybe my parents but they're 75.... Pros:Omg its a very pleasant surprise that this clinic operates with the speed, manner, and professionalism that you'd expect in that of top 5 high end private hospital. The ambiance, the friendliness, politeness (which usually does not happen in public hospital), and the facility is great. I am very impressed with the speed at which they operate so thats 8.30-12:00 easy. Just like u would get at private hospital. 4th floor Prathep bldg zone Q - they did an exceptional job of scanning our x ray and ultrasound result 2-3 times just to make sure its accurate and they call u of the pending result (in case there is a need for special radiologist to check the films) in less than 4 hours after doctor visit. Overall - hell yeah I will do my Health check here going forward for 1/3 the price i would pay otherwise...:",4,1,corpus_rama


In [6]:
### According to the sampling display, '&#39;' is found.
### '&#39;' is one of HTML entities and it means aposthophe (')
### It should be replaced with "'"
df['comment_eng'] = df['comment_eng'].apply(lambda x: html.unescape(x))
df.head()

Unnamed: 0,comment_th,comment_eng,score,class,hospital
0,เคยได้ไปลองใช้บริการคลีนิคนอกเวลาของรามาเมื่อต้นปีที่แล้ว ได้เข้ารักษาเลเซอร์บนในหน้า พยสบสลกับคุณหมอใจดีมากๆ เป็นกันเอง ดูแลอย่างทั่วถึง รอคิวไม่นานเลย แต่ต้องโทรไปจองล่วงหน้าก่อนน่ะค่ะ การรักศึกษาของคุณหมอมไ่เจ็บเลยค่ะ ไม่ถึง5นาทีการรักษา ผลลัพที่ออกมารอยใบบนหน้าหายหมดเลยค่ะ ค่ารักรักษาพยนบาลไม่แพงด้วย,"Used to try Ramadan part-time clinic earlier this year Got onto the top laser treatment in the face The doctor is very kind and friendly. Care thoroughly. Waiting for a long time But have to call to reserve in advance The love of the doctor's education hurts me. Less than 5 minutes treatment The result came out, the leaves on the face disappeared. Love fee is not expensive as well.",5,1,corpus_rama
1,ผมเคยไปทำเลสิกที่นี้ ดีมากครับตอนนี้ผ่ามาจะ2ปีแล้วยังไม่มีผลค้างเคียงอะไรและราคาก็ถูกดีด้วย ตอนผ่าอาจจะดูน่ากลัวไปซะหน่อย แต่พอถึงเวลาผ่าจริงๆแล้วไม่รู้สึกเจ็บเลยครับ เพราะมีการหยอดยาชาก่อนผ่า,"I used to go to LASIK here. Very good. Now, it has been 2 years since it has no effect. And the price is also good. The cut may look scary. But when it was really time to do it, it didn't hurt. Because there was a drop of anesthetic before dissection",4,1,corpus_rama
2,รักษามาหลายปีแล้วตั้งแต่อายุ14 ตอนนี้จะจบปริญญาตรีแล้ว ตั้งแต่นอนแอดมิดที่ รพ จนหายดีออกมาใช้ชีวิตได้เป็นปกติ โดยเฉพาะคลินิกพรีเมี่ยม ไม่ต้องรอนาน มีความสะดวกรวดเร็วทุกขั้นตอนตั้งแต่จ่ายเงินไปถึงจ่ายยา ค่ารักษาไม่ได้แพงเกินไปเมื่อเทียบเท่ากับเอกชนหลายๆที่ บริการดีมากๆ คุณหมอดูแลเอาใจใส่ดี อยากให้ทุกคนมารักษาที่นี่,"Maintained for many years since the age of 14 Will now have a bachelor's degree From sleeping in the midst of the arthropod until he recovered, he could live a normal life. In particular, premium clinics do not have to wait for a long time. There are convenient, fast, every step from paying to dispensing. The treatment fee is not too expensive when compared to many private services that are very good. Want everyone to come here",5,1,corpus_rama
3,ที่ตึกพระเทพชั้น 2 สถานที่เครื่องมือดูทันสมัย แต่ที่ห้องวัดความดัน มีคนที่วัด(คิดว่าไม่ใช่พยาบาลเพราะใส่ชุดสีม่วง) ชอบดุคนที่มาวัด พูดเสียงแข็งใส่คนที่มาวัดความดัน ทั้งๆที่พยาบาลอีกคนที่ใส่ชุดพยาบาลพูดเพราะ อยากให้ปรับปรุง มาหาหมอที่แผนกศัลยกรรม อาจารย์หมอตรวตแล้วบอกต้องผ่าตัด บอกแค่ว่าต้องผ่าตัดถึงจะหายตอนนั้นช็อคเรื่องผ่าตัดอยู่เลยไม่ได้ถามอะไรเลย แล้วหมอกก็ไปเลยไม่ได้บอกว่าจริงๆเป็นอะไร ระยะไหน (ริดสีดวง) แล้วก็นัดวันผ่า พยาบาลที่นัดคิวก็ไม่สนเวลาเราทวนสิ่งที่บอกมาว่าถูกต้องไหม บอกแค่ว่าตามนั้นแหละ สรุปแล้วเหมือนจะดีแต่รอดูช่วงผ่าตัดก่อน,"At the 2nd floor of Phra Thep building, the place looks modern. But at the pressure measuring room There are people at the temple (thinking that it's not a nurse because wearing a purple dress) likes to scold people who come to the temple. Speak stiffly to people who come to measure pressure. In spite of the other nurses who put on a nursing gown, said because Want to improve Come to the doctor at the surgery department Dr. Tawut, then told to have surgery. Just saying that he had to surgery to be healed at that time, he was shocked about the surgery. And the fog went, not really telling what was the distance (Roi-Laem) and then meeting The nurse at Nuek Kyu doesn't care when we repeat what he says is correct. Just saying that In conclusion, it seems to be good, but waiting to see the surgery before",3,0,corpus_rama
4,"Its a public hospital so service is bad (nurses are completely unfriendly and doctors have god complex plus horrifying bed side manner) but they will keep you alive (nothing to do with quality of life) with decent drug collection. Young doctors are generally very friendly but they are busy all the time due to the amount of patients (like canned tuna) so don't expect them to spend too much time diagnosing you. You have to come early in the morning to ensure you're in queue and the wait 1-2 (bad days) hours behind schedule to get to see the doctor for < 15 min or less (all together door to door 8-9hrs). Additionally you can call all the 8 numbers on the websites from normal clinic to Premium and wait 15 minutes on each number just for them to hang up on you or divert you elsewhere then hang up on you..... Worst are the guys doing the schedule because they are very consistent in demonstrating the inability to manage schedules (or not) because you will ""always"" have to wait 1-2 hours ""after"" your appointment schedule to see the doctor but they also cant tell you that so you cant go anywhere and have to hang around the front door. In the better days you will get to sit inside the doctor's room to wait because the doctor is no where to be found and nobody will know when he/she will come back. If you ask the nurse they will give you the ""i dont know and I couldnt care less"" look and turn away in annoyance. But yes, they have great ability (the doctors definitely know what they are doing) to use ur tax fund to keep you alive, two stars for that!! Updated review for ER: The best ER u can probably find in Bangkok as they are 24hrs fully staffed with specialty doctors (not just general doctors like other hospitals during off business hours). They are jammed packed but they are fully equipped and by that I mean they have CT scanner in the ER room - this is not something we find everywhere. Doctors know what they are doing and fully equipped with the tools to keep u alive. Even better update: Just decided to go for full blown annual health check at Rama (Feb 2016) and here are the pros and cons: Cons:As a foreigner who pays tax in Thailand ur sort of half screwed. They are not equipped to handle foreign language so if u cant google medical terms properly u might go home thinking everything is perfect when its not...A less life threatening case is that if u pay tax to Thailand ur entitled to Thai citizen price 30% less or so which is surprisingly logical BUT not without flaws...u have to bring the ""physical"" ""original"" tax receipt from revenue dept to show then which u DONT get if u file electronically....so....who does NOT file electronically? I dont know one....maybe my parents but they're 75.... Pros:Omg its a very pleasant surprise that this clinic operates with the speed, manner, and professionalism that you'd expect in that of top 5 high end private hospital. The ambiance, the friendliness, politeness (which usually does not happen in public hospital), and the facility is great. I am very impressed with the speed at which they operate so thats 8.30-12:00 easy. Just like u would get at private hospital. 4th floor Prathep bldg zone Q - they did an exceptional job of scanning our x ray and ultrasound result 2-3 times just to make sure its accurate and they call u of the pending result (in case there is a need for special radiologist to check the films) in less than 4 hours after doctor visit. Overall - hell yeah I will do my Health check here going forward for 1/3 the price i would pay otherwise...:","Its a public hospital so service is bad (nurses are completely unfriendly and doctors have god complex plus horrifying bed side manner) but they will keep you alive (nothing to do with quality of life) with decent drug collection. Young doctors are generally very friendly but they are busy all the time due to the amount of patients (like canned tuna) so don't expect them to spend too much time diagnosing you. You have to come early in the morning to ensure you're in queue and the wait 1-2 (bad days) hours behind schedule to get to see the doctor for < 15 min or less (all together door to door 8-9hrs). Additionally you can call all the 8 numbers on the websites from normal clinic to Premium and wait 15 minutes on each number just for them to hang up on you or divert you elsewhere then hang up on you..... Worst are the guys doing the schedule because they are very consistent in demonstrating the inability to manage schedules (or not) because you will ""always"" have to wait 1-2 hours ""after"" your appointment schedule to see the doctor but they also cant tell you that so you cant go anywhere and have to hang around the front door. In the better days you will get to sit inside the doctor's room to wait because the doctor is no where to be found and nobody will know when he/she will come back. If you ask the nurse they will give you the ""i dont know and I couldnt care less"" look and turn away in annoyance. But yes, they have great ability (the doctors definitely know what they are doing) to use ur tax fund to keep you alive, two stars for that!! Updated review for ER: The best ER u can probably find in Bangkok as they are 24hrs fully staffed with specialty doctors (not just general doctors like other hospitals during off business hours). They are jammed packed but they are fully equipped and by that I mean they have CT scanner in the ER room - this is not something we find everywhere. Doctors know what they are doing and fully equipped with the tools to keep u alive. Even better update: Just decided to go for full blown annual health check at Rama (Feb 2016) and here are the pros and cons: Cons:As a foreigner who pays tax in Thailand ur sort of half screwed. They are not equipped to handle foreign language so if u cant google medical terms properly u might go home thinking everything is perfect when its not...A less life threatening case is that if u pay tax to Thailand ur entitled to Thai citizen price 30% less or so which is surprisingly logical BUT not without flaws...u have to bring the ""physical"" ""original"" tax receipt from revenue dept to show then which u DONT get if u file electronically....so....who does NOT file electronically? I dont know one....maybe my parents but they're 75.... Pros:Omg its a very pleasant surprise that this clinic operates with the speed, manner, and professionalism that you'd expect in that of top 5 high end private hospital. The ambiance, the friendliness, politeness (which usually does not happen in public hospital), and the facility is great. I am very impressed with the speed at which they operate so thats 8.30-12:00 easy. Just like u would get at private hospital. 4th floor Prathep bldg zone Q - they did an exceptional job of scanning our x ray and ultrasound result 2-3 times just to make sure its accurate and they call u of the pending result (in case there is a need for special radiologist to check the films) in less than 4 hours after doctor visit. Overall - hell yeah I will do my Health check here going forward for 1/3 the price i would pay otherwise...:",4,1,corpus_rama


In [7]:
# Distribution over classes of Positive(1), Negative(-1), and Neutral(0)
print(df['class'].value_counts())

 1    475
-1    118
 0    56 
Name: class, dtype: int64


## __Data Splitting: Training, Validation, Test__

### Split data to 60:20:20 for training, validation, and test data set, respectively

> Use only POSITIVE(1) and NEGATIVE(-1) classes because these classes are more meaningful compared to the NEUTRAL(0) group

In [8]:
def split_trn_val_tst(df, random_state):
    # Separate dataframe into 2 classes as 1 and -1 to stratify sampling by class
    df_pos = df[(df['class']==1)]
    df_neg = df[(df['class']==-1)]
    
    # Splitting into 3 dataset: train, validate, and test
    # Straitify sampling by class
    pos_train, pos_validate, pos_test = np.split(df_pos.sample(frac=1, random_state = random_state), 
                                                 [int(0.6*len(df_pos)), int(0.8*len(df_pos))])
    neg_train, neg_validate, neg_test = np.split(df_neg.sample(frac=1, random_state = random_state), 
                                                 [int(0.6*len(df_neg)), int(0.8*len(df_neg))])
    
    # Append positive and negaive classes for each dataset
    df_train = pd.concat([pos_train, neg_train])
    df_val = pd.concat([pos_validate, neg_validate])
    df_test = pd.concat([pos_test, neg_test])
    
    # Create X_train, X_val, X_test, y_train, y_val, y_test for using in analysis
    X_train = df_train.loc[:,'comment_eng']
    X_val = df_val.loc[:,'comment_eng']
    X_test = df_test.loc[:,'comment_eng']
    y_train = df_train.iloc[:,-2]
    y_val = df_val.iloc[:,-2]
    y_test = df_test.iloc[:,-2]
    
    print('y values: {}'.format(list(y_train.unique())))
    print('Size of training dataset: {}, stratified to {}'.format(X_train.shape[0],Counter(y_train)))
    print('Size of validating dataset: {}, stratified to {}'.format(X_val.shape[0], Counter(y_val)))
    print('Size of test dataset: {}, stratified to {}'.format(X_test.shape[0], Counter(y_test)))
    return df_train, df_val, df_test, X_train, X_val, X_test, y_train, y_val, y_test

df_train, df_val, df_test, comment_train, comment_val, comment_test, y_train, y_val, y_test = split_trn_val_tst(df, random_state=30)

y values: [1, -1]
Size of training dataset: 355, stratified to Counter({1: 285, -1: 70})
Size of validating dataset: 119, stratified to Counter({1: 95, -1: 24})
Size of test dataset: 119, stratified to Counter({1: 95, -1: 24})


## __Descriptive analysis for training dataset__

In [9]:
# Function for counting number of numeric in a comment
def count_numeric(comment):
    n_numeric = len(re.findall(r'[0-9]', comment))
    return n_numeric

# Function for counting number of characters in a comment
def count_character(comment):
    n_char = len(re.findall(r'[a-zA-Z]', comment))
    return n_char

# Function for counting number of words in a comment which are splitted by space
def count_word(comment):
    n_word = len(comment.split())
    return n_word

# Function for counting number of uppercase words in a comment which are splitted by space
def count_uppercase(comment):
    n_uppercase = sum([int(1) if x.isupper()==True else int(0) for x in str(comment).split()])
    return n_uppercase

# Function for counting number of stop words in a comment which are splitted by space
# The stop words list comes from the combination of 2 libraries: stop_words and nltk.corpus
def count_stopword(comment):
    l1 = set(get_stop_words('en'))
    l2 = set(stopwords.words('english'))
    stop_list = set(l1.union(l2))
    n_stopword = sum([int(1) if x in stop_list else int(0) for x in str(comment).split()])
    return n_stopword, stop_list

# Function for counting number of negation in dataset
def negation(comment):
    neg_lists = []
    negations = []
    for j in list(comment):
        lower = str(j).lower().split()
        mark_neg = mark_negation(lower)
        di = dict(zip(mark_neg,lower))
        neg_list = [v for k,v in di.items() if '_NEG' in k]
        neg_lists.extend(neg_list)
        negation = [str(mark_neg).count('_NEG')]
        negations.extend(negation)
    count_neg_list = dict(zip(comment.index,negations))
    return neg_lists, count_neg_list, negations

# Function for counting number of puntuation in a comment
def count_punctuation(comment):
    punc_list = set(string.punctuation)
    n_punc = sum([int(1) if x in punc_list else int(0) for x in str(comment).split()])
    return n_punc, punc_list

# Function for descriptive analysis:
#     summation of number of numeric, characters, words, 
#     uppercase words, stop words, punctuation, and negation
def descriptive_analysis(comment):
    n_numerics = 0
    n_chars = 0
    n_words = 0
    n_uppercases = 0
    n_stopwords = 0
    _, _, negations = negation(comment)
    n_puncs = 0
    n_negations = sum(negations)
    for i in list(comment):
        n_numerics = n_numerics + count_numeric(i)
        n_chars = n_chars + count_character(i)
        n_words = n_words + count_word(i)
        n_uppercases = n_uppercases + count_uppercase(i)
        n_stopword, _ = count_stopword(i)
        n_stopwords = n_stopwords + n_stopword
        n_punc, _ = count_punctuation(i)
        n_puncs = n_puncs + n_punc
    print('Descriptive analysis of training dataset')
    print('Size = {:,} comments'.format(len(comment)))
    print('Numer of numeric = {:,}'.format(n_numerics))
    print('Numer of characters = {:,}'.format(n_chars))
    print('Numer of words = {:,}'.format(n_words))
    print('Numer of uppercases = {:,}'.format(n_uppercases))
    print('Numer of stopwords = {:,}'.format(n_stopwords))
    print('Numer of negations = {:,}'.format(n_negations))  
    print('Numer of punctuations = {:,}'.format(n_puncs))
    
# Descriptive analysis of training dataset
descriptive_analysis(comment = comment_train)

Descriptive analysis of training dataset
Size = 355 comments
Numer of numeric = 419
Numer of characters = 68,638
Numer of words = 15,721
Numer of uppercases = 202
Numer of stopwords = 6,083
Numer of negations = 6,775
Numer of punctuations = 23


## __Data Preprocessing__

### 4.1 Removing stopwords and punctuations

In [10]:
# Convert to lower case 
def lower(comment):
    text = comment.str.lower()
    return text

# Remove stopwords
def del_stopword(text):
    _, stop_list = count_stopword(text)
    stopword_list = list(stop_list)
    text_del_stopword = []
    for sentence in text:
        resultwords = [word for word in sentence.split() if word not in stopword_list]
        result = ' '.join(resultwords)
        text_del_stopword.append(result)
    return text_del_stopword

# Remove punctuations
def del_punc(text, text_del_stopword):
    _, punc_list = count_punctuation(text)
    punc_list = list(punc_list)
    text_del_punc = []
    for sentence in text_del_stopword:
        text_del = []
        for char in sentence:
            resultwords = [word for word in char if word not in punc_list]
            text_del.extend(resultwords)
        text_del_punc.extend([''.join(text_del)])
    return text_del_punc

# Expand contracted words
def expand_contract(text_del_punc):
    CONTRACTION_MAP = contractions.contractions_dict
    expanded = list(pd.Series(text_del_punc).replace(to_replace = CONTRACTION_MAP, regex=True))
    return expanded

# Cleaning text
def clean_text(comment):
    text = lower(comment)
    text_del_stopword = del_stopword(text)
    text_del_punc = del_punc(text, text_del_stopword)
    cleaned_text = expand_contract(text_del_punc)
    return cleaned_text

cleaned_train = clean_text(comment_train)
cleaned_val = clean_text(comment_val)
cleaned_test = clean_text(comment_test)

In [11]:
# Example comment_train vs cleaned_train
print(comment_train.iloc[5])
print(cleaned_train[5])

Modern medical equipment, doctors, nurses perform treatment with intent. Excellent service
modern medical equipment doctors nurses perform treatment intent excellent service


### 4.2 Word tokenization

In [12]:
# Set num_words=5000 to limit the maximum of tokenized words
tokenizer = Tokenizer(num_words=5000)

# Fit on text of training data set
tokenizer.fit_on_texts(cleaned_train)

# Tokenize all training, validation, and test data set
token_train = tokenizer.texts_to_sequences(cleaned_train)
token_val = tokenizer.texts_to_sequences(cleaned_val)
token_test = tokenizer.texts_to_sequences(cleaned_test)

# # Get index of word
word_index = tokenizer.word_index

In [13]:
# # Display one of token_train
print(cleaned_train[5])
print(token_train[5])

modern medical equipment doctors nurses perform treatment intent excellent service
[121, 33, 302, 13, 10, 460, 9, 614, 122, 4]


In [14]:
# Display size of vocab
vocab_size = len(tokenizer.word_index) + 1
print('Number of vocabs: {:,}'.format(vocab_size))

Number of vocabs: 2,127


### 4.3 Padding sequence

In [15]:
# Limit number of word sequences to avoid problem of different length of words by prepending or appending zeros
maxlen = len(max(token_train, key=len))

X_train = pad_sequences(token_train, padding='post', maxlen=maxlen)
X_val = pad_sequences(token_val, padding='post', maxlen=maxlen)
X_test = pad_sequences(token_test, padding='post', maxlen=maxlen)

print(X_train[5,])
print('Maximum sequence of tokenized word: {}'.format(maxlen))

[121  33 302  13  10 460   9 614 122   4   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   

In [16]:
# Data preparation for Sentiment Analysis
df_sentiment = df
X_comment = df_sentiment.loc[:,'comment_eng']
y = df_sentiment.iloc[:,-2]
cleaned = clean_text(X_comment)
token = tokenizer.texts_to_sequences(cleaned)
X = pad_sequences(token, padding='post', maxlen=maxlen)

In [17]:
# Save numpy array for using in data modeling section
file_name_dict = {'df_train': df_train, 'df_val':df_val, 'df_test':df_test, # Training, validation, and test dataframe without cleaning data
                  'cleaned_train': cleaned_train, 'cleaned_val': cleaned_val, 'cleaned_test': cleaned_test, # Cleaned data by removing stopwords and punctuations
                  'token_train': token_train, 'token_val': token_val, 'token_test': token_test, # Tokenized word
                  'X_train': X_train, 'X_val': X_val, 'X_test': X_test, # Prepared data for use as training, validation, and test dataset
                  'y_train': y_train, 'y_val': y_val, 'y_test': y_test,
                  'word_index': word_index, # Index of each word after tokenization
                  'vocab_size': vocab_size, # Number of vocab
                  'df_sentiment':df_sentiment,'X_comment':X_comment,'cleaned':cleaned, 'token':token, 'X':X, 'y':y # Prepared data for sentiment analysis
                 }

utils.save_np(data_folder, file_name_dict)