In [9]:
# Importing Required Libraries
import pandas as pd
import re
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import ToktokTokenizer
from nltk.stem import WordNetLemmatizer

In [2]:
# Read the review CSV (path is relative to the notebook's working directory).
review_data = pd.read_csv(filepath_or_buffer="dataset/K8 Reviews v0.2.csv")

In [3]:
review_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14675 entries, 0 to 14674
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   sentiment  14675 non-null  int64 
 1   review     14675 non-null  object
dtypes: int64(1), object(1)
memory usage: 229.4+ KB


In [4]:
review_data.head(5)

Unnamed: 0,sentiment,review
0,1,Good but need updates and improvements
1,0,"Worst mobile i have bought ever, Battery is dr..."
2,1,when I will get my 10% cash back.... its alrea...
3,1,Good
4,0,The worst phone everThey have changed the last...


Preprocessing steps:
1. Remove punctuation
2. Remove special characters and numbers
3. Remove emojis
4. Remove reviews with only one or two words
5. Remove non-English reviews


In [22]:
# Handler Functions for Text Preprocessing

# Fetch the NLTK corpora up front so this cell also runs on a fresh
# environment: stopwords.words(...) raises LookupError when the "stopwords"
# corpus is missing, and the lemmatizer reads "wordnet" on first use.
nltk.download("wordnet")
nltk.download("stopwords")

# Shared tokenizer / lemmatizer instances used by the helper functions.
token = ToktokTokenizer()
lemma = WordNetLemmatizer()

# Characters removed by clean_punct: ASCII punctuation plus the digits 0-9.
# '-' and '_' are not in this string, so they are left in place.
punct = '!"#$%&\'()*+,./:;<=>?@[\\]^`{|}~0123456789'

# A set gives constant-time membership checks during stop-word filtering.
stop_words = set(stopwords.words("english"))

def clean_text(text):
    """Lower-case a review and normalise contractions, markup and whitespace.

    The text is lower-cased, then a fixed sequence of regex substitutions is
    applied in order, and finally runs of whitespace are collapsed to a single
    space and surrounding spaces are stripped.

    Args:
        text: The raw review as a ``str``.

    Returns:
        The cleaned, lower-cased string.
    """
    # Order matters: specific contractions must come before the generic
    # rules that would otherwise consume part of them.
    substitutions = (
        (r"what's", "what is "),
        # Must run before the generic "'s" rule, which would otherwise turn
        # "'scuse" into " cuse" and make this rule unreachable.
        (r"\'scuse", " excuse "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        # Must run before the generic "n't" rule, which would otherwise
        # produce "wo not".
        (r"won't", "will not "),
        (r"can't", "can not "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
        # Apostrophe immediately followed by a newline / non-breaking space.
        (r"\'\n", " "),
        (r"\r", " "),
        # Leftover HTML table markup.
        (r"<td>", " "),
        (r"</td>", " "),
        (r"<tr>", " "),
        (r"</tr>", " "),
        (r"\'\xa0", " "),
        # Raw string: a plain '\s+' literal is an invalid escape sequence.
        (r"\s+", " "),
    )

    text = text.lower()
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip(' ')

# Compiled once instead of on every call. Each range is a dedicated
# emoji/symbol block; a single wide range from U+24C2 to U+1F251 would also
# cover CJK, Hangul and kana and delete ordinary non-Latin text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001F1E0-\U0001F1FF"  # flags
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "\U00002600-\U000027BF"  # miscellaneous symbols + dingbats
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # variation selector-16 (emoji presentation)
    "]+"
)


def remove_emoji(text):
    """Return ``text`` with emoji and pictographic symbols removed.

    Only characters in the emoji/symbol ranges listed in ``_EMOJI_PATTERN``
    are deleted; all other characters, including non-Latin scripts, are left
    untouched.

    Args:
        text: Input string.

    Returns:
        The string with every matched run of emoji characters deleted.
    """
    return _EMOJI_PATTERN.sub('', text)

def strip_list_noempty(mylist):
    """Strip every item that supports ``.strip()`` and drop empty strings.

    Items without a ``strip`` attribute are passed through unchanged.

    Args:
        mylist: Iterable of items (typically strings).

    Returns:
        A new list with stripped items, excluding any that equal ``''``.
    """
    kept = []
    for entry in mylist:
        if hasattr(entry, 'strip'):
            entry = entry.strip()
        if entry != '':
            kept.append(entry)
    return kept

def clean_punct(text):
    """Remove every character listed in ``punct`` from each token of ``text``.

    The text is tokenized with the module-level ``token`` tokenizer, the
    characters in ``punct`` are deleted from each token, tokens that end up
    empty are discarded, and the rest are re-joined with single spaces.

    Args:
        text: Input string.

    Returns:
        The space-joined string of cleaned, non-empty tokens.
    """
    # Character class built from the module-level `punct` string; re.escape
    # keeps characters such as ']' and '\\' literal inside the class.
    strip_chars = re.compile('[%s]' % re.escape(punct))
    cleaned_tokens = [strip_chars.sub('', w) for w in token.tokenize(text)]
    return ' '.join(map(str, strip_list_noempty(cleaned_tokens)))

def lemitizeWords(text):
    """Lemmatize each token of ``text`` as a verb and re-join with spaces.

    Uses the module-level ``token`` tokenizer and ``lemma`` lemmatizer; every
    token is lemmatized with ``pos="v"``.

    Args:
        text: Input string.

    Returns:
        The space-joined string of lemmatized tokens.
    """
    return ' '.join(
        str(lemma.lemmatize(tok, pos="v")) for tok in token.tokenize(text)
    )

def stopWordsRemove(text):
    """Drop tokens of ``text`` that appear in the module-level ``stop_words``.

    Args:
        text: Input string.

    Returns:
        The space-joined string of the remaining tokens.
    """
    kept = []
    for tok in token.tokenize(text):
        if tok in stop_words:
            continue
        kept.append(str(tok))
    return ' '.join(kept)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\muthu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
# Lower-case the reviews and normalise contractions / whitespace.
review_data.loc[:, 'review'] = review_data['review'].apply(clean_text)

In [13]:
review_data.head()

Unnamed: 0,sentiment,review
0,1,good but need updates and improvements
1,0,"worst mobile i have bought ever, battery is dr..."
2,1,when i will get my 10% cash back.... its alrea...
3,1,good
4,0,the worst phone everthey have changed the last...


In [14]:
# Strip emoji characters from every review.
review_data.loc[:, 'review'] = review_data['review'].apply(remove_emoji)

In [16]:
review_data.head(52)

Unnamed: 0,sentiment,review
0,1,good but need updates and improvements
1,0,"worst mobile i have bought ever, battery is dr..."
2,1,when i will get my 10% cash back.... its alrea...
3,1,good
4,0,the worst phone everthey have changed the last...
5,0,only i am telling do not buyi am totally disap...
6,1,"phone is awesome. but while charging, it heats..."
7,0,the battery level has worn down
8,0,it over hitting problems...and phone hanging p...
9,0,a lot of glitches dont buy this thing better g...


In [17]:
# Remove punctuation and digits from every review.
review_data.loc[:, 'review'] = review_data['review'].apply(clean_punct)

In [18]:
review_data.head()

Unnamed: 0,sentiment,review
0,1,good but need updates and improvements
1,0,worst mobile i have bought ever battery is dra...
2,1,when i will get my cash back its already january
3,1,good
4,0,the worst phone everthey have changed the last...


In [19]:
# Lemmatize each token of every review (verb lemmas).
review_data.loc[:, 'review'] = review_data['review'].apply(lemitizeWords)

In [20]:
review_data.head()

Unnamed: 0,sentiment,review
0,1,good but need update and improvements
1,0,worst mobile i have buy ever battery be drain ...
2,1,when i will get my cash back its already january
3,1,good
4,0,the worst phone everthey have change the last ...


In [23]:
# Drop English stop words from every review.
review_data.loc[:, 'review'] = review_data['review'].apply(stopWordsRemove)

In [24]:
review_data.head()

Unnamed: 0,sentiment,review
0,1,good need update improvements
1,0,worst mobile buy ever battery drain like hell ...
2,1,get cash back already january
3,1,good
4,0,worst phone everthey change last phone problem...
