# Data pre-processing

In [1]:
import pandas as pd
import re
import nltk

## Import .csv file

In [2]:
# Read the raw reviews CSV; the file is decoded as latin1 instead of pandas'
# default utf-8.
dataSetRaw = pd.read_csv(
    filepath_or_buffer='../Kaggle-dataset/raw/20191226-reviews.csv',
    encoding="latin1",
)
# Bare last expression so the notebook renders the loaded frame.
dataSetRaw

Unnamed: 0,asin,name,rating,date,verified,title,body,helpfulVotes
0,B0000SX2UC,Janet,3,"October 11, 2005",False,"Def not best, but not worst",I had the Samsung A600 for awhile which is abs...,1.0
1,B0000SX2UC,Luke Wyatt,1,"January 7, 2004",False,Text Messaging Doesn't Work,Due to a software issue between Nokia and Spri...,17.0
2,B0000SX2UC,Brooke,5,"December 30, 2003",False,Love This Phone,"This is a great, reliable phone. I also purcha...",5.0
3,B0000SX2UC,amy m. teague,3,"March 18, 2004",False,"Love the Phone, BUT...!","I love the phone and all, because I really did...",1.0
4,B0000SX2UC,tristazbimmer,4,"August 28, 2005",False,"Great phone service and options, lousy case!",The phone has been great for every purpose it ...,1.0
...,...,...,...,...,...,...,...,...
67981,B081H6STQQ,jande,5,"August 16, 2019",False,"Awesome Phone, but finger scanner is a big mis...",I love the camera on this phone. The screen is...,1.0
67982,B081H6STQQ,2cool4u,5,"September 14, 2019",False,Simply Amazing!,I've been an Xperia user for several years and...,1.0
67983,B081H6STQQ,simon,5,"July 14, 2019",False,"great phon3, but many bugs need to fix. still ...",buy one more for my cousin,
67984,B081TJFVCJ,Tobiasz Jedrysiak,5,"December 24, 2019",True,Phone is like new,Product looks and works like new. Very much re...,


## Dataset cleaning

### Drop irrelevant features

In [3]:
# Columns that are removed before the text processing below.
unrelevant_features = ["asin","name","date","verified","helpfulVotes"]
# Rebind instead of mutating in place, and tolerate columns that are already
# gone, so re-running this cell does not raise KeyError.
dataSetRaw = dataSetRaw.drop(columns=unrelevant_features, errors="ignore")
# Bare last expression so the notebook renders the reduced frame.
dataSetRaw

Unnamed: 0,rating,title,body
0,3,"Def not best, but not worst",I had the Samsung A600 for awhile which is abs...
1,1,Text Messaging Doesn't Work,Due to a software issue between Nokia and Spri...
2,5,Love This Phone,"This is a great, reliable phone. I also purcha..."
3,3,"Love the Phone, BUT...!","I love the phone and all, because I really did..."
4,4,"Great phone service and options, lousy case!",The phone has been great for every purpose it ...
...,...,...,...
67981,5,"Awesome Phone, but finger scanner is a big mis...",I love the camera on this phone. The screen is...
67982,5,Simply Amazing!,I've been an Xperia user for several years and...
67983,5,"great phon3, but many bugs need to fix. still ...",buy one more for my cousin
67984,5,Phone is like new,Product looks and works like new. Very much re...


### Process text

In [4]:
# Fetch the NLTK resources used by the text cleaning below: the WordNet data
# for the lemmatizer, the punkt_tab tokenizer data and the stop-word lists.
for nltkResource in ('wordnet', 'punkt_tab', 'stopwords'):
    nltk.download(nltkResource)

# English stop words, kept as the list returned by NLTK.
stopWords = nltk.corpus.stopwords.words("english")

def cleanText(text):
    """Normalize a piece of text into a space-separated string of lemmas.

    Steps, in order: replace every character that is not an ASCII letter or
    digit with a space, lower-case, tokenize with NLTK, drop English stop
    words, lemmatize the remaining tokens with WordNet, and drop any lemma
    that is itself a stop word.

    Args:
        text: Value to clean. Non-string values are converted with ``str``.
            Missing values (``None``, ``NaN`` and anything else ``pd.isna``
            reports as missing) are treated as empty text.

    Returns:
        str: The kept lemmas joined by single spaces; ``""`` for missing input.
    """
    # A missing cell would otherwise be stringified into the literal token
    # "nan" / "none" and end up in the cleaned text.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ""

    #Only retain number and letters
    aux = re.sub("[^a-zA-Z0-9]", " ", str(text))
    #Change all to lower case
    aux = aux.lower()

    # Tokenizing and lemmatizing
        # https://www.nltk.org/api/nltk.tokenize.html
        # https://www.nltk.org/api/nltk.stem.WordNetLemmatizer.html?highlight=wordnet
    tokens = nltk.word_tokenize(aux)

    # Build the lemmatizer once per call instead of once per token.
    lemmatizer = nltk.stem.WordNetLemmatizer()

    # Remove stop words BEFORE lemmatizing: the noun lemmatizer mangles some of
    # them (e.g. "was" -> "wa", "has" -> "ha"), and the mangled forms would no
    # longer match the stop-word list.
    lemmas = (
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in stopWords
    )
    # Keep the original post-lemmatization filter too, so every token that was
    # removed before this change is still removed.
    kept = [lemma for lemma in lemmas if lemma not in stopWords]

    return " ".join(kept)

# Quick sanity check of cleanText on a string that mixes case and punctuation.
print("Test String: ", cleanText(text="aldjodjo!ASASSAokok!"))

[nltk_data] Downloading package wordnet to /Users/luism/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /Users/luism/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/luism/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Test String:  aldjodjo asassaokok


In [5]:
# Clean title and body column-wise and join them with a single space. A
# column-wise map yields the same strings as a row-wise apply(axis=1) without
# building a Series object for every row.
dataSetRaw['textFull'] = (
    dataSetRaw['title'].map(cleanText) + " " + dataSetRaw['body'].map(cleanText)
)
print(dataSetRaw)
# The raw text columns are not needed once textFull exists.
# NOTE: this consumes 'title' and 'body', so re-running this cell requires
# re-running the notebook from the load cell.
dataSetRaw = dataSetRaw.drop(columns=['title', 'body'])


       rating                                              title  \
0           3                        Def not best, but not worst   
1           1                        Text Messaging Doesn't Work   
2           5                                    Love This Phone   
3           3                            Love the Phone, BUT...!   
4           4       Great phone service and options, lousy case!   
...       ...                                                ...   
67981       5  Awesome Phone, but finger scanner is a big mis...   
67982       5                                    Simply Amazing!   
67983       5  great phon3, but many bugs need to fix. still ...   
67984       5                                  Phone is like new   
67985       5                    Outstanding phone for the price   

                                                    body  \
0      I had the Samsung A600 for awhile which is abs...   
1      Due to a software issue between Nokia and Spri...   
2  

## Split and save

In [13]:
from sklearn.model_selection import train_test_split

# Fixed seed so the saved train/validation/test files are reproducible across
# kernel restarts; without it every run produces a different split.
SPLIT_SEED = 42

# First split: 80% train / 20% holdout.
trainDataset, holdoutDataset = train_test_split(
    dataSetRaw, test_size=0.2, train_size=0.8, random_state=SPLIT_SEED
)
# Second split of the holdout: 60% validation / 40% test, i.e. 12% and 8% of
# the full data set.
valDataset, testDataset = train_test_split(
    holdoutDataset, test_size=0.40, train_size=0.60, random_state=SPLIT_SEED
)

In [15]:
# Write each split to its own CSV file (the DataFrame index is written as the
# first column, which is pandas' default).
for splitFrame, csvPath in (
    (trainDataset, "../Kaggle-dataset/pre-processed/trainDataset.csv"),
    (valDataset, "../Kaggle-dataset/pre-processed/valDataset.csv"),
    (testDataset, "../Kaggle-dataset/pre-processed/testDataset.csv"),
):
    splitFrame.to_csv(csvPath)