In [1]:
import pandas as pd
import numpy as np
import nltk

In [2]:
# data preprocessing functions
def word_lemmatisation(tagged_token):
    """Lemmatise a single POS-tagged token with the WordNet lemmatizer.

    Parameters
    ----------
    tagged_token : sequence
        A ``(token, tag)`` pair. Only the first character of ``tag`` is
        inspected.

    Returns
    -------
    The lemma of ``token`` when the tag starts with ``J``, ``R``, ``N`` or
    ``V`` (looked up as WordNet adjective, adverb, noun and verb
    respectively); otherwise ``token`` unchanged.
    """
    word, pos_tag = tagged_token[0], tagged_token[1]
    lemmatizer = nltk.stem.WordNetLemmatizer()
    wordnet = nltk.corpus.wordnet
    # First letter of the tag -> WordNet part-of-speech constant.
    wordnet_pos_by_initial = {
        'J': wordnet.ADJ,
        'R': wordnet.ADV,
        'N': wordnet.NOUN,
        'V': wordnet.VERB,
    }
    initial = pos_tag[0]
    if initial not in wordnet_pos_by_initial:
        # No WordNet equivalent for this tag: leave the token as it is.
        return word
    return lemmatizer.lemmatize(word, wordnet_pos_by_initial[initial])
def pre_processing(line):
    """Normalise one raw review into a space-separated string of lemmas.

    The text is lowercased, split into runs of ASCII letters (which discards
    digits and punctuation), POS-tagged, lemmatised token by token with
    ``word_lemmatisation`` and finally stripped of English stop words.

    Parameters
    ----------
    line : str
        Raw review text. Any non-string value (for example ``None`` or the
        float NaN pandas uses for a missing field) is treated as an empty
        review instead of raising ``AttributeError``.

    Returns
    -------
    str or None
        The cleaned text, or ``None`` when the input is not a string or when
        no token survives the filtering.
    """
    if not isinstance(line, str):
        return None
    # Keep only alphabetic runs: this drops numbers and punctuation at once.
    tokenizer = nltk.RegexpTokenizer(r"[a-zA-Z]+")
    tokens = tokenizer.tokenize(line.lower())
    if not tokens:
        # Nothing to tag or lemmatise.
        return None
    tagged_tokens = nltk.pos_tag(tokens)  # part-of-speech tagging
    lemmas = [word_lemmatisation(tagged_token) for tagged_token in tagged_tokens]
    # A frozenset gives constant-time membership tests instead of scanning a list.
    stop_words = frozenset(nltk.corpus.stopwords.words("english"))
    # Stop words are removed after lemmatisation, so the lemma is what is matched.
    kept = [lemma for lemma in lemmas if lemma not in stop_words]
    return ' '.join(kept) or None

In [3]:
# Dataset base name: the input is read from ./Data/<data_name>.json and the
# processed output is written to ./Data/Processed_<data_name>.csv.
data_name = "dataCell"

In [4]:
# Load the raw JSON and keep only the review text and the `overall` score,
# renamed to the column names used in the following cells.
df = pd.read_json("./Data/"+data_name+".json")
selected_df = df[["reviewText", "overall"]].rename(
    columns={"reviewText": "reviews", "overall": "sentiments"}
)

In [5]:
# Binarise the score: above 3 becomes label 1, below 3 becomes label 0.
# Rows with a score of exactly 3 match neither mask and are left out.
is_positive = selected_df["sentiments"] > 3
is_negative = selected_df["sentiments"] < 3
pos_df = selected_df[is_positive].assign(sentiments=1)
neg_df = selected_df[is_negative].assign(sentiments=0)

In [6]:
# Stack both classes, shuffle every row with a fixed seed so the order is
# reproducible, then renumber the index from zero.
mixed_df = (
    pd.concat([pos_df, neg_df])
    .sample(frac=1, random_state=666)
    .reset_index(drop=True)
)

In [7]:
# Clean every review; pre_processing returns None when nothing is left,
# and those rows are removed by dropna().
mixed_df = mixed_df.assign(reviews=mixed_df["reviews"].map(pre_processing)).dropna()
mixed_df = mixed_df.assign(reviews=mixed_df["reviews"].astype(str))
# Save the pre-processed data as CSV, without the index column.
mixed_df.to_csv("./Data/Processed_"+data_name+".csv", index=False)