# Data Preprocessing

The data is stored as newline-delimited JSON (one JSON object per line). We use pandas to read the dataset.

In [4]:
import pandas as pd
import matplotlib.pyplot as plt

# Location of the review dump that is loaded into the working dataframe.
DATASET_PATH = "./Software_5.json"

# lines=True: the file holds one JSON object per line rather than one document.
dataframe = pd.read_json(DATASET_PATH, lines=True)


The feature `vote` tells us whether the review was considered helpful or not. To classify our data better, we remove all reviews that have no vote.

In [5]:
# Keep only the reviews that have a `vote` value.
# The explicit .copy() makes the filtered frame independent of the original,
# so columns added to it later (e.g. "sentiment") do not trigger pandas'
# SettingWithCopyWarning.
has_vote = dataframe['vote'].notna()
dataframe = dataframe[has_vote].copy()


For the feature `reviewText`, we want to apply some preprocessing steps to the text. Notably:

- Lemmatization  
- Expand contractions   
- Lowercase all words

In [36]:
from nltk.corpus import stopwords
import nltk
import string
import contractions


def remove_special_symbol(text: str) -> str:
    """Return ``text`` with every ``string.punctuation`` character removed.

    All other characters, including whitespace, are kept as they are.
    """
    # The third argument of maketrans lists the characters to delete.
    deletion_table = str.maketrans("", "", string.punctuation)
    return text.translate(deletion_table)


def lower_text(text: str) -> str:
    """Return a lowercase version of ``text``."""
    lowered = text.lower()
    return lowered


# Lazily-built set of English stop words, shared by every call to
# remove_stop_words so the corpus is loaded only once.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them once."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        try:
            words = stopwords.words('english')
        except LookupError:
            # The corpus is not installed yet: fetch it once and retry.
            nltk.download('stopwords', quiet=True)
            words = stopwords.words('english')
        _ENGLISH_STOP_WORDS = frozenset(words)
    return _ENGLISH_STOP_WORDS


def remove_stop_words(text: str) -> str:
    """Return ``text`` without its English stop words.

    The text is split on whitespace, every token found in NLTK's English
    stop-word list is dropped, and the remaining tokens are joined with
    single spaces. Matching is exact, so callers are expected to lowercase
    the text beforehand.

    The stop-word list is loaded (and downloaded, if missing) only on the
    first call; this function is applied once per review, so repeating the
    download check and rebuilding the set on every call would be wasteful.
    """
    stop_words = _english_stop_words()
    return ' '.join(word for word in text.split() if word not in stop_words)


def expand_contraction(text: str) -> str:
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded = contractions.fix(text)
    return expanded


def stemming(text: str) -> str:
    """Stem every whitespace-separated word of ``text`` (English Snowball).

    ``SnowballStemmer.stem`` works on a single word. Handing it a whole
    review would treat the text as one long word and could alter at most its
    ending, so the text is split into tokens, each token is stemmed on its
    own, and the stems are joined back together with single spaces.
    """
    stemmer = nltk.stem.SnowballStemmer("english")
    return " ".join(stemmer.stem(word) for word in text.split())


# Clean the review text and store the result back on the dataframe; without
# the assignment the cleaned Series is computed and then thrown away.
#
# The order of the steps matters:
#   1. lowercase first, so the later steps see uniform casing;
#   2. expand contractions while the apostrophes are still present;
#   3. strip punctuation before removing stop words, so a stop word followed
#      by punctuation (e.g. "the,") is still recognised;
#   4. stem last, on the remaining words.
preprocessing_steps = (
    lower_text,
    expand_contraction,
    remove_special_symbol,
    remove_stop_words,
    stemming,
)
cleaned_text = dataframe["reviewText"]
for step in preprocessing_steps:
    cleaned_text = cleaned_text.apply(step)
dataframe["reviewText"] = cleaned_text

# Bare expression so the notebook still displays the cleaned column.
dataframe["reviewText"]


I are not


Each review is assigned an overall score from 1.0 to 5.0. Convert those scores to labels where -1 is negative, 0 is neutral and 1 is positive

In [None]:
def sentiment_from_overall(number):
    """Map an overall rating to a sentiment label.

    Returns -1 when ``number`` is below 3.0, 0 when it equals 3.0 and 1
    in every other case.
    """
    if number == 3.0:
        return 0
    return -1 if number < 3.0 else 1

# Derive the sentiment label of every review from its overall rating.
overall_scores = dataframe["overall"]
dataframe["sentiment"] = overall_scores.apply(sentiment_from_overall)

# Visualization

Exploratory data analysis on overall rating column

In [None]:
# Summary statistics of the rating column (shown as the cell output).
overall_column = dataframe["overall"]
overall_column.describe()


Plotting pie chart and bar chart for overall score distribution. The 'overall' column has 5 labels: 1.0, 2.0, 3.0, 4.0, 5.0 where 1.0 is very bad, 3.0 is neutral and 5.0 is very good.

In [None]:
# Number of reviews per rating; computed once and reused by both charts.
rating_counts = dataframe["overall"].value_counts()
index = rating_counts.index.to_numpy()
scores = rating_counts.to_numpy()

# Pie chart: share of reviews per rating.
plt.pie(scores, labels=index, autopct='%1.1f%%')
plt.title("Review distribution per rating")
plt.show()

# Bar chart: absolute number of reviews per rating.
rating_counts.plot(kind='bar', title="Review count per rating")


We compute the same statistics, this time grouped by sentiment.

In [None]:
def sentiment_to_word(number):
    """Translate a numeric sentiment label into a readable word.

    Follows the encoding produced by ``sentiment_from_overall``:
    1 is "Positive", -1 is "Negative" and 0 is "Neutral". Any other value
    also yields "Neutral".
    """
    if number == 1:
        return "Positive"
    if number == -1:
        return "Negative"
    return "Neutral"

# Number of reviews per sentiment label; computed once and reused below.
sentiment_counts = dataframe["sentiment"].value_counts()
index = list(map(sentiment_to_word, sentiment_counts.index.to_numpy()))
scores = sentiment_counts.to_numpy()

# Pie chart: share of reviews per sentiment, labelled with words.
plt.pie(scores, labels=index, autopct='%1.1f%%')
plt.title("Review distribution per sentiment")
plt.show()

# Bar chart: absolute number of reviews per sentiment label.
sentiment_counts.plot(kind='bar', title="Review count per sentiment")


def sentiment_to_word(number):
    """Translate a numeric sentiment label into a readable word.

    Follows the encoding produced by ``sentiment_from_overall``:
    -1 is "Negative", 1 is "Positive" and 0 is "Neutral". Any other value
    also yields "Neutral".
    """
    if number == -1:
        return "Negative"
    if number == 1:
        return "Positive"
    return "Neutral"

# Number of reviews per sentiment label.
scores = dataframe["sentiment"].value_counts()
# NumPy arrays have no .apply(); translate each label with a comprehension.
index = [sentiment_to_word(label) for label in scores.index.to_numpy()]
scores = scores.to_numpy()

# Pie chart: share of reviews per sentiment, labelled with words.
plt.pie(scores, labels=index, autopct='%1.1f%%')
plt.title("Review distribution per sentiment")
plt.show()

Now we compute statistics on the `reviewText` length. To visualize them, we use a boxplot.

In [None]:
# Length, in characters, of every review text.
review_texts = dataframe["reviewText"]
text_length = review_texts.apply(len)

length_summary = text_length.describe()
print("Statistic of text length\n", length_summary)

# Horizontal boxplot of the length distribution.
plt.boxplot(text_length, vert=False)
plt.show()
