In [None]:
!pip install sister

In [None]:
import ast
import time

import pandas as pd
import sister

import preprocessing

## Preprocessing

### 1. Preprocessing with only tokenization and lemmatization

In [None]:
# Preprocess whole dataset (tokenization + lemmatization, see section heading)
df = pd.read_csv("../data/Hotel_reviews_features_selected.csv")

# Options handed to preprocessing.preprocess for every review.
# Deliberately NOT named `dict`: binding that name would shadow the builtin
# `dict` type for every cell executed afterwards in this kernel.
preprocess_options = {
    "token": True, #mandatory True
    "token_sentenceSeperate":False,
    "token_includePunctation":False,
    "token_excludeSpecPuct" :[],
    "remStpwrds": False,
    "stemm": False,
    "lemmatize": True,
    "nGram": False,
    "nGram_length":2
}

# The option set is encoded in the file name: each key is immediately followed
# by its value, and the key/value pairs are joined with "_".
pre_processed_data_path = (
    "../data/preprocessed/feature_generated/fasttext/fast_text_"
    + "_".join(str(key) + str(value) for key, value in preprocess_options.items())
    + ".csv"
)

df["Review"] = df["Review"].apply(
    lambda review: preprocessing.preprocess(review, preprocess_options)
)
# Note: with the pandas default (index=True) the row index is written as an
# unnamed first column of the CSV.
df.to_csv(pre_processed_data_path)



In [None]:
# Preprocess 1000 from dataset (tokenization + lemmatization, see section heading)
df = pd.read_csv("../data/Hotel_reviews_features_selected.csv")
df = df.head(1000)  # keep only the first 1000 rows for the limited run

# Options handed to preprocessing.preprocess for every review.
# Deliberately NOT named `dict`: binding that name would shadow the builtin
# `dict` type for every cell executed afterwards in this kernel.
preprocess_options = {
    "token": True, #mandatory True
    "token_sentenceSeperate":False,
    "token_includePunctation":False,
    "token_excludeSpecPuct" :[],
    "remStpwrds": False,
    "stemm": False,
    "lemmatize": True,
    "nGram": False,
    "nGram_length":2
}

# The option set is encoded in the file name: each key is immediately followed
# by its value, and the key/value pairs are joined with "_".
pre_processed_limit_data_path = (
    "../data/preprocessed/feature_generated/fasttext/fast_text_limit_"
    + "_".join(str(key) + str(value) for key, value in preprocess_options.items())
    + ".csv"
)

df["Review"] = df["Review"].apply(
    lambda review: preprocessing.preprocess(review, preprocess_options)
)
# Note: with the pandas default (index=True) the row index is written as an
# unnamed first column of the CSV.
df.to_csv(pre_processed_limit_data_path)



### 2. Preprocessing with only tokenization, stop-word removal and lemmatization

In [None]:
# Preprocess whole dataset (tokenization + stop-word removal + lemmatization, see section heading)
df = pd.read_csv("../data/Hotel_reviews_features_selected.csv")

# Options handed to preprocessing.preprocess for every review; compared with
# section 1 only "remStpwrds" is switched on.
# Deliberately NOT named `dict`: binding that name would shadow the builtin
# `dict` type for every cell executed afterwards in this kernel.
preprocess_options = {
    "token": True, #mandatory True
    "token_sentenceSeperate":False,
    "token_includePunctation":False,
    "token_excludeSpecPuct" :[],
    "remStpwrds": True,
    "stemm": False,
    "lemmatize": True,
    "nGram": False,
    "nGram_length":2
}

# The option set is encoded in the file name: each key is immediately followed
# by its value, and the key/value pairs are joined with "_".
pre_processed_swr_data_path = (
    "../data/preprocessed/feature_generated/fasttext/fast_text_"
    + "_".join(str(key) + str(value) for key, value in preprocess_options.items())
    + ".csv"
)

df["Review"] = df["Review"].apply(
    lambda review: preprocessing.preprocess(review, preprocess_options)
)
# Note: with the pandas default (index=True) the row index is written as an
# unnamed first column of the CSV.
df.to_csv(pre_processed_swr_data_path)

In [None]:
# Preprocess 1000 from dataset (tokenization + stop-word removal + lemmatization, see section heading)
df = pd.read_csv("../data/Hotel_reviews_features_selected.csv")
df = df.head(1000)  # keep only the first 1000 rows for the limited run

# Options handed to preprocessing.preprocess for every review; compared with
# section 1 only "remStpwrds" is switched on.
# Deliberately NOT named `dict`: binding that name would shadow the builtin
# `dict` type for every cell executed afterwards in this kernel.
preprocess_options = {
    "token": True, #mandatory True
    "token_sentenceSeperate":False,
    "token_includePunctation":False,
    "token_excludeSpecPuct" :[],
    "remStpwrds": True,
    "stemm": False,
    "lemmatize": True,
    "nGram": False,
    "nGram_length":2
}

# The option set is encoded in the file name: each key is immediately followed
# by its value, and the key/value pairs are joined with "_".
pre_processed_swr_limit_data_path = (
    "../data/preprocessed/feature_generated/fasttext/fast_text_limit_"
    + "_".join(str(key) + str(value) for key, value in preprocess_options.items())
    + ".csv"
)

df["Review"] = df["Review"].apply(
    lambda review: preprocessing.preprocess(review, preprocess_options)
)
# Note: with the pandas default (index=True) the row index is written as an
# unnamed first column of the CSV.
df.to_csv(pre_processed_swr_limit_data_path)

## Test sister library
Sister provides an embedder that generates feature vectors for text.

In [None]:
# Smoke test: embed one short sentence with sister and print the resulting vector
embedder = sister.MeanEmbedding(lang="en")

# Tokens are joined into a single space-separated sentence before embedding.
test_word = ["pizza", "is", "like", "my", "familiy"]
vector = embedder(" ".join(test_word))
print(vector)


## Word to vector approach
We first tested an approach that generates one vector for each token in a review.
Because of its high storage and computing costs, we decided to generate a single vector per review instead.

In [None]:
def generate_vector_list(wordList, embedder):
    """Convert a word list into a list of embedding vectors.

    Args:
        wordList: Iterable whose items are passed to ``embedder`` one at a time.
        embedder: Callable that maps a single item to its embedding.

    Returns:
        A list holding ``embedder(item)`` for each item, in iteration order.
    """
    return [embedder(word) for word in wordList]

In [None]:
# A try with limited data of 1000 to generate the feature took 30 minutes and 1 GB storage
embedder = sister.MeanEmbedding(lang="en")
df = pd.read_csv(pre_processed_limit_data_path)
# time.process_time() reports CPU time of the current process, not wall-clock time.
start = time.process_time()
# The per-token embedding step below is left commented out as a record of the
# experiment; with it disabled, the printed duration only covers the (empty)
# span between the two process_time() calls.
# NOTE(review): `review` comes back from read_csv as a string, so
# generate_vector_list would iterate over its characters — confirm before re-enabling.
#df["Review"] = df["Review"].apply(lambda review: generate_vector_list(review, embedder))
#df.to_csv("../data/preprocessed/feature_generated/fasttext/fast_text_limit_unigram_features.csv")
print(time.process_time() - start)


## Review to vector approach
This is the actual feature generation, where we generate one vector for each review.
It produced circa 2 GB of data and took about 40 minutes.

### 1. Preprocessing (tokenization, lemmatization)

In [None]:
# Generate one embedding vector per review from the data preprocessed without
# stop-word removal, and store the result as CSV.
embedder = sister.MeanEmbedding(lang="en")
df = pd.read_csv(pre_processed_data_path)
# time.process_time() reports CPU time of the current process, not wall-clock time.
start = time.process_time()
# After the CSV round trip every "Review" cell is a *string*, not a list.
# Iterating over that string (as `" ".join([x for x in review])` did) yields
# single characters, so the embedder was fed letter-by-letter text instead of
# the review's words. Parse the stored literal back into its tokens first.
# NOTE(review): assumes preprocessing.preprocess returned a list of string
# tokens that to_csv stored as its repr (e.g. "['nice', 'hotel']") — confirm.
df["Review"] = df["Review"].apply(
    lambda review: embedder(" ".join(ast.literal_eval(review)))
)
df.to_csv("../data/preprocessed/feature_generated/fasttext/fast_text_nonswr_features.csv")
display(df)
print(time.process_time() - start)



### 2. Preprocessing (tokenization, lemmatization, stop-word removal)

In [None]:
# Generate one embedding vector per review from the data preprocessed with
# stop-word removal, and store the result as CSV.
embedder = sister.MeanEmbedding(lang="en")
df = pd.read_csv(pre_processed_swr_data_path)
# time.process_time() reports CPU time of the current process, not wall-clock time.
start = time.process_time()
# After the CSV round trip every "Review" cell is a *string*, not a list.
# Iterating over that string (as `" ".join([x for x in review])` did) yields
# single characters, so the embedder was fed letter-by-letter text instead of
# the review's words. Parse the stored literal back into its tokens first.
# NOTE(review): assumes preprocessing.preprocess returned a list of string
# tokens that to_csv stored as its repr (e.g. "['nice', 'hotel']") — confirm.
df["Review"] = df["Review"].apply(
    lambda review: embedder(" ".join(ast.literal_eval(review)))
)
df.to_csv("../data/preprocessed/feature_generated/fasttext/fast_text_swr_features.csv")
display(df)
print(time.process_time() - start)
