In [1]:
!pip install --upgrade pip

Requirement already up-to-date: pip in /usr/local/lib/python3.6/dist-packages (20.0.2)


In [2]:
!pip install pandas



In [3]:
!pip install nltk



In [4]:
import nltk
# Fetch the NLTK resources used further down in this notebook
nltk.download('stopwords')  # English stop-word list read through stopwords.words('english')
nltk.download('punkt')  # presumably the models behind word_tokenize — confirm for the installed NLTK version
nltk.download('wordnet')  # presumably the corpus behind WordNetLemmatizer — confirm

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
# Load the libraries
import pandas as pd
import numpy as np
import re
from multiprocessing import  Pool

# Import from nltk
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import bigrams, FreqDist

## 1 Load in the data and create the required structure

In [6]:
# Read the three Yelp CSV files that live in the dataset directory
load_path = '../Datasets/yelp/'
labelled_data, test_data, unlabelled_data = (
    pd.read_csv(load_path + file_name)
    for file_name in ('labeled_data.csv', 'test_data.csv', 'unlabeled_data.csv')
)

In [7]:
# Tag the rows that have no real label before the frames are joined:
# unlabelled reviews get label 0 and test reviews get label 6.
# NOTE(review): presumably the real labels are star ratings 1-5, so 0 and 6 act as
# sentinels that allow the three sources to be separated again later — confirm.
unlabelled_data['label'] = 0
test_data['label'] = 6

# Have a look at the first 5 rows
unlabelled_data.head(n=5)

Unnamed: 0,text,label
0,Had a good experience when my wife and I sat a...,0
1,On my first to Montreal with my gf we came her...,0
2,One of our favorite places to go when it's col...,0
3,"The doctor was very nice, got in in a good amo...",0
4,The Nook is an immediate phoenix staple! I ca...,0


In [8]:
# Join the dataframes
# pd.concat replaces the chained DataFrame.append calls: append was deprecated in
# pandas 1.4 and removed in pandas 2.0. Row order (labelled, test, unlabelled) and
# the original per-frame indices are kept, exactly as append did.
all_data = pd.concat([labelled_data, test_data, unlabelled_data])

# Look at the length of all_data
print('Length of dataframe: ', len(all_data))

Length of dataframe:  700000


## 2 Tokenize

#### Get stop words

In [9]:
# Have a look at the stop words
# stop_words is a plain Python list; a later cell removes entries from it
stop_words = stopwords.words('english') # NLTK stop_words
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [10]:
# NOTE(review): this cell looks redundant — nltk is already imported and the
# 'stopwords' corpus is already downloaded in the NLTK setup cell near the top,
# and stopwords.words('english') has already been called in the previous cell.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
# Remove certain words from the list of stop words
# (negations and intensifiers such as "not", "too", "very" then survive tokenization)
words_to_remove = ["not", "too", "very", "don", "don't", "should", "should've", "aren", "aren't", 
                   "couldn", "couldn't", "didn", "didn't", "doesn", "doesn't", "hadn", "hadn't", 
                   "hasn", "hasn't", "haven", "haven't", "isn", "isn't", "shouldn", "shouldn't", 
                   "wasn", "wasn't", "weren", "weren't", "won", "won't", "wouldn", "wouldn't", 
                   "shan", "shan't", "mightn", "mightn't", "mustn", "mustn't", "needn", "needn't"]
# Filter instead of calling list.remove(): remove() raises ValueError as soon as one
# word is missing, which happens when this cell is run twice or when the stop-word
# list does not contain every entry above. Slice assignment keeps the same list object.
words_to_remove_lookup = set(words_to_remove)
stop_words[:] = [word for word in stop_words if word not in words_to_remove_lookup]

In [12]:
# Have a look at the stop words again, after the removal cell above has run
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'only', 'own', 'same', 'so', 'than', 's', 't'

#### Create the tokenizer

In [13]:
# Lemmatizer instance shared by every tokenize() call
lemmatizer = WordNetLemmatizer()

# Create the tokenizer
def tokenize(review):
    """Turn one review into a space-separated string of lemmatized tokens.

    The text is lower-cased and split with NLTK's word_tokenize. A token is
    dropped when it contains no word character (letter, digit or underscore)
    or when it is listed in the module-level ``stop_words``. Every remaining
    token is passed through the module-level ``lemmatizer``.

    Parameters
    ----------
    review : str
        Raw review text.

    Returns
    -------
    str
        The kept tokens joined with single spaces.
    """
    kept = []
    for candidate in word_tokenize(review.lower()):
        if re.search(r'\w{1,}', candidate) is None:
            continue  # nothing but punctuation / symbols
        if candidate in stop_words:
            continue  # stop word
        kept.append(lemmatizer.lemmatize(candidate))

    return ' '.join(kept)

#### Tokenize the reviews

In [14]:
# Create parallelization function
def parallelize_dataframe(df, func, n_cores=12):
    """Apply ``func`` to row-wise chunks of ``df`` in separate worker processes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process.
    func : callable
        Called once per chunk with a DataFrame and expected to return a
        DataFrame. It is handed to ``Pool.map``, so it has to be picklable.
    n_cores : int, default 12
        Number of chunks the frame is split into and number of worker processes.

    Returns
    -------
    pandas.DataFrame
        The per-chunk results concatenated in their original order.
    """
    # Split the row positions rather than the frame itself: np.array_split on a
    # DataFrame goes through DataFrame.swapaxes, which pandas has deprecated.
    # Chunk sizes are the same as np.array_split(df, n_cores) would produce.
    position_chunks = np.array_split(np.arange(len(df)), n_cores)
    df_split = [df.iloc[positions] for positions in position_chunks]

    pool = Pool(n_cores)
    try:
        results = pool.map(func, df_split)
    finally:
        # Release the worker processes even when func raises
        pool.close()
        pool.join()
    return pd.concat(results)

In [15]:
# Create function that tokenizes the 'text' column of a dataframe chunk
def prallel_tokenize(df):
    """Return ``df`` with every entry of its 'text' column passed through ``tokenize``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame (or chunk of a frame) that has a 'text' column of review strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns in the same order; 'text' holds the
        tokenized reviews as strings.
    """
    # assign() builds a new frame instead of writing into df, which may be a slice
    # of the caller's dataframe (avoids SettingWithCopyWarning and hidden mutation).
    return df.assign(text=df['text'].apply(tokenize))

In [16]:
# Tokenize the reviews
# NOTE: all_data is overwritten with its tokenized version, so running this cell
# a second time tokenizes the already tokenized text again.
all_data = parallelize_dataframe(all_data, prallel_tokenize)

In [17]:
# Have a look at the top 3 rows
all_data.head(n=3)

Unnamed: 0,label,text
0,4,new rule waiting table almost always cant wait...
1,3,flirted giving two star 's pretty damning rati...
2,5,staying planet hollywood across street saw goo...


## 3 Add ngrams

In [18]:
# Work on a copy so that all_data keeps the plain tokenized reviews
data_with_ngrams = all_data.copy()

# Make sure every review is a string
data_with_ngrams['text'] = data_with_ngrams['text'].apply(str)

# List that collects every bigram merged into a review
bigrams_used = list()

In [19]:
# Function to add bi-grams
def add_bi_grams(review, bi_grams=None, used=None):
    """Merge known bigrams inside a review into single tokens.

    Every occurrence of ``"first second"`` as two whole, space-separated tokens
    is replaced by ``"firstsecond"``.

    Parameters
    ----------
    review : str
        Space-separated tokens of one review.
    bi_grams : iterable of 2-tuples of str, optional
        Bigrams to merge. Defaults to the module-level ``bi_grams_to_use``.
    used : list, optional
        List that receives each bigram found in the review (one entry per
        bigram per review). Defaults to the module-level ``bigrams_used``.

    Returns
    -------
    str
        The review with the bigrams merged.
    """
    if bi_grams is None:
        bi_grams = bi_grams_to_use
    if used is None:
        used = bigrams_used

    # Pad with spaces so that only whole tokens can match
    padded = f' {review} '
    for bi_gram in bi_grams:
        checkfor_bi_gram = f' {" ".join(bi_gram)} '
        if checkfor_bi_gram not in padded:
            continue
        merged = f' {"".join(bi_gram)} '
        # str.replace only handles non-overlapping matches, so of two back-to-back
        # occurrences ("good food good food") the second one is skipped because it
        # shares its leading space with the first. Repeat until none is left; each
        # replacement shortens the string, so the loop terminates.
        while checkfor_bi_gram in padded:
            padded = padded.replace(checkfor_bi_gram, merged)
        used.append(bi_gram)
    return padded[1:-1]

In [20]:
n = 200 # number of bigrams to be added in each iteration
times = 15 # number of iterations
for i in range(times):
    # Flatten every review into a single token list.
    # NOTE: the reviews are concatenated first, so token pairs that span two
    # consecutive reviews are counted as bigrams as well.
    reviews_one_long_list = ' '.join(data_with_ngrams['text']).split(' ')

    # Report the current vocabulary size
    print(f'{i + 1}. Unique tokens: {len(set(reviews_one_long_list))}')

    # Count all adjacent token pairs
    fdist_bi_grams = FreqDist(bigrams(reviews_one_long_list))

    # Keep the n most frequent pairs; add_bi_grams reads bi_grams_to_use as a global
    top_n_bigrams = fdist_bi_grams.most_common(n)
    bi_grams_to_use = [pair for pair, _count in top_n_bigrams]

    # Report that the selection step is done
    print(f'{i + 1}. Top {n} bigrams found')

    # Merge the selected bigrams inside every review
    data_with_ngrams['text'] = data_with_ngrams['text'].apply(add_bi_grams)

    # Report that this round is finished
    print(f'{i + 1}. Bigrams added')

# Flatten the reviews one last time
reviews_one_long_list = ' '.join(data_with_ngrams['text']).split(' ')

# Report the final vocabulary size
print(f'Final. Unique tokens: {len(set(reviews_one_long_list))}')

1. Unique tokens: 357816
1. Top 200 bigrams found
1. Bigrams added
2. Unique tokens: 357963
2. Top 200 bigrams found
2. Bigrams added
3. Unique tokens: 358117
3. Top 200 bigrams found
3. Bigrams added
4. Unique tokens: 358277
4. Top 200 bigrams found
4. Bigrams added
5. Unique tokens: 358441
5. Top 200 bigrams found
5. Bigrams added
6. Unique tokens: 358607
6. Top 200 bigrams found
6. Bigrams added
7. Unique tokens: 358790
7. Top 200 bigrams found
7. Bigrams added
8. Unique tokens: 358962
8. Top 200 bigrams found
8. Bigrams added
9. Unique tokens: 359141
9. Top 200 bigrams found
9. Bigrams added
10. Unique tokens: 359319
10. Top 200 bigrams found
10. Bigrams added
11. Unique tokens: 359502
11. Top 200 bigrams found
11. Bigrams added
12. Unique tokens: 359681
12. Top 200 bigrams found
12. Bigrams added
13. Unique tokens: 359865
13. Top 200 bigrams found
13. Bigrams added
14. Unique tokens: 360050
14. Top 200 bigrams found
14. Bigrams added
15. Unique tokens: 360234
15. Top 200 bigrams f

In [21]:
# Save the bigrams to csv
# bigrams_used holds one entry per (review, bigram) hit; keep only the first
# occurrence of every bigram, in the order in which it was first used.
not_added_yet = set(bigrams_used)
unique_bigrams_used = []
for bi_gram in bigrams_used:
    # No try/except here: the membership test already guards the remove() call,
    # and a bare except would hide genuine errors as well as KeyboardInterrupt.
    if bi_gram in not_added_yet:
        unique_bigrams_used.append(list(bi_gram))
        not_added_yet.remove(bi_gram)
# One row per bigram; columns 0 and 1 hold its first and second token
unique_bigrams_used = pd.DataFrame(unique_bigrams_used, columns=[0,1])
unique_bigrams_used.to_csv('./bigrams.csv', index=False)

## 4 Remove rare tokens

In [22]:
# Flatten all reviews into one token list and count every token
all_tokens = ' '.join(data_with_ngrams['text']).split(' ')
tokens_count = FreqDist(all_tokens)

# Number of distinct tokens
len(set(all_tokens))

360419

In [23]:
# Keep the tokens that occur more often than cut_off times.
# Collecting the common tokens (instead of the rare ones) speeds up processing.
cut_off = 2
common_tokens = [token for token, count in tokens_count.items() if count > cut_off]

# Show how many tokens are kept
print(len(common_tokens))

# Map every common token to itself so that lookups are fast
common_tokens_dict = dict(zip(common_tokens, common_tokens))

103401


In [24]:
# Create function to remove rare tokens
def remove_rare_tokens(string, keep=None):
    """Drop every token of ``string`` that is not part of the kept vocabulary.

    Parameters
    ----------
    string : str
        Space-separated tokens of one review.
    keep : container of str, optional
        Tokens to keep (anything supporting ``in``, e.g. a dict or a set).
        Defaults to the module-level ``common_tokens_dict``.

    Returns
    -------
    str
        The kept tokens, in their original order, joined with single spaces.
    """
    if keep is None:
        keep = common_tokens_dict
    # A membership test replaces the former bare try/except around the dict
    # lookup, which silently swallowed every exception and not just KeyError.
    return ' '.join(token for token in string.split(' ') if token in keep)

In [25]:
# Start from a copy so that data_with_ngrams stays untouched
data_processed = data_with_ngrams.copy()

# Keep only the common tokens of every review
data_processed['text'] = data_processed['text'].map(remove_rare_tokens)

In [26]:
# Write the fully processed reviews (bigrams merged, rare tokens removed) to csv
save_path = '../Datasets/yelp/'
data_processed.to_csv(f'{save_path}data_processed.csv', index=False)