In [1]:
import pandas as pd
import numpy as np
import nltk
from tqdm import tqdm
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
# Fetch the NLTK resources this notebook relies on: the 'punkt' sentence
# tokenizer models and the 'stopwords' corpus.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)

# Rebind `stopwords` from the imported corpus reader to a plain set of English
# stop words so later membership tests are fast.
stopwords = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\AJL\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AJL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Path to the pre-trained GloVe embeddings text file; it is passed to
# read_glove_vecs further down in this notebook.
# This is a machine-specific absolute path: edit it to point at your own local
# copy of the file before running the notebook.
GLOVE_LOCATION = r'D:\Downloads\glove.twitter.27B\glove.twitter.27B.200d.txt'

In [4]:
def read_glove_vecs(glove_file):
    """Load GloVe-style word embeddings from a text file.

    Each useful line is expected to hold a token, one ASCII space, and then the
    space-separated components of that token's vector.

    Args:
        glove_file: Path to a UTF-8 encoded embeddings text file.

    Returns:
        A tuple ``(words_to_index, index_to_words, word_to_vec_map)``:
        ``words_to_index`` maps each token to a 1-based index assigned in
        sorted token order, ``index_to_words`` is the inverse mapping, and
        ``word_to_vec_map`` maps each token to a float64 NumPy vector.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    word_to_vec_map = {}
    with open(glove_file, 'r', encoding="utf8") as f:
        for raw_line in f:
            line = raw_line.rstrip('\r\n')
            # Split the token off at the first ASCII space only. A bare
            # str.split() also splits on Unicode whitespace, which silently
            # drops tokens that are themselves whitespace characters (e.g.
            # NBSP) and shifts the first number into the "word" slot.
            curr_word, _, rest = line.partition(' ')
            values = rest.split()
            # Skip blank lines and lines without a vector instead of crashing
            # on them or storing an empty, unusable vector.
            if not curr_word or not values:
                continue
            word_to_vec_map[curr_word] = np.array(values, dtype=np.float64)

    # Indices start at 1 and follow the sorted order of the tokens.
    words_to_index = {
        word: index
        for index, word in enumerate(sorted(word_to_vec_map), start=1)
    }
    index_to_words = {index: word for word, index in words_to_index.items()}
    return words_to_index, index_to_words, word_to_vec_map

In [5]:
def sentence_to_avg(sentence, word_to_vec_map):
    """Average the embedding vectors of the known words in a sentence.

    The sentence is split on whitespace; words missing from
    ``word_to_vec_map`` are ignored.

    Args:
        sentence: Whitespace-separated string of tokens.
        word_to_vec_map: Mapping from token to its embedding vector.

    Returns:
        A 1-D NumPy array holding the mean of the matched vectors, or an
        all-zero vector when no word in the sentence is in the map. Its length
        is the embedding dimension taken from the map (200 if the map is
        empty).
    """
    # Infer the embedding size from the map rather than hard-coding it, so the
    # function works with any embedding dimension. 200 remains the fallback
    # when the map is empty and there is nothing to infer from.
    if word_to_vec_map:
        dim = len(next(iter(word_to_vec_map.values())))
    else:
        dim = 200

    total = np.zeros(dim)
    num_valid_words = 0
    for w in sentence.split():
        if w in word_to_vec_map:
            total += word_to_vec_map[w]
            num_valid_words += 1

    # No known words: return zeros rather than dividing by zero.
    if num_valid_words == 0:
        return np.zeros(dim)
    return total / num_valid_words

In [6]:
def _clean_review(review, tokenizer, stop_words):
    """Normalise one review into a space-joined string of kept tokens.

    The review is split into sentences, each sentence is tokenized, and every
    token is lower-cased. Only purely alphabetic tokens that are not in
    ``stop_words`` are kept.

    Args:
        review: Raw review text.
        tokenizer: Object with a ``tokenize(sentence)`` method returning tokens.
        stop_words: Collection of lower-case words to discard.

    Returns:
        The kept tokens joined by single spaces (an empty string if none).
    """
    kept = []
    for sentence in nltk.tokenize.sent_tokenize(review):
        for token in tokenizer.tokenize(sentence):
            token = token.lower()
            if token.isalpha() and token not in stop_words:
                kept.append(token)
    # Kept tokens are purely alphabetic and joined by spaces, so the result
    # cannot contain newline characters and needs no further stripping.
    return " ".join(kept)


# Load the reviews and clean the text of each one.
df = pd.read_csv("reviews.csv")
reviews = df['text']
tokenizer = nltk.tokenize.TreebankWordTokenizer()
cleaned_reviews = [
    _clean_review(review, tokenizer, stopwords) for review in tqdm(reviews)
]

# Embed each cleaned review as the average of its words' GloVe vectors.
word_to_index, index_to_word, word_to_vec_map = read_glove_vecs(GLOVE_LOCATION)
X = np.array([sentence_to_avg(x, word_to_vec_map) for x in tqdm(cleaned_reviews)])
# Replace any NaN/inf entries with finite numbers.
X = np.nan_to_num(X)

# Shift the integer star ratings down by one (original note: "change to 0-4";
# presumably the stars column is 1-5 -- confirm against the data).
Y = np.array(df['stars']).astype('int') - 1

100%|██████████| 15300/15300 [00:14<00:00, 1078.45it/s]
100%|██████████| 15300/15300 [00:01<00:00, 10241.91it/s]


In [8]:
# Build TF-IDF features from the cleaned review strings: fit a vectorizer with
# default settings on cleaned_reviews and transform them in one step.
vectorizer = TfidfVectorizer()
X_tfid = vectorizer.fit_transform(cleaned_reviews)

In [11]:
# Persist the feature matrices and labels as .npy files (relative paths, so
# they are written to the current working directory).
np.save('X.npy', X)
# NOTE(review): X_tfid comes from TfidfVectorizer.fit_transform, which per the
# scikit-learn docs returns a SciPy sparse matrix. np.save would then store it
# as a pickled object array (loading needs allow_pickle=True). Consider
# scipy.sparse.save_npz instead -- TODO confirm with whatever loads this file.
np.save('X_tfid.npy', X_tfid)
np.save('Y.npy', Y)