# Text representation

In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt') #tokenize
nltk.download('wordnet') #lemmatize
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger') #POS part of speech
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/menghsuanlee/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/menghsuanlee/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/menghsuanlee/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/menghsuanlee/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/menghsuanlee/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [22]:
# import data
col_name = ['id', 'review']
df = pd.read_csv('IA2.csv', names=col_name)

In [11]:
rev_lst = df['review'].tolist()

## 1. Tokenize each review in the collection

In [20]:
tokens = df['review'].apply(nltk.word_tokenize)

In [24]:
print(tokens)

0     [This, is, the, place, to, go, after, this, br...
1     [The, direct, delivery, from, Star, Theater, i...
2     [It, is, very, delicious, food, ., Good, taste...
3     [This, house, is, the, second, time, ., I, use...
4     [Featured, is, majestic, ., Located, on, the, ...
                            ...                        
95    [Perhaps, ,, the, old, people, in, this, film,...
96    [There, are, many, handsome, guys, ,, many, be...
97    [The, first, half, is, still, okay, ,, it, see...
98    [Like, most, of, the, mango-produced, thunder,...
99    [Some, of, Liu, Zhenyun, ’, s, works, have, be...
Name: review, Length: 100, dtype: object


## 2. Lemmatize all the words

In [27]:
lemmatizer = nltk.stem.WordNetLemmatizer()
lmt = []

for review in tokens:
    temp = []
    for token in review:
        if token.isalpha():
            temp.append(lemmatizer.lemmatize(token))
    lmt.append(temp)
print(lmt)

[['This', 'is', 'the', 'place', 'to', 'go', 'after', 'this', 'branch', 'to', 'the', 'afternoon', 'not', 'much', 'people', 'Now', 'the', 'popular', 'menu', 'is', 'ice', 'crystal', 'Hes', 'not', 'long', 'Feel', 'very', 'sorry', 'T', 'T', 'This', 'is', 'a', 'full', 'set', 'of', 'Chocolate', 'Mud', 'Brownie', 'Chocolate', 'Brownie', 'Sprinkle', 'with', 'vanilla', 'ice', 'cream', 'and', 'whipped', 'cream', 'This', 'is', 'a', 'very', 'hot', 'menu', 'Go', 'to', 'Ferrero', 'Honey', 'Toast', 'Honey', 'Toro', 'topped', 'with', 'nut', 'Topping', 'up', 'is', 'Ferrero', 'Fan', 'club', 'We', 'are', 'This', 'is', 'a', 'good', 'service', 'It', 's', 'a', 'little', 'car', 'park', 'If', 'the', 'parking', 'lot', 'of', 'the', 'building', 'will', 'be', 'stamped', 'the', 'first', 'hr', 'baht', 'sit', 'a', 'long', 'time', 'But', 'if', 'you', 'park', 'other', 'building', 'it', 'will', 'be', 'But', 'it', 's', 'just', 'this', 'parking', 'lot', 'is', 'already', 'low'], ['The', 'direct', 'delivery', 'from', 'Star'

## 3. Remove all the stop‐words and the punctuations 

In [29]:
# append token to list if the token is not a stopword
stopwords_removed = []
for review in lmt:
    tmp = []
    for token in review:
        if not token in stopwords.words('english'):
            tmp.append(token)
    stopwords_removed.append(tmp)

print(stopwords_removed)

[['This', 'place', 'go', 'branch', 'afternoon', 'much', 'people', 'Now', 'popular', 'menu', 'ice', 'crystal', 'Hes', 'long', 'Feel', 'sorry', 'T', 'T', 'This', 'full', 'set', 'Chocolate', 'Mud', 'Brownie', 'Chocolate', 'Brownie', 'Sprinkle', 'vanilla', 'ice', 'cream', 'whipped', 'cream', 'This', 'hot', 'menu', 'Go', 'Ferrero', 'Honey', 'Toast', 'Honey', 'Toro', 'topped', 'nut', 'Topping', 'Ferrero', 'Fan', 'club', 'We', 'This', 'good', 'service', 'It', 'little', 'car', 'park', 'If', 'parking', 'lot', 'building', 'stamped', 'first', 'hr', 'baht', 'sit', 'long', 'time', 'But', 'park', 'building', 'But', 'parking', 'lot', 'already', 'low'], ['The', 'direct', 'delivery', 'Star', 'Theater', 'Chiang', 'Mai', 'I', 'guarantee', 'really', 'cool', 'There', 'variety', 'flavor', 'choose', 'baht', 'per', 'piece', 'taste', 'choose', 'buy', 'buy'], ['It', 'delicious', 'food', 'Good', 'taste', 'Unspoilt', 'For', 'people', 'like', 'eat', 'spicy', 'much', 'cooked', 'The', 'shop', 'ha', 'prepared', 'Chil

## 4. Based on the output in step 3, convert each of the reviews in TD‐IDF vectors. The minimal document frequency for each term is 3. Also, include 2‐gram; 

In [41]:
# concat tokens in step 3 into a string for each review
new_corpus = []
for review in stopwords_removed:
    a = ' '.join(review)
    new_corpus.append(a)

In [42]:
tfidf = TfidfVectorizer(ngram_range = (1,2), min_df = 3 )
tfidf.fit(new_corpus)

TfidfVectorizer(min_df=3, ngram_range=(1, 2))

In [137]:
# covert each review into tfidf
vector = tfidf.transform(new_corpus)
print(vector.toarray())

[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.09655085 0.         0.03849301 ... 0.         0.07698601 0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [138]:
# dimesion of the vector
vector.toarray().shape

(100, 1442)

In [139]:
# to csv
arr = vector.toarray()
pd.DataFrame(arr).to_csv('IA2_p1_step4.csv')  

## 5. Based on the output in step 1, POS‐tag each word and do a TD‐IDF vectorization, the minimal document frequency for each term is 4

In [49]:
# create 'word_POStag'
pos = [] # store the tagged doc
for review in tokens:
    pos_token_doc = nltk.pos_tag(review)
    temp = [] # store the token concat POS tag
    for i in pos_token_doc:
        temp.append(i[0]+'_'+i[1])
    pos.append(" ".join(temp)) # turn into a doc, use space the separate
print(pos)



In [56]:
# TFIDF
tfidf2 = TfidfVectorizer(min_df = 4)
tfidf2.fit(pos)
vector2 = tfidf2.transform(pos)
print(vec.toarray())

[[0.         0.06519163 0.         ... 0.         0.         0.        ]
 [0.         0.         0.26085303 ... 0.         0.2389417  0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.12111251 0.         0.        ]
 [0.         0.         0.         ... 0.01350436 0.01781165 0.        ]
 [0.         0.         0.07705607 ... 0.         0.         0.        ]]


In [57]:
# dimesion of the vector
vector2.toarray().shape

(100, 906)

In [136]:
# to csv
arr = vector2.toarray()
pd.DataFrame(arr).to_csv('IA2_p1_step5.csv')  