In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the review data set; the first CSV column becomes the row index.
data = pd.read_csv(
    '../att+bestbuy.csv',
    index_col=0,
)

In [4]:
# Preview the first rows to sanity-check what was loaded.
data.head()

Unnamed: 0,0,review_text
0,at&t,I’ve been with Apple since day 1 of 2007 and t...
1,at&t,"I went to this from a 6 and no matter what, tr..."
2,at&t,Love the phone and it’s upgrades however it is...
3,at&t,I was SUPER excite about getting this phone. P...
4,at&t,More hype than substance. While the facial rec...


In [5]:
# Count how many rows carry each distinct value of the first column
# (the output below shows the labels "bestbuy" and "at&t").
data.iloc[:,0].value_counts()

bestbuy    5226
at&t        472
Name: 0, dtype: int64

In [6]:
# Remove punctuation

In [7]:
# Translation table for str.translate(), used to strip punctuation.
matrix = str.maketrans(
    ",\"",        # each character listed here ...
    "  ",         # ... is replaced by a space (matched position by position)
    "'’.()/-?!",  # these characters are deleted outright
)

In [8]:
# Apply the `matrix` translation table to every review through pandas'
# string accessor rather than a per-row Python lambda. For string values the
# result is the same; missing values stay NaN instead of raising
# AttributeError inside the lambda.
data["review_text"] = data["review_text"].str.translate(matrix)

In [9]:
# Preview the reviews after the punctuation translation has been applied.
data.head()

Unnamed: 0,0,review_text
0,at&t,Ive been with Apple since day 1 of 2007 and th...
1,at&t,I went to this from a 6 and no matter what tr...
2,at&t,Love the phone and its upgrades however it is ...
3,at&t,I was SUPER excite about getting this phone Pr...
4,at&t,More hype than substance While the facial reco...


In [10]:
# Tokenize

In [11]:
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

# Tokenizer used for the reviews; preserve_case=False means tokens come out
# lower-cased (see the token lists in the checkpoint output further down).
tweet = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
# NOTE: this rebinds the imported `stopwords` name to a plain set of English
# stop words, so the NLTK corpus object is no longer reachable under that name
# afterwards. Later cells use `stopwords` as this set.
stopwords = set(stopwords.words('english'))

In [12]:
# Replace each review string with the list of tokens produced by `tweet`.
# This overwrites the column in place, so the raw text is no longer available
# in `data` after this cell runs.
data["review_text"] = data["review_text"].transform(tweet.tokenize)

In [13]:
# Remove stopwords

In [14]:
# Keep only the tokens that are not in the English stop-word set.
data["review_text"] = data["review_text"].transform(
    lambda tokens: [word for word in tokens if word not in stopwords]
)

In [15]:
# Checkpoint

In [16]:
# Checkpoint: show the first 10 rows of every group of column "0".
data.groupby("0").head(10)

Unnamed: 0,0,review_text
0,at&t,"[ive, apple, since, day, 1, 2007, x, far, best..."
1,at&t,"[went, 6, matter, transfer, one, phone, anothe..."
2,at&t,"[love, phone, upgrades, however, fragile, diff..."
3,at&t,"[super, excite, getting, phone, preordered, wa..."
4,at&t,"[hype, substance, facial, recognition, system,..."
5,at&t,"[saw, first, ad, iphone, x, like, want, phone,..."
6,at&t,"[cant, understand, negative, reviews, owned, e..."
7,at&t,"[picked, x, release, date, worried, home, butt..."
8,at&t,"[durable, glass, ever, laugh, attached, pic, i..."
9,at&t,"[within, hour, getting, phone, setting, saw, t..."


In [17]:
# First attempt

In [18]:
from nltk.stem.snowball import SnowballStemmer

In [19]:
stemmer = SnowballStemmer('english')

def stemming(tokens):
    """Return a new list with every token stemmed, leaving 'iphone' untouched.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single review.

    Returns
    -------
    list of str
        One output token per input token, in the same order.
    """
    excluded = {'iphone'}
    stemmed = []
    for token in tokens:
        if token in excluded:
            # Protected word: keep it verbatim instead of stemming it.
            stemmed.append(token)
        else:
            stemmed.append(stemmer.stem(token))
    return stemmed

In [20]:
# Preview only: the stemmed result is displayed but not assigned back,
# so `data` is left unchanged by this cell.
data["review_text"].transform(stemming)

0       [ive, appl, sinc, day, 1, 2007, x, far, best, ...
1       [went, 6, matter, transfer, one, phone, anoth,...
2       [love, phone, upgrad, howev, fragil, difficult...
3       [super, excit, get, phone, preorder, wait, anx...
4       [hype, substanc, facial, recognit, system, nic...
5       [saw, first, ad, iphone, x, like, want, phone,...
6       [cant, understand, negat, review, own, everi, ...
7       [pick, x, releas, date, worri, home, button, w...
8       [durabl, glass, ever, laugh, attach, pic, ipho...
9       [within, hour, get, phone, set, saw, there, 8m...
10      [appl, done, 4k, 60fps, 4k, 24fps, one, even, ...
11      [like, iphon, ive, own, phone, function, perfe...
12      [respond, comment, earlier, post, iphone, x, s...
13      [surf, internet, realli, truli, fast, almost, ...
14      [admit, scare, order, phone, went, far, make, ...
15      [hate, new, phone, need, remodel, everyth, thi...
16      [love, screen, clariti, im, hard, time, let, g...
17      [ive, 

In [21]:
# Second attempt, yay

In [22]:
from nltk.stem import WordNetLemmatizer
# Shared lemmatizer instance used by `lemming` below.
lemmatizer = WordNetLemmatizer()

In [23]:
def lemming(tokens):
    """Lemmatize every token, treating each one as a verb (pos="v").

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single review.

    Returns
    -------
    list of str
        One lemma per input token, in the same order.
    """
    lemmas = []
    for token in tokens:
        lemmas.append(lemmatizer.lemmatize(token, pos="v"))
    return lemmas

In [24]:
# Overwrite the token lists with their lemmatized versions (see `lemming`).
data["review_text"] = data["review_text"].transform(lemming)

In [25]:
# Show the first 10 rows of every group of column "0" after lemmatization.
data.groupby("0").head(10)

Unnamed: 0,0,review_text
0,at&t,"[ive, apple, since, day, 1, 2007, x, far, best..."
1,at&t,"[go, 6, matter, transfer, one, phone, another,..."
2,at&t,"[love, phone, upgrade, however, fragile, diffi..."
3,at&t,"[super, excite, get, phone, preordered, wait, ..."
4,at&t,"[hype, substance, facial, recognition, system,..."
5,at&t,"[saw, first, ad, iphone, x, like, want, phone,..."
6,at&t,"[cant, understand, negative, review, own, ever..."
7,at&t,"[pick, x, release, date, worry, home, button, ..."
8,at&t,"[durable, glass, ever, laugh, attach, pic, iph..."
9,at&t,"[within, hour, get, phone, set, saw, theres, 8..."


In [26]:
# Custom tokens
def custom_lemming(tokens):
    """Merge product-specific token sequences into single canonical tokens.

    Rules, applied to each token in order (``previous`` is the raw input
    token immediately before it):

    * ``"iphone"`` is dropped; it only serves as context for the next token.
    * ``"x"`` and ``"10"`` become ``"iphoneX"``.
    * ``"6"`` / ``"7"`` become ``"iphone6"`` / ``"iphone7"`` when they directly
      follow ``"iphone"``; otherwise they are dropped.
    * ``"face"`` becomes ``"faceid"``.
    * ``"id"`` is dropped. When it follows ``"face"`` the pair has already been
      emitted as a single ``"faceid"``; emitting it again would count the
      feature twice for every "face id" mention.
    * Every other token is kept unchanged.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of a single review.

    Returns
    -------
    list of str
        The rewritten token list.
    """
    # Model numbers that only make sense right after the word "iphone".
    model_tokens = {"6": "iphone6", "7": "iphone7"}

    processed = []
    previous = None  # raw token seen just before the current one
    for token in tokens:
        if token == "iphone":
            pass  # context only, never emitted on its own
        elif token == "x" or token == "10":
            processed.append("iphoneX")
        elif token in model_tokens:
            if previous == "iphone":
                processed.append(model_tokens[token])
        elif token == "face":
            processed.append("faceid")
        elif token == "id":
            pass  # "face" already produced "faceid"; a lone "id" is dropped
        else:
            processed.append(token)
        previous = token
    return processed

In [27]:
# Overwrite the token lists with the product-specific rewrites from
# `custom_lemming`.
data["review_text"] = data["review_text"].transform(custom_lemming)

In [28]:
# TF-IDF: preview the final token lists before they are vectorized.
data.head()

Unnamed: 0,0,review_text
0,at&t,"[ive, apple, since, day, 1, 2007, iphoneX, far..."
1,at&t,"[go, matter, transfer, one, phone, another, ne..."
2,at&t,"[love, phone, upgrade, however, fragile, diffi..."
3,at&t,"[super, excite, get, phone, preordered, wait, ..."
4,at&t,"[hype, substance, facial, recognition, system,..."


In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
# TF-IDF vectorizer configuration.
# Note: the fitted vocabulary comes out lower-cased (the feature list below
# contains 'iphonex'), so the custom "iphoneX" token loses its capital letter.
tfidf = TfidfVectorizer(
    max_features=70,     # upper bound on the vocabulary size
    norm='l2',
    min_df=0.08,         # presumably document-frequency bounds given as
    max_df=0.7,          # fractions -- confirm against the TfidfVectorizer docs
    ngram_range=(1, 1),  # unigrams only
)

In [37]:
# The vectorizer is fed one string per review, so each token list is joined
# back into a space-separated string before fitting.
tfidf_matrix = tfidf.fit_transform(
    data["review_text"].transform(lambda tokens: ' '.join(tokens))
)

In [38]:
# List the vocabulary kept by the fitted vectorizer.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer its replacement when the installed version provides it and fall
# back to the old method otherwise. Either branch yields a plain list of str.
(
    tfidf.get_feature_names_out().tolist()
    if hasattr(tfidf, "get_feature_names_out")
    else tfidf.get_feature_names()
)

['amaze',
 'apple',
 'best',
 'better',
 'button',
 'buy',
 'camera',
 'easy',
 'faceid',
 'far',
 'fast',
 'feature',
 'get',
 'good',
 'great',
 'home',
 'iphonex',
 'like',
 'love',
 'much',
 'new',
 'one',
 'phone',
 'plus',
 'really',
 'recognition',
 'screen',
 'size',
 'take',
 'upgrade',
 'use',
 'work',
 'would']