## Datacamp CapGemini
#### Group 7

In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm
tqdm.pandas()

In [2]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

### Import data

In [6]:
raw_data = pd.read_csv('../data/data_scraping_V2.csv', engine='python')

In [7]:
# Remove ill-formated lines
raw_data = raw_data[raw_data.text.notnull()]
# Remove empty lines
mask = raw_data["text"].map(lambda x: x.strip() == '')
raw_data = raw_data.loc[~mask]

In [8]:
raw_data.source.value_counts()

bestbuy            16598
youtube             8816
reddit              8432
gsm arena           5843
twitter             4996
amazon               835
flipkart             446
at&t                 411
influenster          274
google shopping      153
Name: source, dtype: int64

In [9]:
# Remove youtube and twitter
excluded_sources = ["youtube", "twitter"]
data = raw_data[raw_data["source"].map(lambda x: x not in excluded_sources)].copy()

### Processing

#### First steps

In [10]:
# Remove ponctuation
matrix = str.maketrans(",\"_;", "    ", "'’.()/-?!|:><&[]*=@%^")
data["text"] = data["text"].transform(lambda x: x.translate(matrix))

In [11]:
# Remove bad characters
data["text"] = data["text"].transform(lambda text: ''.join([x for x in text if ord(x)<128]))

In [12]:
# Regex-based processing
import regex

# Remove hashtags
data["text"] = data["text"].map(lambda x: regex.sub('#[a-zA-Z0-9-]*', '', x))
# Remove number only strings
numbers = regex.compile('^[0-9 ]+$')
mask = data["text"].map(lambda x: not numbers.match(x))
data = data[mask]

ModuleNotFoundError: No module named 'regex'

In [None]:
# Remove empty lines
mask = data["text"].map(lambda x: x.strip() == '')
data = data.loc[~mask]

In [13]:
# Detect language
import langdetect
def detect_lang(x):
    try: 
        return langdetect.detect(x)
    except Exception as e:
        return None

data["lang"] = data["text"].progress_map(detect_lang)

ModuleNotFoundError: No module named 'langdetect'

In [303]:
# Remove reviews for which lang detect failed
data = data[~data["lang"].isnull()]
# Only keep english comments
data = data[data["lang"] == "en"]

#### Tokenize

In [304]:
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

tweet = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
stopwords = set(stopwords.words('english'))

# Tokenize
data["text"] = data["text"].transform(tweet.tokenize)

In [305]:
# Remove stopwords
data["text"] = data["text"].transform(lambda x: [token for token in x if not token in stopwords])

In [306]:
# Remove tokens with only numbers
numbers = regex.compile('^[0-9]{3,}$')
data["text"] = data["text"].map(lambda x: [token for token in x if not numbers.match(token)])

In [307]:
# Remove short lines
data = data[data["text"].apply(lambda x: len(x) > 5)]

In [308]:
# Checkpoint
data.groupby("source").head(10)

Unnamed: 0,source,text,lang
0,amazon,"[love, s8, awesome, screen, takes, great, pict...",en
1,reddit,"[mean, dont, think, thats, especially, healthy...",en
2,bestbuy,"[would, appear, open, box, like, new, samsung,...",en
7,bestbuy,"[though, iphones, beginning, feel, like, next,...",en
8,bestbuy,"[love, samsung, s8, edge, sleek, easy, use]",en
9,bestbuy,"[fast, smooth, silk, internet, browsing, best,...",en
11,reddit,"[apple, 12w, charger, ipads, charges, little, ...",en
13,bestbuy,"[long, time, ios, user, switched, take, advant...",en
14,reddit,"[dammit, cant, see, nose, shakes, fist]",en
15,bestbuy,"[easy, use, fastest, phone, thus, far]",en


### Stemming

##### First attempt

In [17]:
from nltk.stem.snowball import SnowballStemmer

In [18]:
stemmer = SnowballStemmer('english')

def stemming(tokens):
    excluded = set(['iphone'])
    return [stemmer.stem(token) if token not in excluded else token for token in tokens]

In [19]:
data["review_text"].transform(stemming)

0       [ive, appl, sinc, day, 1, 2007, x, far, best, ...
1       [went, 6, matter, transfer, one, phone, anoth,...
2       [love, phone, upgrad, howev, fragil, difficult...
3       [super, excit, get, phone, preorder, wait, anx...
4       [hype, substanc, facial, recognit, system, nic...
5       [saw, first, ad, iphone, x, like, want, phone,...
6       [cant, understand, negat, review, own, everi, ...
7       [pick, x, releas, date, worri, home, button, w...
8       [durabl, glass, ever, laugh, attach, pic, ipho...
9       [within, hour, get, phone, set, saw, there, 8m...
10      [appl, done, 4k, 60fps, 4k, 24fps, one, even, ...
11      [like, iphon, ive, own, phone, function, perfe...
12      [respond, comment, earlier, post, iphone, x, s...
13      [surf, internet, realli, truli, fast, almost, ...
14      [admit, scare, order, phone, went, far, make, ...
15      [hate, new, phone, need, remodel, everyth, thi...
16      [love, screen, clariti, im, hard, time, let, g...
17      [ive, 

### Lemmatization

In [309]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [310]:
def lemming(tokens):
    return [lemmatizer.lemmatize(token, pos="v") for token in tokens]

In [311]:
data["text"] = data["text"].transform(lemming)

In [312]:
data.groupby("source").head(10)

Unnamed: 0,source,text,lang
0,amazon,"[love, s8, awesome, screen, take, great, pictu...",en
1,reddit,"[mean, dont, think, thats, especially, healthy...",en
2,bestbuy,"[would, appear, open, box, like, new, samsung,...",en
7,bestbuy,"[though, iphones, begin, feel, like, next, cal...",en
8,bestbuy,"[love, samsung, s8, edge, sleek, easy, use]",en
9,bestbuy,"[fast, smooth, silk, internet, browse, best, i...",en
11,reddit,"[apple, 12w, charger, ipads, charge, little, b...",en
13,bestbuy,"[long, time, ios, user, switch, take, advantag...",en
14,reddit,"[dammit, cant, see, nose, shake, fist]",en
15,bestbuy,"[easy, use, fastest, phone, thus, far]",en


### Token cleaning

In [313]:
data["text"].sample(50).map(lambda x: print(" ".join(x)))

phone perfect size fit well hand screen amaze look
like iphone 8 seem like update frequently iphone past
im talk monetary term apple much larger team dedicate hardware ios
dude apples a11 bionic faster better sd835 faster even sd410 run better optimization os ios much lighter better optimize android thats need much power put android iphone work first android device put ios android device time faster
battery capacity less years s7 edge shall pass 3500mah still okay less would undesirable
best iphone date fast clear picture quickly adapt home button
go iphone 7 iphone 8 plus better upgrade expect speed hard beat love bigger screen
samsung galaxy s8 best phone market
didnt think larger screen would make much difference really like gold cover definitely like bigger screen
amaze unlock phone much faster previous model love edge perfect size small hand
products screen shape well accommodate attachment screen saver
fade work like magic guestures world experience n smooth best phone period
out

29696    None
5041     None
37662    None
5438     None
18513    None
2644     None
33048    None
9844     None
10402    None
22467    None
16543    None
27628    None
14318    None
41162    None
27554    None
8294     None
27646    None
27614    None
18064    None
38256    None
46289    None
37217    None
23463    None
35001    None
21676    None
10243    None
36375    None
43958    None
34131    None
45664    None
14611    None
20205    None
7094     None
17839    None
30739    None
36556    None
42336    None
34689    None
5015     None
35041    None
15182    None
29376    None
27853    None
30786    None
46049    None
191      None
32177    None
9074     None
29766    None
42509    None
Name: text, dtype: object

In [314]:
# Custom tokens
def custom_lemming(tokens):
    processed = []
    extend = processed.extend
    length = len(tokens)
    
    for i, token in enumerate(tokens):
        # iPhones
        if token == "x" or token == "10":
            result = ["10"]
            if i>0 and tokens[i-1] != "iphone":
                result.insert(0, "iphone")
            extend(result)
            continue
        if token in ["6", "7", "8"]:
            result = [token]
            if i>0 and tokens[i-1] != "iphone":
                result.insert(0, "iphone")
            extend(result)
            continue
        if token == "+":
            extend(["plus"])
        extend([token])
        
        # Samsung
    return processed

In [315]:
data["text"] = data["text"].transform(custom_lemming)

In [316]:
# Get bigrams
from gensim.models.phrases import Phrases, Phraser

phrases = Phrases(data["text"].values.tolist())
bigram = Phraser(phrases)

data["bigrams"] = list(bigram[data["text"].values.tolist()])

In [317]:
data["bigrams"].sample(50)

14083    [like, phone, operations, quick, reach, button...
27082    [dont_know, number, 9, skip, phone, great, siz...
35400    [camera_outstanding, speed, devise, unbelievab...
7718     [great, phone, much, feature, explore, upgrade...
32936        [decide, switch, iphone, best, decision, yet]
34625    [research, decide, hold, get, s7, wait, s8, gl...
31535    [upgrade_6s, iphone, 8P_lus, cant_believe, nic...
32332    [love, new, iphone_8, plus, speaker, immensely...
11510    [might, want, try, android, oreo, beta, s8, iv...
37927    [love, new, s8, much, s6, purchase, s8, new, s...
30732    [wish_could, change, alexa, , voice, speech, ...
40970    [want, say, sorry, long, post, hit, thumb, lik...
23618    [experience, iphone_8, plus_+, indoors, look, ...
17772         [well, maybe, os, definitely, build_quality]
20855    [haha, dear, brother, samsung, dont, expect, s...
45709    [choice, consider, new, replacement, phone, go...
29587    [par, iphone_7, plus, find, easily, better, sm.

### TF IDF

In [318]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=70, norm='l2', min_df=2, max_df=0.8, ngram_range=(1, 3))

In [319]:
tfidf_matrix = tfidf.fit_transform(data["bigrams"].transform(lambda x: ' '.join(x)).tolist())

In [320]:
print("Features : {}".format(", ".join(tfidf.get_feature_names())))

Features : also, amaze, android, apple, apps, awesome, battery, battery_life, best, better, buy, camera, case, come, display, dont, even, far, fast, feature, feel, first, get, go, good, great, great phone, im, iphone, iphone_10, iphone_6, iphone_7, iphone_8, issue, ive, know, like, look, lot, love, make, much, need, new, nice, one, people, phone, picture, plus, plus_, really, s8, samsung, say, screen, see, size, still, take, think, time, unlock, upgrade, use, want, way, well, work, would


In [321]:
pd.DataFrame(tfidf_matrix.todense(), columns=tfidf.get_feature_names()).replace(0, '')

Unnamed: 0,also,amaze,android,apple,apps,awesome,battery,battery_life,best,better,...,think,time,unlock,upgrade,use,want,way,well,work,would
0,,,,,,0.450657,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,0.466111,,,,,,0.533897,,,
2,,,,,,,,,,,...,,,,,,,,,,0.552658
3,,,,,,,,,,0.450003,...,,,,0.146241,,,,0.169136,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,0.756505,,...,,,,,,,,,,
6,0.470816,,,0.402651,,,,,,,...,,,,,,,,,,
7,,,0.41919,,,,,,,,...,,,,,0.281723,,,,,
8,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,


### NMF

In [322]:
from sklearn.decomposition import NMF

In [323]:
nmf = NMF(n_components=10, alpha=.1, l1_ratio=.5).fit(tfidf_matrix)

In [324]:
# tdidf = T * H
# H maps documents (articles) into new dimensions (in the case of NMF, we can interpret these as topics)
# W maps words to new dimensions
T = nmf.fit_transform(tfidf_matrix)
W = nmf.components_

In [325]:
pd.DataFrame(W)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,60,61,62,63,64,65,66,67,68,69
0,0.260787,0.759964,0.215642,0.0,0.145636,0.730639,0.187201,0.287669,1.39109,0.13381,...,0.0,0.376792,0.584909,0.368337,0.0,0.325144,0.216515,0.239511,0.2333,0.222646
1,0.215223,0.203417,0.0,0.0,0.107828,0.033584,0.163626,0.244158,0.0,0.7498,...,0.299096,0.206679,0.0,2.295007,0.0,0.474754,0.255062,0.246843,0.16223,0.359828
2,0.0,0.770951,0.0,0.0,0.018209,0.473762,0.0,0.03667,0.0,0.0,...,0.0,0.0,0.0,0.585754,0.0,0.0,0.0,0.0,0.0,0.0
3,0.006032,0.241702,0.0,0.0,0.143013,0.45416,0.129151,0.869574,0.0,0.211145,...,0.0,0.0,0.0,0.43786,0.0,0.0,0.0,0.106592,1.182631,0.0
4,0.494124,0.042885,0.194296,0.0,0.374702,0.0,0.245588,0.027839,0.0,0.353956,...,0.40875,0.948576,0.385066,0.0,8.339194,0.505394,0.356447,0.366488,0.760844,0.461888
5,0.075356,0.287429,0.94085,0.0,0.071428,0.134262,0.009018,0.118604,4.199089,0.318572,...,0.047018,0.150973,0.0,0.437337,0.0,0.105877,0.10936,0.0,0.124653,0.0
6,0.336632,0.0,0.363512,0.0,0.066265,0.0,0.702335,0.268477,0.379531,1.002799,...,0.522386,0.290021,0.03506,0.365173,0.0,0.297463,0.209445,0.201773,0.0,0.124794
7,0.339483,0.984454,0.0,0.0,0.212451,0.677067,0.221023,0.460662,0.0,0.174057,...,0.102461,0.27981,0.110762,0.0,0.0,0.086926,0.11902,0.26728,0.0,0.119604
8,0.864387,0.0,0.344154,0.0,0.353654,0.0,0.29516,0.225375,0.0,0.817186,...,1.39927,0.494444,0.254667,0.0,0.0,0.452154,0.382056,0.502389,0.995526,1.729318
9,0.089582,0.0,0.681455,10.344653,0.098907,0.0,0.0,0.0,0.0,0.169382,...,0.605522,0.374249,0.024229,0.0,0.0,0.302808,0.229496,0.315316,0.321978,0.592601


In [326]:
top = 10
features = tfidf.get_feature_names()

for i, dimension in enumerate(W):
    print("Topic #{}".format(i+1))
    feature_indexes = dimension.argsort()[:-top:-1]
    print("Words : {}".format(", ".join([features[i] for i in feature_indexes])))

Topic #1
Words : phone, best, buy, good, amaze, awesome, nice, far, unlock
Topic #2
Words : iphone_10, iphone_8, plus, upgrade, iphone_7, iphone_6, go, plus_, size
Topic #3
Words : love, new, feature, camera, amaze, upgrade, picture, awesome, size
Topic #4
Words : great, camera, great phone, work, feature, battery_life, picture, fast, take
Topic #5
Words : get, use, take, one, time, go, work, still, even
Topic #6
Words : iphone, best, one, android, far, new, upgrade, first, better
Topic #7
Words : samsung, s8, buy, better, go, one, plus_, battery, think
Topic #8
Words : screen, size, amaze, camera, nice, awesome, plus, look, battery_life
Topic #9
Words : like, really, good, look, dont, would, think, feature, make
Topic #10
Words : apple, make, say, android, people, think, would, even, im


### LDA

In [331]:
from gensim import models, corpora

In [334]:
# Create corpora dictionary
tokens_dict = corpora.Dictionary(data["bigrams"].values.tolist())
print(tokens_dict)

Dictionary(20891 unique tokens: ['awesome', 'great', 'love', 'picture', 's8']...)


In [336]:
# Filter extremes
tokens_dict.filter_extremes(no_below=3, no_above=0.7)
print(tokens_dict)

Dictionary(8080 unique tokens: ['awesome', 'great', 'love', 'picture', 's8']...)


In [341]:
# Create corpus
corpus = [tokens_dict.doc2bow(review) for review in data["text"].values.tolist()]
print(corpus[:3])

[[(0, 2), (1, 2), (2, 2), (3, 2), (4, 2), (5, 2), (6, 2)], [(7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1)], [(24, 1), (26, 1), (28, 1), (29, 1), (30, 1), (32, 1), (33, 1), (35, 1), (36, 1), (37, 1), (38, 2), (40, 1), (73, 1), (75, 1), (79, 1), (156, 1), (507, 1), (767, 1), (1258, 1), (2017, 1), (2588, 1), (4023, 1)]]


In [342]:
# Run the LDA (computation time should be between 5 to 60 seconds)

# choose the number of topics => to find a "good" number of topics, try multiple values and see which one is the best
# optionally: input alpha and eta to influence how topics are distributed across documents, 
#  and how words are distributed across topics
#  the syntax is the following
#  alpha is a vector of size the number of documents, and eta's size is the number of words
#  alpha = [0.01] * id2word_newspaper.num_docs for instance
#  eta = [0.01] * len(id2word_newspaper.keys())

num_topics = 40

# Below without alpha nor eta
%time lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=tokens_dict, passes=4)

# Below with alpha and eta
# %time lda_model = models.LdaModel(corpus, num_topics=num_topics, id2word=id2word_newspaper, passes=4, 
#                                   alpha=[0.01] * id2word_newspaper.num_docs, eta = [0.01] * len(id2word_newspaper.keys()))

CPU times: user 2min 46s, sys: 797 ms, total: 2min 47s
Wall time: 2min 48s


In [351]:
lda_model.show_topics(num_topics=num_topics, num_words=8, formatted=True)

[(0,
  '0.058*"play" + 0.054*"game" + 0.053*"color" + 0.035*"choice" + 0.030*"black" + 0.023*"many" + 0.016*"negative" + 0.015*"brightness"'),
 (1,
  '0.182*"battery" + 0.101*"life" + 0.053*"phone" + 0.052*"last" + 0.028*"day" + 0.025*"long" + 0.024*"better" + 0.017*"longer"'),
 (2,
  '0.045*"phone" + 0.038*"use" + 0.028*"set" + 0.026*"time" + 0.026*"like" + 0.019*"close" + 0.019*"turn" + 0.018*"get"'),
 (3,
  '0.076*"phone" + 0.042*"call" + 0.033*"work" + 0.030*"verizon" + 0.025*"unlock" + 0.019*"card" + 0.018*"sim" + 0.015*"carrier"'),
 (4,
  '0.044*"dont" + 0.036*"switch" + 0.032*"im" + 0.028*"u" + 0.026*"say" + 0.023*"like" + 0.023*"apple" + 0.023*"back"'),
 (5,
  '0.226*"good" + 0.088*"phone" + 0.058*"best" + 0.052*"far" + 0.035*"buy" + 0.032*"thank" + 0.025*"really" + 0.023*"market"'),
 (6,
  '0.155*"\x19" + 0.028*"people" + 0.020*"like" + 0.020*"say" + 0.016*"think" + 0.015*"get" + 0.014*"something" + 0.014*"go"'),
 (7,
  '0.144*"iphone" + 0.060*"much" + 0.057*"better" + 0.043*"

### Sentiment analysis

In [328]:
from textblob import TextBlob

In [329]:
def sentiment(text):
    try:
        return TextBlob(text).sentiment.polarity
    except:
        return None

data["sentiment"] = raw_data["text"].map(sentiment)

In [330]:
data.head()

Unnamed: 0,source,text,lang,bigrams,sentiment
0,amazon,"[love, s8, awesome, screen, take, great, pictu...",en,"[love, s8, awesome, screen, take, great, pictu...",0.875
1,reddit,"[mean, dont, think, thats, especially, healthy...",en,"[mean, dont, think, thats, especially, healthy...",0.2975
2,bestbuy,"[would, appear, open, box, like, new, samsung,...",en,"[would, appear, open_box, like, new, samsung_g...",0.377727
7,bestbuy,"[though, iphones, begin, feel, like, next, cal...",en,"[though, iphones, begin, feel, like, next, cal...",0.236538
8,bestbuy,"[love, samsung, s8, edge, sleek, easy, use]",en,"[love, samsung, s8, edge, sleek, easy_use]",0.466667
