In [14]:
import pandas as pd
import numpy as np
import gzip
from string import punctuation

import itertools

from nltk import tokenize
from sklearn.feature_extraction.text import CountVectorizer

from nltk.corpus import stopwords
from collections import Counter

from sklearn.linear_model import LogisticRegression

## Encontrando as palavras das reviews que mais influenciam em um rating 5

Variável dependente (resposta): indicador de que a nota da review (coluna 'overall') é igual a 5

Variáveis explicativas: dummies que indicam a presença de uma determinada palavra no review

Modelo: regressão logística com regularização L1 (lasso)

Lendo a base de dados e preparando as variáveis

In [2]:
def parse(path):
    """Lazily yield one record per line of a gzip-compressed file.

    Each line is evaluated as a Python expression and the resulting object
    is yielded, in file order.

    Parameters
    ----------
    path : str
        Path to the gzip-compressed file; it is opened in binary mode.

    Yields
    ------
    object
        The value each line evaluates to.
    """
    # The context manager closes the handle once the generator is exhausted
    # or closed, instead of leaving it open for the life of the kernel.
    with gzip.open(path, 'rb') as g:
        for l in g:
            # SECURITY(review): eval() runs whatever code the file contains.
            # Only use this on trusted files; if every line is strict JSON,
            # json.loads would be the safe replacement.
            yield eval(l)

def getDF(path):
    """Collect every record yielded by ``parse(path)`` into a DataFrame.

    Records are keyed by their 0-based position in the file; that position
    becomes the row index and the record's keys become the columns.
    """
    records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(records, orient='index')

# Load the reviews file into a DataFrame; the path is relative to the
# notebook's working directory.
df = getDF('reviews_Video_Games_5.json.gz')

In [3]:
# Number of reviews for each distinct value of the 'overall' column.
df['overall'].value_counts()

5.0    120185
4.0     54804
3.0     28275
1.0     14853
2.0     13663
Name: overall, dtype: int64

In [4]:
# Binary target: 1 where 'overall' equals 5, 0 otherwise
# (the boolean mask is multiplied by 1 to turn it into integers).
Y = list((df['overall'] == 5)*1)

Tokenizando o texto, colocando em caixa baixa, removendo stop words, sinais de pontuação, termos de um caractere e termos que apareceram menos de 1000 vezes.

In [5]:
# Review texts as a plain list, in the same row order as df.
textos = list(df['reviewText'])

In [6]:
# Tokens to discard: English stop words plus every single punctuation character.
sw = stopwords.words('english') + list(punctuation)
# Set copy used for the membership test below (constant-time lookup);
# `sw` itself is kept as a list.
sw_lookup = set(sw)

# Minimum corpus-wide count a token needs in order to be kept.
MIN_TOKEN_COUNT = 1000

# A single tokenizer is reused for all reviews instead of building a new
# instance on every iteration.
tweet_tokenizer = tokenize.TweetTokenizer()

textos_limpos = []
c = Counter()
for texto in textos:
    # Lower-case each token once, then drop stop words and punctuation.
    lowered = (token.lower() for token in tweet_tokenizer.tokenize(texto))
    tlimpo = [token for token in lowered if token not in sw_lookup]
    c.update(tlimpo)
    textos_limpos.append(tlimpo)

# Second pass, which needs the final corpus-wide counts in `c`: keep only
# tokens seen at least MIN_TOKEN_COUNT times and longer than one character.
textos_limpos = [
    [token for token in t if c[token] >= MIN_TOKEN_COUNT and len(token) > 1]
    for t in textos_limpos
]

In [7]:
# Vocabulary: the distinct tokens that survived the filtering, in sorted order.
tokens = sorted(set(itertools.chain.from_iterable(textos_limpos)))

In [8]:
# Vocabulary size, i.e. the number of explanatory variables.
len(tokens)

3078

In [9]:
# Peek at the first five tokens of the sorted vocabulary.
tokens[0:5]

['):', '..', '...', '1/2', '10']

In [10]:
# Peek at five tokens from further into the sorted vocabulary.
tokens[1000:1005]

['failed', 'fails', 'failure', 'fair', 'fairly']

In [11]:
# Binary presence matrix: X[i, j] is 1.0 when tokens[j] occurs in review i
# and 0.0 otherwise.
#
# Each vocabulary token is mapped to its column once, and only the tokens
# actually present in a review are visited. The previous version tested every
# vocabulary token against every review's token list, which is far more work
# for the same result.
token_col = {token: j for j, token in enumerate(tokens)}

# Start from zeros so that only the present tokens need to be written.
# textos_limpos has one entry per element of textos, so the row count is unchanged.
X = np.zeros((len(textos_limpos), len(tokens)))
for i, texto_limpo in enumerate(textos_limpos):
    # set() so a token repeated within a review is written only once.
    for token in set(texto_limpo):
        X[i, token_col[token]] = 1

In [12]:
# Wrap the matrix in a DataFrame whose columns are labelled with the tokens.
X = pd.DataFrame(X, columns=tokens)

In [13]:
# Display the presence matrix (the stored output below is a truncated view).
X

Unnamed: 0,):,..,...,1/2,10,10/10,100,1080p,11,12,....1,young,younger,youtube,zelda,zero,zombie,zombies,zone,zones,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231775,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
231776,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
231777,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
231778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Ajustando o modelo

In [16]:
# Logistic regression with an L1 (lasso) penalty, fitted with the liblinear solver.
# random_state is fixed so that re-running the notebook reproduces the same fit;
# it was previously left unset.
log = LogisticRegression(penalty = 'l1', solver = 'liblinear', random_state = 0)
log.fit(X, Y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [22]:
# First row of the fitted coefficient array, as a list.
coef = list(log.coef_[0])

In [23]:
# Sanity check: the number of coefficients should equal the vocabulary size.
len(coef)

3078

In [25]:
# Map each token to its fitted coefficient.
# The values are read straight from the model rather than from the previous
# value of `coef`, so re-running this cell always gives the same dict
# (zipping against an already-converted `coef` would pair tokens with tokens).
coef = dict(zip(tokens, log.coef_[0]))

Tokens com coeficientes mais baixos

In [30]:
# The 50 (token, coefficient) pairs with the lowest coefficients, ascending.
sorted(coef.items(), key = lambda kv: kv[1])[0:50]

[('4/5', -2.0352190480018817),
 ('rental', -1.1996016088343748),
 ('7/10', -1.1945264247597138),
 ('worst', -1.1242882318229235),
 ('bland', -0.9777371384062946),
 ('lacks', -0.9212397708927108),
 ('unplayable', -0.9178912445339571),
 ('disappointment', -0.8769535254826584),
 ('terrible', -0.8744865349254697),
 ('disappointing', -0.8672980834945437),
 ('rent', -0.8574893169943194),
 ('garbage', -0.8383851703629718),
 ('8/', -0.8305424179003406),
 ('repetitive', -0.8257428324980286),
 ('waste', -0.8021515306377814),
 ('boring', -0.8007276711030619),
 ('horrible', -0.7838039963482352),
 ('suffers', -0.7555236134824114),
 ('freezes', -0.7462730568752874),
 ('unfortunately', -0.7405755198770786),
 ('awful', -0.7391402922349376),
 ('fails', -0.726361773122868),
 ('ok', -0.7197649371340991),
 ('clunky', -0.7146752911879642),
 ('worse', -0.691444700337098),
 ('joke', -0.6777047229576956),
 ('hopes', -0.6726035623973531),
 ('lame', -0.6674861263375449),
 ('falls', -0.656999924760497),
 ('renti

Tokens com coeficientes mais positivos

In [31]:
# The 50 (token, coefficient) pairs with the highest coefficients, descending.
sorted(coef.items(), key = lambda kv: kv[1], reverse = True)[0:50]

[('5/5', 1.5096904783899738),
 ('10/10', 1.5015134719635013),
 ('masterpiece', 0.7981346585281509),
 ('skeptical', 0.7890887024267284),
 ('worried', 0.7424494537783783),
 ('penny', 0.717955061487637),
 ('flawless', 0.7034399287812274),
 ('awesome', 0.6928314271850846),
 ('amazing', 0.636662998072325),
 ('highly', 0.6340804886922735),
 ('awsome', 0.6241384039407908),
 ('loves', 0.6156318668238289),
 ('loving', 0.605872139166777),
 ('love', 0.5823002000299015),
 ('thank', 0.566171254574624),
 ('complaining', 0.546448186638365),
 ('best', 0.5434448839453128),
 ('incredible', 0.5364496660245935),
 (':)', 0.5324037213674148),
 ('addicting', 0.5275517294598525),
 ('hooked', 0.5014151443903112),
 ('sturdy', 0.4990698644574939),
 ('fantastic', 0.49782654915719854),
 ('addictive', 0.49602065862709754),
 ('amazingly', 0.4927751870160977),
 ('favorite', 0.4859086581977506),
 ('rocks', 0.4802121504095994),
 ('disappoint', 0.4738175394107412),
 ('kratos', 0.47222600434769507),
 ('gem', 0.4702845746

Podemos ver que notas como 5/5 e 7/10 possuem coeficientes extremamente altos em valor absoluto (positivo para 5/5, negativo para 7/10), o que é esperado, visto que essas notas são basicamente a nossa variável resposta. Entre os tokens com coeficientes mais negativos, temos palavras negativas como 'worst', 'terrible', 'disappointment' e 'horrible', e palavras que sugerem que o produto é razoável, mas não digno da nota máxima, como 'alright', 'ok' e 'decent'.

Dentre os tokens com maiores coeficientes, temos palavras positivas como 'masterpiece', 'amazing', 'flawless', 'best' e 'awesome' e palavras que provavelmente sugerem o sentimento do usuário antes da compra do produto: 'skeptical' e 'worried'.