In [1]:
import pandas as pd
import numpy as np
import gzip
from string import punctuation
import itertools
from nltk import tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from collections import Counter
from sklearn.linear_model import LogisticRegression

## Encontrando as palavras das reviews que mais influenciam em um rating 5

Variável independente: nota das reviews (coluna 'overall')

Variáveis explicativas: dummies que indicam a presença de uma determinada palavra no review

Modelo: regressão logística com regularização L1 (lasso)

Lendo a base de dados e preparando as variáveis

In [2]:
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient = 'index')

df = getDF('reviews_Video_Games_5.json.gz')

In [3]:
df['overall'].value_counts()

5.0    120185
4.0     54804
3.0     28275
1.0     14853
2.0     13663
Name: overall, dtype: int64

In [4]:
Y = list((df['overall'] == 5)*1)

Tokenizando o texto, colocando em caixa baixa, stemmizando e removendo stop words, termos de um caracter e termos que apareceram menos do que 1000 vezes.

In [5]:
textos = list(df['reviewText'])

In [6]:
sw = stopwords.words('english') + list(punctuation)

textos_limpos = []
c = Counter([])
for texto in textos:
    tlimpo = [token.lower() for token in tokenize.TweetTokenizer().tokenize(texto) if token.lower() not in sw]
    c.update(tlimpo)
    textos_limpos.append(tlimpo)

ps = PorterStemmer()
stemmed = []
for t in textos_limpos:
    txt = [ps.stem(token) for token in t]
    stemmed.append(txt)
stemmed = [[token for token in t if (c[token] >= 1000 and len(token) > 1)] for t in stemmed]

In [7]:
tokens = list(set(list(itertools.chain(*stemmed))))
tokens.sort()

In [8]:
len(tokens)

1647

In [9]:
tokens[0:5]

['):', '..', '...', '1/2', '10']

In [10]:
tokens[1000:1005]

['non', 'none', 'normal', 'nostalgia', 'notch']

In [11]:
X = np.empty((len(textos), len(tokens)))
for i in range(len(Y)):
    for j in range(len(tokens)):
        X[i,j] = int(tokens[j] in textos_limpos[i])

In [12]:
X = pd.DataFrame(X)
X.columns = tokens

In [13]:
X

Unnamed: 0,):,..,...,1/2,10,10/10,100,1080p,11,12,....1,yellow,yet,york,yoshi,young,younger,zelda,zero,zone,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231775,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
231776,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
231777,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
231778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Ajustando o modelo

In [14]:
log = LogisticRegression(penalty = 'l1', solver = 'liblinear')
log.fit(X, Y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
coef = list(log.coef_[0])

In [16]:
len(coef)

1647

In [17]:
coef = dict(zip(tokens, coef))

Tokens com coeficientes mais baixos

In [18]:
sorted(coef.items(), key = lambda kv: kv[1])[0:50]

[('4/5', -1.9787529768720864),
 ('7/10', -1.4568451903059414),
 ('rental', -1.3745113390729675),
 ('worst', -1.2731888610383109),
 ('bland', -1.1027474229519703),
 ('rent', -0.9668081577284912),
 ('awful', -0.9456740569138659),
 ('boring', -0.9427628453037045),
 ('clunky', -0.8341847121912132),
 ('buggy', -0.777008004290507),
 ('ok', -0.7567120808820359),
 ('8/', -0.7478569193646105),
 ('useless', -0.7247156715041881),
 ('joke', -0.7241622942940587),
 ('poor', -0.7238158442532153),
 ('lame', -0.7080586945243136),
 ('alright', -0.6771310168842104),
 ('junk', -0.660976318069857),
 ('drm', -0.6579290261807966),
 ('okay', -0.6548785960501883),
 ('ea', -0.6294294589777458),
 ('trash', -0.626319012234961),
 ('dull', -0.6205401245583786),
 ('decent', -0.6146201162144037),
 ('pointless', -0.608452298293219),
 ('stiff', -0.5848993924757864),
 ('wasted', -0.5739603958129289),
 ('ridiculous', -0.5622059390983636),
 ('stupid', -0.536673050793197),
 ('annoying', -0.523882590348062),
 ('calibur', -0

Tokens com coeficientes mais positivos

In [19]:
sorted(coef.items(), key = lambda kv: kv[1], reverse = True)[0:50]

[('10/10', 1.5194881503298003),
 ('5/5', 1.516972162043164),
 ('flawless', 0.7401875137611786),
 ('awesome', 0.7226555226654416),
 ('amazing', 0.6850899624153899),
 ('love', 0.6178837158539019),
 (':)', 0.6019304342772464),
 ('recommended', 0.5992773176902366),
 ('best', 0.5645274738351554),
 ('gem', 0.5596147066713312),
 ('thank', 0.5536801165753242),
 ('condition', 0.5218762010305177),
 ('fantastic', 0.5163729896936534),
 ('kratos', 0.4997941799127992),
 ('9/10', 0.4885959378456917),
 ('amazed', 0.483624049366226),
 ('dualshock', 0.48257983085839923),
 ('spoil', 0.4767390638570015),
 ('steal', 0.46244055027754716),
 ('excellent', 0.4586072904614638),
 ('blown', 0.4568137051183089),
 ('regret', 0.45287265522938625),
 ('glad', 0.4509005845128674),
 ('complain', 0.4445499611299497),
 ('happy', 0.4437427439946954),
 ('disappoint', 0.43860794465684866),
 ('owner', 0.43332706560228235),
 ('wonderful', 0.4323808581643295),
 ('perfect', 0.4295171049590854),
 ('tone', 0.4293566355587009),
 ('

Podemos ver que notas como 4/5 e 10/10 possuem coeficientes bastante expressivos, o que é de se esperar visto que elas tem uma ligação direta com a nossa variável resposta. 

Entre os termos com coeficientes baixos, temos palavras negativas como 'worst', 'awful' e 'boring', e palavras que indicam que o jogo é razoável, mas não digno da nota máxima, como 'ok' e 'decent'.

Por sua vez, os coeficientes mais altos são de palavras positivas como 'flawless', 'amazing' e 'best'.

Finalmente, em ambas as listas, temos termos que indicam produtores, personagens e produtores de jogos, por exemplo, 'ea', 'kratos' e 'persona'.