In [1]:
import pandas as pd
import numpy as np
import gzip
from string import punctuation
import itertools
from nltk import tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from collections import Counter
from sklearn.linear_model import LogisticRegression

## Encontrando as palavras das reviews que mais influenciam em um rating 5

Variável independente: dummy que indica se a review possui nota 5 

Variáveis explicativas: dummies que indicam a presença de uma determinada palavra no review

Modelo: regressão logística com regularização L1 (lasso)

Lendo a base de dados e preparando as variáveis

In [2]:
def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield eval(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient = 'index')

df = getDF('reviews_Video_Games_5.json.gz')

In [3]:
df['overall'].value_counts()

5.0    120185
4.0     54804
3.0     28275
1.0     14853
2.0     13663
Name: overall, dtype: int64

In [4]:
Y = list((df['overall'] == 5)*1)

Tokenizando o texto, colocando em caixa baixa, stemmizando e removendo stop words, termos de um caracter e termos que apareceram menos do que 1000 vezes.

In [5]:
textos = list(df['reviewText'])

In [6]:
sw = stopwords.words('english') + list(punctuation) + ["..", "...", "....", "....."]

textos_limpos = []
c = Counter([])
for texto in textos:
    tlimpo = [token.lower() for token in tokenize.TweetTokenizer().tokenize(texto) if token.lower() not in sw]
    c.update(tlimpo)
    textos_limpos.append(tlimpo)

ps = PorterStemmer()
stemmed = []
for t in textos_limpos:
    txt = [ps.stem(token) for token in t]
    stemmed.append(txt)
stemmed = [[token for token in t if (c[token] >= 1000 and len(token) > 1)] for t in stemmed]

In [7]:
tokens = list(set(list(itertools.chain(*stemmed))))
tokens.sort()

In [8]:
len(tokens)

1645

In [9]:
tokens[0:5]

['):', '1/2', '10', '10/10', '100']

In [10]:
tokens[1000:1005]

['normal', 'nostalgia', 'notch', 'note', 'nothing']

In [11]:
X = np.empty((len(textos), len(tokens)))
for i in range(len(Y)):
    for j in range(len(tokens)):
        X[i,j] = int(tokens[j] in textos_limpos[i])

In [12]:
X = pd.DataFrame(X)
X.columns = tokens

In [13]:
X

Unnamed: 0,):,1/2,10,10/10,100,1080p,11,12,13,14,...,yellow,yet,york,yoshi,young,younger,zelda,zero,zone,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231775,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
231776,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
231777,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
231778,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Ajustando o modelo

In [14]:
log = LogisticRegression(penalty = 'l1', solver = 'liblinear')
log.fit(X, Y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
coef = list(log.coef_[0])

In [16]:
len(coef)

1645

In [17]:
coef = dict(zip(tokens, coef))

Tokens com coeficientes mais baixos

In [18]:
sorted(coef.items(), key = lambda kv: kv[1])[0:50]

[('4/5', -1.974177135394126),
 ('7/10', -1.462132249148862),
 ('rental', -1.3712480762488093),
 ('worst', -1.2737437631198123),
 ('bland', -1.1033235703705846),
 ('rent', -0.9706287050399959),
 ('awful', -0.9478319578450632),
 ('boring', -0.944272664465369),
 ('clunky', -0.832135780744776),
 ('buggy', -0.7784969276198399),
 ('ok', -0.7631213539875346),
 ('8/', -0.751479566435332),
 ('joke', -0.7310879091811244),
 ('poor', -0.7266647830774541),
 ('useless', -0.7260689280614793),
 ('lame', -0.7132510085307414),
 ('alright', -0.6822696599942707),
 ('junk', -0.6646420926485762),
 ('okay', -0.6626701912621137),
 ('drm', -0.6602903118391037),
 ('ea', -0.6325129385857988),
 ('trash', -0.629327220882714),
 ('dull', -0.6215319083718879),
 ('decent', -0.6145253410948606),
 ('pointless', -0.6140753289772608),
 ('stiff', -0.5806791866560792),
 ('wasted', -0.5752851118808294),
 ('ridiculous', -0.5646659532835848),
 ('stupid', -0.5412650047566439),
 ('annoying', -0.5244490806361439),
 ('calibur', -0

Tokens com coeficientes mais positivos

In [19]:
sorted(coef.items(), key = lambda kv: kv[1], reverse = True)[0:50]

[('10/10', 1.5189565421395879),
 ('5/5', 1.51308541491324),
 ('flawless', 0.7355371462447637),
 ('awesome', 0.718963140720524),
 ('amazing', 0.6837222632442596),
 ('love', 0.616825036301217),
 ('recommended', 0.5989982234939347),
 (':)', 0.5889964310780436),
 ('best', 0.5660456036299373),
 ('gem', 0.5534201097477942),
 ('thank', 0.5503053539962277),
 ('condition', 0.5267273964259365),
 ('fantastic', 0.5147945609544784),
 ('kratos', 0.49918715642336997),
 ('9/10', 0.48833278400648356),
 ('dualshock', 0.48187745885555205),
 ('amazed', 0.48035650706943434),
 ('spoil', 0.4779269920458842),
 ('excellent', 0.4600727760779386),
 ('steal', 0.45942549975621766),
 ('blown', 0.4566398289545396),
 ('regret', 0.45258372436900096),
 ('glad', 0.45251118869183127),
 ('complain', 0.4466730615878038),
 ('happy', 0.44539208694087096),
 ('disappoint', 0.43767301299701145),
 ('owner', 0.4330328791716981),
 ('tone', 0.43286123009239635),
 ('wonderful', 0.4325025535764835),
 ('perfect', 0.42874615787249887),

Podemos ver que notas como 4/5 e 10/10 possuem coeficientes bastante expressivos, o que é de se esperar visto que elas tem uma ligação direta com a nossa variável resposta. 

Entre os termos com coeficientes baixos, temos palavras negativas como 'worst', 'awful' e 'boring', e palavras que indicam que o jogo é razoável, mas não digno da nota máxima, como 'ok' e 'decent'.

Por sua vez, os coeficientes mais altos são de palavras positivas como 'flawless', 'amazing' e 'best'.

Finalmente, em ambas as listas, temos termos que indicam produtores, personagens e produtores de jogos, por exemplo, 'ea', 'kratos' e 'persona'.