In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, InputLayer
import nltk
import re
import random as rnd
from numpy.random import seed
from tensorflow import set_random_seed
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import lightgbm as lgb
import xgboost as xgb

RE_SPACES = re.compile("\s+")
RE_HASHTAG = re.compile("[@#][_a-z0-9]+")
RE_EMOTICONS = re.compile("(:-?\))|(:p)|(:d+)|(:-?\()|(:/)|(;-?\))|(<3)|(=\))|(\)-?:)|(:'\()|(8\))|(:-?D)")
RE_HTTP = re.compile("http(s)?://[/\.a-z0-9]+")
RE_NEGATOIN = re.compile("n't")

negations = set(["no", "not", "nope", "nah", "none", "zero"])
very = set(["very", "much", "lot", "many"])

Using TensorFlow backend.


In [4]:
class Tokenizer():
    @staticmethod
    def tokenize(text):
        pass
    
class BeforeTokenizationNormalizer():
    @staticmethod
    def normalize(text):
        if text != text: return ""
        text = text.strip().lower()
        text = text.replace('&nbsp;', ' ')
        text = text.replace('&lt;', '<')
        text = text.replace('&gt;', '>')
        text = text.replace('&amp;', '&')
        text = text.replace('&pound;', u'£')
        text = text.replace('&euro;', u'€')
        text = text.replace('&copy;', u'©')
        text = text.replace('&reg;', u'®')
        text = text.replace('.', ' ')
        text = text.replace(',', ' ')
        text = text.replace('!', ' ')
        text = text.replace('?', ' ')        
        return text
    
class SimpleTokenizer(Tokenizer):
    @staticmethod
    def tokenize(text):
       return text.split()

class NltkTokenizer(Tokenizer):
    @staticmethod
    def tokenize(text):
        return nltk.word_tokenize(text)

    
class TweetTokenizer(Tokenizer):
    @staticmethod
    def tokenize(text):
        tokens = SimpleTokenizer.tokenize(text)
        porter = nltk.PorterStemmer()
        i = 0
        while i < len(tokens):
            token = tokens[i]
            #propagacja negacji na dwa kolejne wyrazy
            if token in negations or RE_NEGATOIN.search(token) is not None:
                if i+1 < len(tokens):
                    tokens[i+1] = "NOT" + tokens[i+1]
                if i+2 < len(tokens):
                    tokens[i+2] = "NOT" + tokens[i+2]
            if token in very:
                if i+1 < len(tokens):
                    tokens[i+1] = "VERY" + tokens[i+1]
            match = None
            for re in (RE_EMOTICONS, RE_HASHTAG, RE_HTTP, RE_SPACES):
                match = re.search(token)
                if match is not None:
                    break
            if match is not None:
                pass
            else:
                del tokens[i]
                tokens[i:i] = NltkTokenizer.tokenize(token)
                tokens[i] = tokens[i].replace("(","")
                tokens[i] = tokens[i].replace(")","")
                tokens[i] = tokens[i].replace("'","")
            tokens[i] = porter.stem(tokens[i])
            i += 1
            
        
        
        return tokens

Wartość NA również niesie ze sobą jakąś informację (np telefon kupiony z jakiegoś podejrzanego źródła, lub używany), dlatego zastąpimy je odpowiednikiem w danym typie (0 dla int i pusty string dla stringów). Atrybuty product name mają skomplikowane wartości i łatwiej będzie się oprzeć na atrybucie brand name, który jest bardziej ogólny od product name.

In [3]:
rnd.seed(123)
seed(1)
set_random_seed(2)
test = pd.read_csv("test.csv")
train = pd.read_csv("train.csv")

train["Price"].fillna(0, inplace=True)
test["Price"].fillna(0, inplace=True)

train["Reviews"].fillna("", inplace=True)
test["Reviews"].fillna("", inplace=True)

test.drop(["Product Name"], axis=1, inplace=True)
train.drop(["Product Name"], axis=1, inplace=True)

In [4]:
train.head()

Unnamed: 0,Id,Brand Name,Price,Reviews,Rating
0,1,Samsung,199.99,I feel so LUCKY to have found this used (phone...,5
1,2,Samsung,199.99,"nice phone, nice up grade from my pantach revu...",4
2,3,Samsung,199.99,Very pleased,5
3,4,Samsung,199.99,It works good but it goes slow sometimes but i...,4
4,5,Samsung,199.99,Great phone to replace my lost phone. The only...,4


In [5]:
X_train, X_test, y_train, y_test = train_test_split(train.iloc[:,1:4], train.iloc[:,4], test_size = 0.1, random_state=42)

In [6]:
brand = dict(Counter(X_train["Brand Name"]))
for k, v in list(brand.items()):
    if v < 100:
        del brand[k]
        
len(brand)


28

In [7]:
X_train["List"] = X_train["Reviews"].map(lambda x: TweetTokenizer.tokenize(BeforeTokenizationNormalizer.normalize(x)))
X_test["List"] = X_test["Reviews"].map(lambda x: TweetTokenizer.tokenize(BeforeTokenizationNormalizer.normalize(x)))
test["List"] = test["Reviews"].map(lambda x: TweetTokenizer.tokenize(BeforeTokenizationNormalizer.normalize(x)))

brand = list(brand)
X_train["Brand Name"] = X_train["Brand Name"].map(lambda x: brand.index(x) if x in brand else len(brand))
X_test["Brand Name"] = X_test["Brand Name"].map(lambda x: brand.index(x) if x in brand else len(brand))
test["Brand Name"] = test["Brand Name"].map(lambda x: brand.index(x) if x in brand else len(brand))

In [8]:
X_test.head()

Unnamed: 0,Brand Name,Price,Reviews,List
118143,5,109.99,This phone has been pretty disappointing for m...,"[thi, phone, has, been, pretty, disappointing,..."
38470,7,124.0,Everything about this phone is amazing. I have...,"[everyth, about, this, phone, is, amazing, i, ..."
168128,24,49.99,I am very disappointed with the watch. As I wa...,"[i, am, very, disappointed, with, the, watch, ..."
70387,7,749.99,Ver y Good,"[ver, y, good]"
57563,7,615.0,I felt ripped off. It had an att SIM card in i...,"[i, felt, ripped, off, it, had, an, att, sim, ..."


Zliczymy wystąpienia pojedynczych termów dla danej oceny (1-5), usuniemy termy które w sumie występują mniej niż 100 razy. Prawdopodobnie są to literówki, niezbyt popularne wyrazy i raczej nie pojawią się w zbiorze testowym. 
Następnie ze względu na niezbalansowanie klas, podzielimy liczności wystąpień termów przez częstość danej oceny. 
Aby pozbyć się termów neutralnych (takich, które nie niosą informacji o emocjach) usuniemy te, których wartość bezwględna z korelacji liczności wystąpienia dla rosnących ocen z kolejnymi liczbami naturalnymi będzie mniejsza od zadanego progu 0.65

In [9]:
def countTerms(x,y):
    d = dict()
    for terms, score in zip(x["List"],y):
        if terms != terms:
            continue
        for w in terms:
            w = w.lower()
            if w not in d:
                d[w] = np.zeros((5))
            d[w][score-1] += 1
    return d

d = countTerms(X_train, y_train)
for k, v in list(d.items()):
    if sum(v) < 100:
        del d[k]
d

{'works': array([ 1283.,   788.,  1759.,  3761., 11153.]),
 'quite': array([152., 170., 309., 457., 823.]),
 'properly': array([754., 243., 192., 207., 190.]),
 'grey': array([29.,  4., 10., 34., 28.]),
 'figure': array([171., 160., 199., 270., 273.]),
 'nightmare': array([84.,  7., 10., 12.,  3.]),
 'thrilled': array([ 16.,   6.,   8.,  20., 142.]),
 'basics': array([11., 11.,  9., 33., 36.]),
 '7': array([254., 466., 133., 379., 943.]),
 'credit': array([145.,  35.,  13.,  32.,  93.]),
 'lte': array([269., 266., 275., 619., 937.]),
 '4g': array([ 437.,  285.,  277.,  556., 1123.]),
 'preloaded': array([ 2.,  6.,  9., 26., 70.]),
 'advise': array([99., 29., 15., 16., 29.]),
 'intended': array([22., 18., 11., 39., 36.]),
 'item': array([1305.,  448.,  279.,  459., 1818.]),
 'dropped': array([420., 200., 150., 280., 512.]),
 'kind': array([195., 164., 171., 310., 501.]),
 'then': array([2618., 1008.,  914., 1317., 2236.]),
 'phone:': array([12., 18., 33., 46., 41.]),
 'party': array([16

In [10]:
ratingCount = Counter(y_train)
for j in ratingCount:
    ratingCount[j] /= len(y_train)

for x in d:
    for j in range(5):
        d[x][j] /= ratingCount[j+1]
d

{'works': array([ 6842.71669201, 12960.18035891, 23278.89267833, 26290.96128785,
        20921.62573027]),
 'quite': array([ 810.67259329, 2795.97799621, 4089.35636021, 3194.62092756,
        1543.84452399]),
 'properly': array([4021.36273248, 3996.60384164, 2540.95929178, 1447.01648141,
         356.41611125]),
 'grey': array([154.6677974 ,  65.78771756, 132.34162978, 237.67420468,
         52.52447955]),
 'figure': array([ 912.00666745, 2631.50870232, 2633.59843262, 1887.41280184,
         512.11367564]),
 'nightmare': array([448.00327524, 115.12850573, 132.34162978,  83.88501342,
          5.62762281]),
 'thrilled': array([ 85.33395719,  98.68157634, 105.87330382, 139.80835569,
        266.3741463 ]),
 'basics': array([ 58.66709557, 180.91622328, 119.1074668 , 230.68378689,
         67.53147371]),
 '7': array([1354.67657036, 7664.2690955 , 1760.14367608, 2649.36834036,
        1768.94943635]),
 'credit': array([773.33898702, 575.64252863, 172.04411871, 223.69336911,
        174.4563

In [11]:
for k, v in list(d.items()):
    if abs(np.corrcoef(v, range(5))[0,1]) < 0.65:
        del d[k]

In [12]:
len(d)

1139

Funkcja przekształcająca krotki w wejście odpowiednie dla metod uczenia maszynowego oraz dokonująca ekstrakcji cech.
Lista termów zamieniana jest na reprezentację bag-of-words, zliczane też są znaki '!' oraz '?', które mogą świadczyć o emocjach, oraz długość pierwotnego tekstu. Informacja z brand name jest zamieniana na one hot encoding.

In [14]:
def makeInput(d, xx):
    terms = list(d)
    x = np.zeros((len(d)+4+len(brand)+1, len(xx["List"])))
    wordSet = set(terms)
    for i,word in enumerate(xx["List"]):
        for w in word:
            if w in wordSet:
                x[terms.index(w), i] += 1
        txt = xx.iloc[i, 2]
        if txt!=txt:
            continue
        x[len(d), i] = len(txt)
        x[len(d)+1, i] = txt.count("!")
        x[len(d)+2 ,i] = txt.count("?")
        x[len(d)+3 ,i] = xx.iloc[i,1]
        x[len(d)+4 + xx.iloc[i,0] ,i] = 1
    return np.transpose(x)

In [15]:
x = makeInput(d, X_train)
np.save('x.npy', x)

In [16]:
x=np.load('x.npy')

In [17]:
xTest = makeInput(d, X_test)
newX = makeInput(d, test.iloc[:,1:])

In [None]:
model = Sequential()
model.add(InputLayer(input_shape=(len(d)+4+len(brand)+1,)))
model.add(Dense(100, activation='relu'))
model.add(Dense(20, activation='relu'))
model.add(Dense(5, activation='relu'))
model.add(Dense(1, activation='linear'))
model.compile(loss='mse', optimizer='adam')

In [18]:
model.fit(x, y_train, batch_size=256, epochs=50, validation_data=(xTest, y_test))

Train on 182380 samples, validate on 20265 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f3e1fa2f940>

Mimo że przy większej liczbie epok błąd zarówno na zbiorze uczącym jak i walidującym spada i osiąga wartośc poniżej MSE<0.49, co oznaczałoby RMSE<0.7, to ocena na zbiorze testowym skutkuje RMSE > 1, co może oznaczać, że zbiór testowy nie jest losowo wybraną próbką.

In [19]:
gnb = GaussianNB()
nbmodel = gnb.fit(x, y_train)
y = nbmodel.predict(xTest)
np.sqrt(np.mean(np.power(y-y_test,2)))

1.4308814286885214

Naiwny Bayes nie osiągnał zadowalającego wyniku

In [23]:
d_train = lgb.Dataset(x, label=y_train)
params = {'objective': 'mse', 'max_depth': 250, 'sub_feature': 0.56, 'boosting_type': 'gbdt', 'learning_rate': 0.054}
clf = lgb.train(params, d_train, 25000)

In [21]:
best = 0.56
i = 0
for i in range(10):
    print(i)
    params = {}
    params['learning_rate'] = rnd.random()/10
    params['boosting_type'] = 'gbdt'
    params['objective'] = 'mse'
    params['sub_feature'] = rnd.random()*2/3+1/3
    params['max_depth'] = rnd.randint(2,1000)
    nRound = rnd.randint(100,10000)
    tclf = lgb.train(params, d_train, nRound)
    y = tclf.predict(xTest)
    if (best > np.sqrt(np.mean(np.power(y-y_test,2)))) :
        best = np.sqrt(np.mean(np.power(y-y_test,2)))
        clf = tclf
        print(best)
        print(params)
        print(nRound)

0
1
2
3
4
5
6
7
8
9


In [24]:
y = clf.predict(xTest)
np.sqrt(np.mean(np.power(y-y_test,2)))

0.5647188914349878

In [25]:
dtrain = xgb.DMatrix(x, label=y_train)
dtest = xgb.DMatrix(xTest, label=y_test)

param = {'max_depth': 200, 'eta': 0.8, 'silent': 1, 'objective':'reg:linear','subsample':0.8,'colsample_bytree':0.7}
param['nthread'] = 4
evallist = [(dtest, 'eval'), (dtrain, 'train')]
num_round = 10
bst = xgb.train(param, dtrain, num_round, evallist)

[0]	eval-rmse:1.12687	train-rmse:1.01304
[1]	eval-rmse:0.750136	train-rmse:0.461945
[2]	eval-rmse:0.688986	train-rmse:0.332114
[3]	eval-rmse:0.68001	train-rmse:0.272032
[4]	eval-rmse:0.675635	train-rmse:0.245868
[5]	eval-rmse:0.675739	train-rmse:0.229221
[6]	eval-rmse:0.674249	train-rmse:0.200293
[7]	eval-rmse:0.671341	train-rmse:0.192537
[8]	eval-rmse:0.671542	train-rmse:0.177447
[9]	eval-rmse:0.670701	train-rmse:0.175345


Najlepiej na zbiorze walidującym spisał się lgbm, więc użyjemy go do predykcji na zbiorze testowym

In [26]:
y = clf.predict(newX)
for i in range(len(y)):
    y[i] = max(min(5,y[i]),1)
y

array([5.        , 5.        , 3.98772953, ..., 4.20103887, 1.        ,
       2.25234524])

In [27]:
with open("submission.csv", "w") as f:
    f.write("Id,Rating\n")
    for i in range(len(y)):
        f.write(str(test.iloc[i,0])+","+str(y[i])+"\n")