In [1]:
import pandas as pd
import numpy as np

# Read the training and test review tables from the local Data directory.
train = pd.read_csv('./Data/train.csv')
test = pd.read_csv("./Data/test.csv")

# List the training columns to confirm the expected schema.
print(train.columns)

# Report (rows, columns) of both frames.
print("train dimension: ", train.shape)
print("test dimension: ", test.shape)

Index(['review_id', 'review', 'rating'], dtype='object')
train dimension:  (146811, 3)
test dimension:  (60427, 2)


In [2]:
# Preview the first five training rows.
train.head(5)

Unnamed: 0,review_id,review,rating
0,0,Ga disappointed neat products .. Meletot Hilsn...,1
1,1,"Rdtanya replace broken glass, broken chargernya",1
2,2,Nyesel bngt dsni shopping antecedent photo mes...,1
3,3,Sent a light blue suit goods ga want a refund,1
4,4,Pendants came with dents and scratches on its ...,1


In [3]:
# Convert the ratings to zero-based labels (rating - 1).
# The guard makes the cell safe to re-run: once the column has been shifted its
# minimum drops below 1, so a second execution leaves the labels untouched.
# NOTE(review): relies on the raw ratings starting at 1 - confirm against the data.
if train['rating'].min() >= 1:
    train['rating'] = train['rating'] - 1

In [5]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer

def tokenize(text, stop_set = False, lemma = False):
    """Clean a raw review string and split it into lowercase tokens.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (for example the NaN pandas
        uses for a missing cell) yields an empty token list.
    stop_set : collection of str or False, optional
        Tokens to discard. Any falsy value (False, None, empty collection)
        disables stop-word filtering.
    lemma : bool, optional
        When truthy, every token is passed through WordNetLemmatizer.

    Returns
    -------
    list of str
        Tokens that contain at least two consecutive ASCII letters.
    """
    # Missing reviews are not strings; there is nothing to tokenize.
    if not isinstance(text, str):
        return []

    # Drop every non-ASCII character, then normalise the case.
    text = text.encode('ascii', 'ignore').decode('ascii')
    text = text.lower()

    # Remove hyperlinks; the pattern also strips any other dot-joined run
    # such as "site.com/path" even without a scheme.
    text = re.sub(r'\b(?:(?:https?|ftp)://)?\w[\w-]*(?:\.[\w-]+)+\S*', ' ', text)
    text = re.sub("[\r\n]", ' ', text) # replace new line characters with spaces
    text = text.strip()
    tokens = TweetTokenizer().tokenize(text)

    # Retain only tokens with at least two consecutive letters
    # (drops punctuation, digits and single characters).
    tokens = [token for token in tokens if re.match(r'.*[a-z]{2,}.*', token)]

    # Optional stop-word removal.
    if stop_set:
        tokens = [token for token in tokens if token not in stop_set]

    # Optional lemmatization. lemmatize() is an instance method, so the
    # lemmatizer must be instantiated before it is used.
    if lemma:
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return tokens

# without lemmatization
# Tokenize the reviews of both frames (lemmatization switched off).
for frame in (train, test):
    frame['tokens'] = frame['review'].map(lambda review: tokenize(review, lemma = False))


In [6]:
# Preview the test frame, now including the new 'tokens' column.
test.head()

Unnamed: 0,review_id,review,tokens
0,1,"Great danger, cool, motif and cantik2 jg model...","[great, danger, cool, motif, and, cantik, jg, ..."
1,2,One of the shades don't fit well,"[one, of, the, shades, don't, fit, well]"
2,3,Very comfortable,"[very, comfortable]"
3,4,Fast delivery. Product expiry is on Dec 2022. ...,"[fast, delivery, product, expiry, is, on, dec,..."
4,5,it's sooooo cute! i like playing with the glit...,"[it's, sooooo, cute, like, playing, with, the,..."


In [7]:
def build_vocab(token_col):
    """Count how often each token occurs across a collection of token lists.

    Parameters
    ----------
    token_col : iterable of iterables of str
        One token sequence per document.

    Returns
    -------
    dict
        Maps each token to its total number of occurrences, in order of
        first appearance.
    """
    counts = {}
    for document in token_col:
        for word in document:
            if word in counts:
                counts[word] += 1
            else:
                counts[word] = 1
    return counts

# Token frequency tables for the tokenized train and test reviews.
train_vocab = build_vocab(train['tokens'])
test_vocab = build_vocab(test['tokens'])

In [16]:
# Training vocabulary ordered from the most to the least frequent token.
dict(sorted(train_vocab.items(), key=lambda pair: pair[1], reverse=True))

{'good': 84720,
 'the': 84372,
 'product': 57881,
 'is': 54435,
 'quality': 51159,
 'very': 43967,
 'delivery': 37393,
 'to': 27963,
 'of': 26641,
 'and': 24112,
 'not': 23210,
 'seller': 22636,
 'price': 21511,
 'speed': 20582,
 'excellent': 19950,
 'but': 19541,
 'awesome': 19138,
 'fast': 18850,
 'goods': 18296,
 'for': 15000,
 'in': 14973,
 'it': 14711,
 'value': 12230,
 'response': 11470,
 'order': 11016,
 'nice': 10624,
 'you': 10255,
 'thank': 10145,
 'thanks': 9095,
 'that': 8504,
 'with': 8465,
 'service': 8262,
 'this': 7966,
 'like': 7937,
 'was': 7311,
 'also': 7119,
 'original': 7107,
 'other': 7103,
 'ok': 6977,
 'so': 6818,
 'well': 6629,
 'again': 6620,
 'money': 6396,
 'will': 6375,
 'packaging': 6207,
 'really': 6170,
 'time': 6146,
 'be': 5811,
 'shop': 5646,
 'its': 5560,
 'his': 5545,
 'my': 5541,
 'are': 5534,
 'buy': 5498,
 'as': 5421,
 'just': 5400,
 'okay': 5338,
 'color': 5129,
 'cp': 5085,
 'no': 4759,
 'products': 4734,
 'item': 4722,
 'received': 4610,
 'al

In [None]:
from gensim.models import KeyedVectors
# Path to the pretrained word2vec binary (the file name suggests the
# 300-dimensional GoogleNews vectors).
news_path = './pretrained models/GoogleNews-vectors-negative300.bin'
# Load the vectors from the word2vec binary format.
embeddings_index = KeyedVectors.load_word2vec_format(news_path, binary=True)

In [None]:
import operator
def check_coverage(vocab,embedding):
    """Report how much of the corpus is covered by an embedding.

    Parameters
    ----------
    vocab : mapping of str to int
        Token -> number of occurrences in the corpus.
    embedding : container
        Anything supporting ``token in embedding``.

    Returns
    -------
    list of (str, int)
        The out-of-vocabulary tokens with their counts, most frequent first.

    Prints the share of all token occurrences (not of distinct tokens) that
    have an embedding. An empty vocabulary is reported as 0.00% instead of
    raising ZeroDivisionError.
    """
    oov = {}
    covered = 0  # occurrences of tokens that have an embedding
    missed = 0   # occurrences of tokens that do not
    for word, count in vocab.items():
        if word in embedding:
            covered += count
        else:
            oov[word] = count
            missed += count

    total = covered + missed
    coverage = covered / total if total else 0.0
    print('Found embeddings for  {:.2%} of all text'.format(coverage))

    # Ascending sort followed by a reversal, kept so that the order of
    # equal-count tokens matches the previous behaviour.
    return sorted(oov.items(), key=operator.itemgetter(1))[::-1]
# Coverage of the training vocabulary; keeps the tokens without an embedding.
not_found_vocab = check_coverage(train_vocab, embeddings_index)

In [None]:
# Inspect the out-of-vocabulary tokens and their counts.
not_found_vocab

In [None]:
# Tokens dropped outright before any replacement is attempted.
# NOTE(review): 'sdh' is listed here and also as a key of replace_dict; because
# removal runs first, the replace_dict entry for it never takes effect.
to_remove = ['to','of','and', 'bgt', 'sdh', 'udg', 'shopee']

# Slang / misspelling -> replacement text. Keys must be lowercase with no
# surrounding whitespace, otherwise they can never equal a token produced by
# the tokenizer (which lowercases the text and splits on whitespace).
# Changes from the previous version of this table:
#   * 'ordernya ' (trailing space) -> 'ordernya'
#   * 'Ingkan' (capitalised)       -> 'ingkan'
#   * duplicate keys 'lah' and 'sukaa' collapsed to the value that was
#     effective at runtime ('' and 'like')
#   * 'bangettt' now maps to 'really', like every other spelling of "banget"
replace_dict = {'cepet':'fast','pesen':'order','dah':'bye','lg':'again','ung':'rotten','nyampe':'arrived','yg':'which','cuman':'only',
    'klo':'if','packingnya':'packing','gpp':'no problem','thx':'thanks','dapet':'get it','krn':'because','baguss':'good','gan':'bro',
    'dateng':'come','pas':'just right','nyesel':'sorry','mksh':'thank you','trimakasih':'thank you','bgus':'great','smoga':'i hope',
    'naman':'name','kirain':'i think','sya':'yes','pokoknya':'anyway','kok':'really','mantul':'really good','bangett':'really',
    'makasi':'thanks','dong':'please','sellernya':'seller','gak':'no','uda':'already','bangettt':'really','tetep':'still','pesenan':'order',
    'mudah2an':'i hope','smpai':'till','mah':'expensive','lgi':'again','lbh':'more','bagussss':'good','mantab':'steady','sukaa':'like',
    'jga':'also','bnget':'really','kaka':'written','meletot':'erupted','rdtanya' : 'he asked','chargernya':'charger','dsni':'here',
    'originalya':'original','jg':'too','deh':'okay','sdh':'already','tpi':'but','pokonya':'anyway','reallyt':'really',
    'sma':'school','orderan':'orders','wrna':'color','againi':'again','kak':'sis','rekomended':'recommended','kayak':'like',
    'blanja':'spend','likea':'like','becausea':'because','dlu':'previous','tau':'know','barang':'goods','dtng':'come','datang':'come',
    'bnyk': 'a lot','mantep':'awesome','swhich':'which','banget':'really','goodssss':'goods','rada':'rather','packagingnya':'packaging',
    'skrg':'now','pngiriman':'delivery','goodsss':'goods','prev':'previous','kan':'right','kek':'grandpa','lahh':'','lah':'','engga':'no',
    'makasih':'thank you','ordernya':'orders','paketan':'package','mantapp':'really','ngecewain':'disappointed','pengirimanya':'sender',
    'bagus':'nice','comenya':'come','segini':'this much','knp':'why','bener':'right','kasi':'give','anak':'child','baik':'good',
    'likeaaa':'like','bangeet':'really','brgnya':'how come','ngk':'presume','lagi':'again','lagii':'again','hrg':'price','harga':'price',
    'penyok':'dent','penyok2':'dent','barangny':'goods','thanksi':'thanks','produk':'product','likeaaaaa':'like','murahhh':'cheap',
    'terimaksih':'thanks','ownernya':'owner','thankss':'thanks','gercep':'speed','casenya':'case','kakk':'sis','dteng':'come',
    'puasssss':'satisfied','masi':'still','sekali':'once','gapernah':'never','balikin':'return it','ancur':'broken','nyobain':'try it',
    'bangat':'really','nyangka':'suspect','sekalii':'once','sekaliii':'once','sekaliiii':'once','sampaii':'arrive','barangnyaa':'goods',
    'jaitannya':'linkage','nyari':'looking for it','bangeett':'really','disiniii':'here','abcdefghijklmnopqrstuvwxyz':'','priduk':'product',
    'baguuuuuuus':'nice','allhamdulilah':'','mantaffffffffffffffff':'excellent','sekaliiiii':'once','ambilis':'take it','parahhh':'severe','ingkan':'want',
    'alhamdulillah':'','tq':'thanks'}

def clean_token(tokens, remove_list, re_dict):
    """Drop unwanted tokens and substitute slang with its replacement words.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one document.
    remove_list : iterable of str
        Tokens to discard. Removal happens before replacement, so a token
        listed here is never looked up in ``re_dict``.
    re_dict : mapping of str to str
        Token -> replacement text.

    Returns
    -------
    list of str
        Cleaned tokens. A replacement made of several words ('thank you')
        contributes one token per word, and an empty replacement contributes
        nothing; a token containing a space or an empty token could never
        match a single-word embedding.
    """
    removals = set(remove_list)  # constant-time membership tests
    cleaned = []
    for token in tokens:
        if token in removals:
            continue
        if token in re_dict:
            # split() breaks multi-word replacements apart and returns an
            # empty list for '' so that nothing is appended.
            cleaned.extend(re_dict[token].split())
        else:
            cleaned.append(token)
    return cleaned

# Run the removal / replacement pass over the token lists of both frames.
for frame in (train, test):
    frame['clean_tokens'] = frame['tokens'].map(lambda toks: clean_token(toks, to_remove, replace_dict))

In [None]:
# Rebuild the vocabularies from the cleaned tokens (this overwrites the
# train_vocab / test_vocab computed from the raw tokens) and measure the
# embedding coverage again.
train_vocab = build_vocab(train['clean_tokens'])
test_vocab = build_vocab(test['clean_tokens'])

not_found_vocab = check_coverage(train_vocab, embeddings_index)
# Display the tokens that are still missing from the embedding.
not_found_vocab

In [None]:
# Sort the (token, count) pairs in place, highest count first.
not_found_vocab.sort(key=lambda tup: tup[1],reverse = True)

In [None]:
# Inspect the sorted out-of-vocabulary list.
not_found_vocab

In [None]:
def doc_mean(tokens, embedding, dim=None):
    """Average the embedding vectors of a document's tokens.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one document.
    embedding : mapping-like
        Supports ``token in embedding`` and ``embedding[token]``.
    dim : int, optional
        Length of the zero vector returned when no token has an embedding.
        Defaults to ``embedding.vector_size`` when that attribute exists,
        otherwise to 300.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all tokens found in
        ``embedding``, or a zero vector when none is found.
    """
    vectors = [embedding[token] for token in tokens if token in embedding]
    if vectors:
        return np.mean(np.array(vectors), axis=0)

    # No known token: fall back to a zero vector of the embedding's size
    # instead of assuming a fixed dimensionality.
    if dim is None:
        dim = getattr(embedding, 'vector_size', 300)
    return np.zeros(dim)
      
# Stack one mean-embedding vector per review into a 2-D feature matrix.
X = np.vstack(train['clean_tokens'].apply(lambda x: doc_mean(x, embeddings_index)))  # training features
X_1 = np.vstack(test['clean_tokens'].apply(lambda x: doc_mean(x, embeddings_index)))  # test features

In [None]:
# Target vector: the 'rating' column of train as a NumPy array.
y = train['rating'].values

In [None]:
from sklearn import linear_model, tree, ensemble, metrics, model_selection, exceptions


def print_score(y_true, y_pred):
    """Print accuracy followed by macro, micro and weighted precision, recall and F1.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Nothing is returned; every score is written to stdout on its own line.
    """
    # Accuracy has no averaging variant; the caption is kept as it always was.
    print('macro_accuracy : ', metrics.accuracy_score(y_true, y_pred))

    # (caption, scoring function, averaging mode) in the order they are printed.
    report_rows = (
        ('ave macro precision : ', metrics.precision_score, 'macro'),
        ('macro_recall : ', metrics.recall_score, 'macro'),
        ('macro_F1 : ', metrics.f1_score, 'macro'),
        ('micro_ave precision : ', metrics.precision_score, 'micro'),
        ('micro_recall : ', metrics.recall_score, 'micro'),
        ('micro_F1 : ', metrics.f1_score, 'micro'),
        ('weighted_precision : ', metrics.precision_score, 'weighted'),
        ('weighted_recall : ', metrics.recall_score, 'weighted'),
        ('weighted_F1 : ', metrics.f1_score, 'weighted'),
    )
    for caption, scorer, averaging in report_rows:
        # labels, pos_label and sample_weight are left at their defaults,
        # which are the values that used to be passed explicitly.
        print(caption, scorer(y_true, y_pred, average=averaging))
    
# Train / validation split: hold out 20% of the rows for validation and fit on
# the remaining 80%. (test_size is the held-out fraction; the earlier value of
# 0.8 left only a fifth of the data for training.)
X_train, X_val, y_train, y_val = model_selection.train_test_split(X, y, test_size = 0.2, random_state = 2020)

In [None]:
import xgboost as xgb

# Wrap the train / validation arrays and their labels in xgboost DMatrix objects.
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Linear booster with a squared-error regression objective, so the model
# outputs a continuous score rather than a class label.
param = {'booster': "gblinear", 
        'objective': "reg:squarederror", 
        'lambda': 0.1, 
        'alpha': 0}
num_round = 15 # the number of training iterations

# TODO: hyper-parameter search (a GridSearchCV call was stubbed out here).

# Fit the booster on the training matrix.
bst = xgb.train(param, dtrain, num_round)

In [None]:
# Continuous predictions for the validation matrix.
preds = bst.predict(dval)

In [None]:
# Turn the continuous scores into integer class labels.
# NOTE(review): the two branches use different offsets - int(a + 0.6) for
# scores up to 3 and int(a + 1.4) above 3 - so after clipping every score
# above 3 lands in the top class. Confirm that this is intended.
best_preds = [int(a+1+0.4) if a > 3 else int(a+1-0.4) for a in preds]
# The targets are zero-based (rating - 1), so the valid classes are 0..4.
# Clipping to 1..5 made class 0 unpredictable and allowed the invalid label 5,
# which distorted every metric computed against y_val.
best_preds = np.clip(best_preds, 0, 4)
print_score(y_val, best_preds)