## Sentiment Analysis

In [2]:
import pandas as pd
import numpy as np
import re
import string
import emot
import json
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import pickle
import nltk
import gensim
from scipy import sparse
from sklearn.linear_model import LinearRegression,LogisticRegression
from sklearn.naive_bayes import MultinomialNB
# Shared Porter stemmer instance; tokenize_stem calls stemmer.stem() on every kept token.
stemmer = nltk.stem.porter.PorterStemmer()
# Emoticon lookup taken from the emot package. convert_emoticons iterates its keys as
# regex patterns and substitutes the mapped values.
# NOTE(review): the attribute name presumably depends on the installed emot version — confirm.
EMOTICONS = emot.EMOTICONS

# Hand-picked stop words, used both by tokenize_stem (membership test) and as the
# `stop_words` argument of the TfidfVectorizer further down.
# NOTE(review): 'me' is listed twice; the duplicate is harmless but redundant.
stopwords = ['she','the','is','to','of','a','i','for','in','it','you','with','that','this','was','so','will','be','my',
             'his','as','are','its','cp','have','from','there','tp','her','on','ga','at','by','if','has','me','d','does','he',
             'jd','we','your','they','here',"i'm",'aja','an','let','u','me','am','their']

# Slang / misspelling -> replacement table, applied to the (already lower-cased) review
# text by replace_all, which walks the entries in insertion order.
# NOTE(review): things visible in this literal that are worth confirming:
#   * 'lah' appears twice ('la' and ''); in a dict literal the later value ('') wins.
#   * 'sukaa' appears twice with the same value.
#   * 'Ingkan' contains a capital letter, so it cannot match lower-cased text.
#   * 'ordernya ' carries a trailing space inside the key.
#   * keys such as 'reallyt', 'againi', 'likea', 'becausea', 'swhich' and 'thanksi' look
#     like clean-ups for the output of earlier replacements — presumably intentional.
mappings = {'cepet':'fast','pesen':'order','dah':'bye','lg':'again','ung':'rotten','nyampe':'arrived','yg':'which','cuman':'only',
'klo':'if','packingnya':'packing','gpp':'no problem','thx':'thanks','dapet':'get it','krn':'because','baguss':'good','gan':'bro',
'dateng':'come','pas':'just right','nyesel':'sorry','mksh':'thank you','trimakasih':'thank you','bgus':'great','smoga':'i hope',
'naman':'name','kirain':'i think','sya':'yes','pokoknya':'anyway','kok':'really','mantul':'really good','bangett':'really',
'makasi':'thanks','dong':'please','sellernya':'seller','gak':'no','uda':'already','bangettt':'already','tetep':'still','pesenan':'order',
'mudah2an':'i hope','smpai':'till','mah':'expensive','lgi':'again','lbh':'more','bagussss':'good','mantab':'steady','sukaa':'like',
'jga':'also','bnget':'really','kaka':'written','meletot':'erupted','rdtanya' : 'he asked','chargernya':'charger','dsni':'here',
'originalya':'original','jg':'too','deh':'okay','sdh':'already','tpi':'but','pokonya':'anyway','lah':'la','reallyt':'really',
'sma':'school','orderan':'orders','wrna':'color','againi':'again','kak':'sis','rekomended':'recommended','kayak':'like',
'blanja':'spend','likea':'like','becausea':'because','dlu':'previous','tau':'know','barang':'goods','dtng':'come','datang':'come',
'bnyk': 'a lot','mantep':'awesome','swhich':'which','banget':'really','goodssss':'goods','rada':'rather','packagingnya':'packaging',
'skrg':'now','pngiriman':'delivery','goodsss':'goods','prev':'previous','kan':'right','kek':'grandpa','lahh':'','lah':'','engga':'no',
'makasih':'thank you','ordernya ':'orders','paketan':'package','mantapp':'really','ngecewain':'disappointed','pengirimanya':'sender',
'bagus':'nice','comenya':'come','segini':'this much','knp':'why','bener':'right','kasi':'give','anak':'child','baik':'good','sukaa':'like',
'likeaaa':'like','bangeet':'really','brgnya':'how come','ngk':'presume','lagi':'again','lagii':'again','hrg':'price','harga':'price',
'penyok':'dent','penyok2':'dent','barangny':'goods','thanksi':'thanks','produk':'product','likeaaaaa':'like','murahhh':'cheap',
'terimaksih':'thanks','ownernya':'owner','thankss':'thanks','gercep':'speed','casenya':'case','kakk':'sis','dteng':'come',
'puasssss':'satisfied','masi':'still','sekali':'once','gapernah':'never','balikin':'return it','ancur':'broken','nyobain':'try it',
'bangat':'really','nyangka':'suspect','sekalii':'once','sekaliii':'once','sekaliiii':'once','sampaii':'arrive','barangnyaa':'goods',
'jaitannya':'linkage','nyari':'looking for it','bangeett':'really','disiniii':'here','abcdefghijklmnopqrstuvwxyz':'','priduk':'product',
'baguuuuuuus':'nice','allhamdulilah':'','mantaffffffffffffffff':'excellent','sekaliiiii':'once','ambilis':'take it','parahhh':'severe','Ingkan':'want'}

# Read the train and test CSVs. The training frame loses its review_id column and its
# ratings are shifted down by one.
df_train = pd.read_csv('./Data/train.csv').drop(columns=['review_id'])
df_test = pd.read_csv('./Data/test.csv')
df_train['rating'] = df_train['rating'].sub(1)

In [3]:
def count_of_caps(data):
    """Return how many space-separated tokens of ``data`` are fully upper-case.

    A token counts when ``str.isupper()`` is true for it, i.e. it contains at least one
    cased character and no lower-case ones.
    """
    return sum(1 for token in data.split(' ') if token.isupper())

def count_of_words(data):
    """Return the number of pieces produced by splitting ``data`` on single spaces.

    Because the split is on ``' '`` exactly, an empty string yields 1 and consecutive
    spaces contribute empty pieces that are counted as well.
    """
    return len(data.split(' '))

# Hand-crafted count features computed from the raw review text, followed by a column
# selection that fixes the column order (the training frame also keeps the target).
df_train = df_train.assign(
    cap_count=df_train['review'].apply(count_of_caps),
    word_count=df_train['review'].apply(count_of_words),
    len=df_train['review'].str.len(),
)[['review', 'cap_count', 'word_count', 'len', 'rating']]

df_test = df_test.assign(
    cap_count=df_test['review'].apply(count_of_caps),
    word_count=df_test['review'].apply(count_of_words),
    len=df_test['review'].str.len(),
)[['review', 'cap_count', 'word_count', 'len']]

def count_of_words_frame(data):
    """Count whitespace-separated words across every string in ``data``.

    Parameters
    ----------
    data : iterable of str
        Texts to scan; each one is split with ``str.split()``.

    Returns
    -------
    collections.Counter
        Maps each word to its total number of occurrences.
    """
    return Counter(word for text in data for word in text.split())

# Word -> frequency dicts for the lower-cased reviews, ordered from most to least frequent.
train_count = dict(count_of_words_frame(df_train['review'].str.lower()).most_common())

test_count = dict(count_of_words_frame(df_test['review'].str.lower()).most_common())

In [4]:
def convert_emoticons(data):
    """Substitute emoticons in ``data`` with their descriptions from ``EMOTICONS``.

    Every key of ``EMOTICONS`` is used as a regular-expression pattern (wrapped in a
    capture group). Each match is replaced by the mapped description, with commas
    removed and runs of whitespace collapsed to single spaces.
    """
    for pattern, description in EMOTICONS.items():
        replacement = " ".join(description.replace(",", "").split())
        data = re.sub(u'(' + pattern + ')', replacement, data)
    return data

def replace_all(text, dic, whole_words=False):
    """Apply every ``old -> new`` replacement in ``dic`` to ``text``, in dict order.

    Replacements run one after another, so a later entry can rewrite text produced by
    an earlier one.

    Parameters
    ----------
    text : str
        Text to rewrite.
    dic : Mapping[str, str]
        Replacement table. Empty keys are skipped.
    whole_words : bool, default False
        ``False`` keeps plain substring replacement, which also rewrites the inside of
        longer words (``'gan' -> 'bro'`` turns ``'began'`` into ``'bebro'``).
        ``True`` only replaces occurrences that are not adjacent to another word character.

    Returns
    -------
    str
        The rewritten text.
    """
    for old, new in dic.items():
        if not old:
            # str.replace('', new) would insert `new` between every character.
            continue
        if whole_words:
            pattern = r'(?<!\w)' + re.escape(old) + r'(?!\w)'
            # A callable replacement keeps backslashes in `new` literal.
            text = re.sub(pattern, lambda _match: new, text)
        else:
            text = text.replace(old, new)
    return text

# Text normalisation. Every step is applied to BOTH frames so the test reviews go through
# the same preprocessing as the reviews the vectoriser is fitted on.

# Removal of illegal characters: anything outside ASCII is dropped.
df_train['review'] = df_train['review'].apply(lambda x: x.encode('ascii','ignore').decode('utf-8'))
df_test['review'] = df_test['review'].apply(lambda x: x.encode('ascii','ignore').decode('utf-8'))

# Emoticons are converted before lower-casing.
# NOTE(review): presumably because some emoticon patterns contain capital letters — confirm.
df_train['review'] = df_train['review'].apply(lambda x: convert_emoticons(x))
df_test['review'] = df_test['review'].apply(lambda x: convert_emoticons(x))

df_train['review'] = df_train['review'].apply(lambda x: x.lower())
df_test['review'] = df_test['review'].apply(lambda x: x.lower())

# Slang / misspelling normalisation. This used to run on the training frame only, which
# left the test text in a different vocabulary from the one the model is trained on.
df_train['review'] = df_train['review'].apply(lambda x : replace_all(x,mappings))
df_test['review'] = df_test['review'].apply(lambda x : replace_all(x,mappings))

def tokenize_stem(text):
    """Tokenise ``text`` and return the Porter stems of the tokens worth keeping.

    A token is kept when it consists solely of the letters a-z and is not listed in
    ``stopwords``; kept tokens are stemmed with the module-level ``stemmer``.

    Returns
    -------
    list of str
        Stems, in the order their tokens appear in ``text``.
    """
    stems = []
    for word in nltk.word_tokenize(text):
        if not re.search('^[a-z]+$', word):
            continue
        if word in stopwords:
            continue
        stems.append(stemmer.stem(word))
    return stems

# Stemmed token lists for every review in both frames.
df_train['tokens'] = df_train['review'].apply(tokenize_stem)
df_test['tokens'] = df_test['review'].apply(tokenize_stem)

# gensim view of the training tokens: vocabulary -> bag-of-words corpus -> TF-IDF weights.
tokens = df_train['tokens'].tolist()
dictionary = gensim.corpora.Dictionary(tokens)
# Drop tokens that occur in fewer than 10 documents (other thresholds keep their defaults).
dictionary.filter_extremes(no_below=10)
corpus = list(map(dictionary.doc2bow, tokens))
tfidf = gensim.models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]

In [6]:
# Store the (already zero-based) rating as a pandas categorical column.
df_train['rating'] = df_train['rating'].astype('category')

In [28]:
from sklearn.model_selection import train_test_split

# Hold out 15% of the training rows for validation, with a fixed seed so the split is
# reproducible. The target is removed by name: once the `tokens` column has been appended,
# `rating` is no longer the last column, so the positional slice `iloc[:, :-1]` dropped
# `tokens` and left `rating` inside the feature frame.
X_train, X_val, y_train, y_val = train_test_split(
    df_train.drop(columns=['rating']),
    df_train['rating'],
    test_size=0.15,
    random_state=2021,
)

In [58]:
# Learn the TF-IDF vocabulary from the training split only, then reuse the fitted
# vectoriser for the validation and test reviews.
vec = TfidfVectorizer(stop_words=stopwords, min_df=0.01)
tfidf_train = vec.fit_transform(X_train['review'])

tfidf_val, tfidf_test = (
    vec.transform(docs) for docs in (X_val['review'], df_test['review'].tolist())
)

In [59]:
# Pull the three hand-crafted count columns out of each split as plain arrays.
feat_array_train, feat_array_val, feat_array_test = (
    frame[['cap_count', 'word_count', 'len']].values
    for frame in (X_train, X_val, df_test)
)

In [60]:
# Convert the dense count features to CSR so they can be stacked next to the TF-IDF matrices.
sparse_feat_train, sparse_feat_val, sparse_feat_test = (
    sparse.csr_matrix(dense) for dense in (feat_array_train, feat_array_val, feat_array_test)
)

# Final design matrices: TF-IDF columns first, count features appended on the right.
feats_train = sparse.hstack((tfidf_train, sparse_feat_train))
feats_val = sparse.hstack((tfidf_val, sparse_feat_val))
feats_test = sparse.hstack((tfidf_test, sparse_feat_test))

In [61]:
# Multinomial Naive Bayes on the stacked TF-IDF + count features.
# NOTE(review): the count columns are unscaled while TF-IDF values are small — presumably
# they dominate the likelihoods; confirm whether scaling was intended.
mb = MultinomialNB()
mb.fit(feats_train,y_train)

MultinomialNB()

In [62]:
# Score of the fitted model on the data it was trained on.
mb.score(feats_train,y_train)

0.46433579882842235

In [63]:
# Sanity check: feature matrices and label vectors must agree on their row counts.
tuple(part.shape for part in (feats_train, y_train, feats_val, y_val))

((124789, 977), (124789,), (22022, 977), (22022,))

In [64]:
from sklearn.metrics import accuracy_score

# Accuracy of the Naive Bayes model on the held-out validation split.
accuracy_score(y_true=y_val, y_pred=mb.predict(feats_val))

0.44687131050767415

In [None]:
# Predict ratings for the test set with the fitted Naive Bayes model. `lr_model` is never
# defined in this notebook, so the previous call raised NameError on a fresh kernel.
mb.predict(feats_test)

### TensorFlow Implementation

In [None]:
from tensorflow.keras.preprocessing.text import one_hot

# Upper bound handed to one_hot for the integer word indices.
voc_size = 12000

# One list of integer word indices per training review.
one_hot_repr = [one_hot(review, voc_size) for review in df_train['review'].values]

In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

# Every review becomes exactly `sent_length` indices; shorter ones are padded at the front.
sent_length = 20
embedded_docs = pad_sequences(one_hot_repr, maxlen=sent_length, padding='pre')
print(embedded_docs)

In [None]:
# A single Embedding layer mapping each of the `sent_length` word indices to a
# `dim`-dimensional vector.
dim = 500
model = Sequential([Embedding(voc_size, dim, input_length=sent_length)])
model.compile(optimizer='adam', loss='mse')

model.summary()

In [None]:
# Inspect the embedding output for a handful of reviews only. Predicting on the full
# corpus materialises a (n_reviews, sent_length, dim) float array — several gigabytes at
# this data size — just to print a truncated repr.
print(model.predict(embedded_docs[:5]))

In [None]:
# Padded index sequence of the first training review.
embedded_docs[0]

In [None]:
# Embedding of the first review only. The model is fed a one-row slice, so it no longer
# embeds the whole corpus to keep a single row; the value of `x` is unchanged.
x = model.predict(embedded_docs[:1])[0]

In [None]:
# Shape of one embedded review; expected to be (sent_length, dim) — TODO confirm.
x.shape