<a href="https://colab.research.google.com/github/mygetzu/shopee_product_detection/blob/master/shopee_product_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Extract the dataset archive (sampleSubmission.csv, test.csv, train.csv) into the working directory.
# NOTE(review): the archive is read from ./gdrive, but drive.mount(...) is called in a later cell —
# on a fresh kernel this cell presumably needs the mount to have run first; confirm cell order.
!unzip "./gdrive/My Drive/shopee-sentiment-analysis_dataset.zip"

Archive:  ./gdrive/My Drive/shopee-sentiment-analysis_dataset.zip
  inflating: sampleSubmission.csv    
  inflating: test.csv                
  inflating: train.csv               


In [None]:
import pandas as pd
import re
from datetime import datetime
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

from google.colab import drive
# Mount Google Drive under /content/gdrive; the dataset archive is unzipped from a
# "gdrive/My Drive" path (presumably this same mount point — confirm the working directory).
drive.mount("/content/gdrive", force_remount=True)

Mounted at /content/gdrive


In [None]:
# Read the train and test splits from the CSV files in the working directory.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Number of rows in the training split.
len(df_train)

Unnamed: 0,review_id,review,rating
0,0,Ga disappointed neat products .. Meletot Hilsn...,1
1,1,"Rdtanya replace broken glass, broken chargernya",1
2,2,Nyesel bngt dsni shopping antecedent photo mes...,1
3,3,Sent a light blue suit goods ga want a refund,1
4,4,Pendants came with dents and scratches on its ...,1
...,...,...,...
146806,146806,Excellent product quality delivery speed is ve...,5
146807,146807,thanks gan,5
146808,146808,Awesome awesome quality merchandise value CP ...,5
146809,146809,Nice Packing boxes made effective price .........,5


In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn, stopwords, wordnet
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Fetch every NLTK resource used by SentimentAnalyzer, in the same order as before.
for nltk_package in (
    'vader_lexicon',
    'stopwords',
    'punkt',
    'averaged_perceptron_tagger',
    'wordnet',
    'sentiwordnet',
):
    nltk.download(nltk_package)

class SentimentAnalyzer:
    """Score the sentiment of a review text with VADER or SentiWordNet.

    Both public scorers (`get_vader_result`, `get_sentiwordnet`) work the same way:
    the text is reduced to lower-case alphabetic tokens, POS-tagged, stop words are
    dropped, each remaining token is scored in [-1, 1] via `get_aggregation`, the
    token scores are averaged, and the mean is rescaled to [0, 1] with (mean + 1) / 2.
    A text with no scorable token therefore yields 0.5.
    """

    def __init__(self):
        self.sentimentAnalyzer = SentimentIntensityAnalyzer()
        self.stop_words = set(stopwords.words('english'))
        self.lemmatizer = WordNetLemmatizer()

    def preprocess_text(self, text):
        """Return the POS-tagged tokens of `text` as a list of (token, tag) tuples.

        Steps: keep alphabetic runs only, lower-case, then tokenize and POS-tag.
        """
        # Special Character Filtering
        text_char_filter = self.specialchar_filtering_text(text)
        # To lower case
        text_lower = self.to_lowercase_text(text_char_filter)
        # Tokenized (and POS-tagged)
        return self.tokenize_text(text_lower)

    def specialchar_filtering_text(self, text):
        """Keep only runs of ASCII letters from `text`, joined by single spaces."""
        return " ".join(re.findall("[a-zA-Z]+", text))

    def to_lowercase_text(self, text):
        """Return `text` lower-cased."""
        return text.lower()

    def tokenize_text(self, text):
        """Tokenize `text` and return NLTK's POS-tagged (token, tag) pairs."""
        return nltk.pos_tag(word_tokenize(text))

    def lemmatize_text(self, text, pos_tag):
        """Lemmatize the single word `text` using the WordNet POS `pos_tag`."""
        return self.lemmatizer.lemmatize(text, pos_tag)

    def get_wordnet_pos_tag(self, tag):
        """Map the first letter of a POS tag to a WordNet POS constant.

        Unknown letters fall back to `wordnet.NOUN`.
        """
        tag_dict = {
            "J": wordnet.ADJ,
            "N": wordnet.NOUN,
            "V": wordnet.VERB,
            "R": wordnet.ADV,
        }

        return tag_dict.get(tag, wordnet.NOUN)

    def get_vader(self, word):
        """Return VADER's polarity score dict for the string `word`."""
        return self.sentimentAnalyzer.polarity_scores(word)

    def get_vader_result(self, text):
        """Return the mean per-token VADER sentiment of `text`, rescaled to [0, 1].

        Each non-stop-word token is scored on its own with VADER and folded into a
        signed value by `get_aggregation` (using VADER's 'neu' as the objectivity).
        """
        return self._mean_token_score(text, self._vader_token_score)

    def get_wordnet_degree(self, word):
        """Return SentiWordNet scores for a (token, tag) pair.

        The token is lemmatized with the WordNet POS derived from its tag and then
        looked up as '<lemma>.<pos>.03'.

        NOTE(review): '.03' selects the third sense of the lemma; lemmas with fewer
        senses fail the lookup and are skipped by the caller. Confirm this is
        intended rather than the first sense ('.01').

        Raises whatever the SentiWordNet lookup raises for an unknown synset.
        """
        pos_tag = self.get_wordnet_pos_tag(word[1][0])

        lemmatized = self.lemmatize_text(word[0], pos_tag)
        synset = swn.senti_synset('{0}.{1}.03'.format(lemmatized, pos_tag))

        return {
            'positive': synset.pos_score(),
            'negative': synset.neg_score(),
            'objective': synset.obj_score()
        }

    def get_aggregation(self, pos, neg, obj):
        """Fold positive/negative/objective scores into one signed value.

        The dominant polarity's share of (pos + neg) is taken, signed negative
        when `neg` dominates, and damped by the factor (1 - obj).
        Returns 0 when `pos == neg`.
        """
        if pos > neg:
            pos_wordnet = pos / (pos + neg)
            return pos_wordnet - (obj * pos_wordnet)
        if pos < neg:
            neg_wordnet = neg / (pos + neg) * -1
            return neg_wordnet - (obj * neg_wordnet)
        return 0

    def get_sentiwordnet(self, text):
        """Return the mean per-token SentiWordNet sentiment of `text`, rescaled to [0, 1].

        Tokens whose SentiWordNet lookup fails are skipped and do not count
        towards the mean.
        """
        return self._mean_token_score(text, self._sentiwordnet_token_score)

    def _vader_token_score(self, tagged_token):
        """Score one (token, tag) pair with VADER; returns a value in [-1, 1]."""
        # VADER expects the token string, not the (token, tag) tuple.
        degree = self.get_vader(tagged_token[0])
        return self.get_aggregation(degree['pos'], degree['neg'], degree['neu'])

    def _sentiwordnet_token_score(self, tagged_token):
        """Score one (token, tag) pair with SentiWordNet; returns a value in [-1, 1]."""
        degree = self.get_wordnet_degree(tagged_token)
        return self.get_aggregation(
            degree['positive'], degree['negative'], degree['objective'])

    def _mean_token_score(self, text, score_token):
        """Average `score_token` over the non-stop-word tokens of `text`.

        Tokens for which `score_token` raises are skipped (best effort).
        Returns the mean rescaled from [-1, 1] to [0, 1]; 0.5 if nothing was scored.
        """
        score_sum = 0
        scored_tokens = 0
        for tagged_token in self.preprocess_text(text):
            if tagged_token[0] in self.stop_words:
                continue
            try:
                token_score = score_token(tagged_token)
            except Exception:
                # Best effort: e.g. the lemma/sense is missing from SentiWordNet.
                # `Exception` (not a bare except) keeps KeyboardInterrupt working
                # while scoring large frames.
                continue
            score_sum += token_score
            scored_tokens += 1

        mean_score = score_sum / scored_tokens if scored_tokens > 0 else 0
        return (mean_score + 1) / 2

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package sentiwordnet to /root/nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!




In [None]:
# Test sentiment: smoke-test both scorers on the first training review.
sentiment_analyzer = SentimentAnalyzer()
first_review = df_train.iloc[0]['review']
wordnet_score = sentiment_analyzer.get_sentiwordnet(first_review)
vader_score = sentiment_analyzer.get_vader_result(first_review)
# Both scorers already return the (score + 1) / 2 rescaled value, so the scores
# are printed as returned; rescaling them again would distort them.
print("Normalized wordnet : ", wordnet_score)
print("Normalized vader : ", vader_score)

Normalized wordnet :  0.6041666666666666
Normalized vader :  0.5


In [None]:
# Score every training review. Only the 'review' column is needed, so apply the
# scorers to that Series directly instead of building a row object per record.
df_train['wordnet_score'] = df_train['review'].apply(sentiment_analyzer.get_sentiwordnet)
df_train['vader_score'] = df_train['review'].apply(sentiment_analyzer.get_vader_result)
df_train

Unnamed: 0,review_id,review,rating,wordnet_score,vader_score
0,0,Ga disappointed neat products .. Meletot Hilsn...,1,0.604167,0.5
1,1,"Rdtanya replace broken glass, broken chargernya",1,0.406250,0.5
2,2,Nyesel bngt dsni shopping antecedent photo mes...,1,0.468750,0.5
3,3,Sent a light blue suit goods ga want a refund,1,0.479167,0.5
4,4,Pendants came with dents and scratches on its ...,1,0.458333,0.5
...,...,...,...,...,...
146806,146806,Excellent product quality delivery speed is ve...,5,0.656250,0.5
146807,146807,thanks gan,5,0.500000,0.5
146808,146808,Awesome awesome quality merchandise value CP ...,5,0.645833,0.5
146809,146809,Nice Packing boxes made effective price .........,5,0.572917,0.5


In [None]:
# Score every test review. Only the 'review' column is needed, so apply the
# scorers to that Series directly instead of building a row object per record.
df_test['wordnet_score'] = df_test['review'].apply(sentiment_analyzer.get_sentiwordnet)
df_test['vader_score'] = df_test['review'].apply(sentiment_analyzer.get_vader_result)
df_test

Unnamed: 0,review_id,review,wordnet_score,vader_score
0,0,slow delivery,0.375000,0.5
1,1,Dateng goods do not conform pesanan😔,0.812500,0.5
2,2,PSN k its 20 other DTG,0.500000,0.5
3,3,I am expected that it have a frame and painted...,0.479167,0.5
4,4,The product quality is not good.,0.687500,0.5
...,...,...,...,...
62913,62913,Has shipped It was the model that fits the cut...,0.472222,0.5
62914,62914,Awesome awesome speed of the ship's value CP,0.500000,0.5
62915,62915,but already in lebihin one of its usual size w...,0.509615,0.5
62916,62916,The product quality is excellent.,0.531250,0.5


In [None]:
# Normalization
# Fit the min/max on the training features only and reuse that fitted scaler for
# the test features. Re-fitting on the test set would put the two splits on
# different scales, so a given test value would no longer mean the same thing
# to the classifier as the same train value.
feature_columns = ['wordnet_score', 'vader_score']
normalization = preprocessing.MinMaxScaler()
df_train[feature_columns] = normalization.fit_transform(df_train[feature_columns])
df_test[feature_columns] = normalization.transform(df_test[feature_columns])

X_train = df_train[feature_columns]
X_test = df_test[feature_columns]
y_train = df_train['rating']

In [None]:
# KNN Method

knn = KNeighborsClassifier(n_neighbors=5)

knn.fit(X_train, y_train)
y_predict = knn.predict(X_test)
print(df_test)

# y_predict holds one label per row of X_test, in row order, so assign it
# positionally. (Indexing y_predict by each row's index label only works while
# df_test keeps its default 0..n-1 index.)
df_test['rating'] = y_predict

submission = df_test[['review_id', 'rating']]
# submission.to_csv('sample_data/submission_product_sentiment_3.csv', index=None)

       review_id  ... vader_score
0              0  ...         0.0
1              1  ...         0.0
2              2  ...         0.0
3              3  ...         0.0
4              4  ...         0.0
...          ...  ...         ...
62913      62913  ...         0.0
62914      62914  ...         0.0
62915      62915  ...         0.0
62916      62916  ...         0.0
62917      62917  ...         0.0

[62918 rows x 4 columns]
