In [1]:
from gensim.models import FastText
from gensim.models import fasttext
import numpy as np
import pandas as pd
import io, os
import nltk, time, re, string, pickle
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory, StopWordRemover, ArrayDictionary
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.model_selection import train_test_split
from sklearn import metrics
import lightgbm as lgb
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [2]:
import ast 
def get_file():
    with open('dataset/normalisasi.txt') as f:
        data_normalisai = f.read()
    normalization_words = ast.literal_eval(data_normalisai)

    with open('dataset/stopwords.txt') as f:
        data_stopwords = f.read()
        stopwords = ast.literal_eval(data_stopwords)

    return normalization_words, stopwords

normalization_words, stopwords = get_file()

def normalisasi(texts):
    finalText = []
    splitted_text = texts.split()
    for text in splitted_text:
        if text in normalization_words:
            finalText.append(normalization_words[text])
        else:
            finalText.append(text)
      
    return " ".join(finalText)

def hapus_stopword(text):
#     stopword_factory = stopwords

    sw_dict = ArrayDictionary(stopwords)
    temp = StopWordRemover(sw_dict)

    text = temp.remove(text)
    return text

# def tokenize(text):
#     text = nltk.tokenize.word_tokenize(text)
#     return text

def hapus_duplikasi_kata(text):
    res = []
    text = text.split()
    for i in text:
        if i in res:
            text.remove(i)
        else:
            res.append(i)
    return " ".join(text)

def stemming(text):
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    text = stemmer.stem(text)
    return text

def case_folding(text):
    text = text.lower()
    # remove space in front of and at the end text
    text = text.strip()
    # remove space
    text = re.sub(r'\s+', ' ', text)
    # remove number
    text = re.sub(r"\d+", " ", text)
    # remove punctuation
    for i in text:
        if i in list(string.punctuation):
            text = text.replace(i, " ")

    return text

def preprocessing_data(opinion):
    opinion = case_folding(opinion)
    opinion = normalisasi(opinion)
    opinion = hapus_stopword(opinion)
    opinion = hapus_duplikasi_kata(opinion)
    opinion = stemming(opinion)
    return opinion

In [73]:
train=pd.read_excel("DataKuesioner_Done.xlsx")
train.head()

Unnamed: 0,Aspect,Sentiment,Tempat,Opinion
0,Tempat,Positif,4,Karena tempatnya memadai dan antrian teratur
1,Tempat,Negatif,2,lumayan berdesakan dan tdk menerapkan social d...
2,Tempat,Positif,3,terkadang terlalu ramai
3,Tempat,Positif,4,Tempat bersih dan luas
4,Tempat,Positif,4,Tempat vaksinasi sangat layak karena pihak pen...


In [4]:
test=pd.read_excel("app/dataset/DatasetMedsosPlus.xlsx")
test.head()

Unnamed: 0,Opinion,Aspect,Sentiment
0,Vaksinasi saat ini sudah mudah ditemukan diman...,Informasi,Positif
1,banyak infonya,Informasi,Positif
2,Karena dapat info vaksin hanya dari kenalan yg...,Informasi,Negatif
3,Jd ak puas km ak akhirnya tny ke temenku dan d...,Informasi,Positif
4,Puas krn informasi udh bnyk beredar d twitter ...,Informasi,Positif


# Fasttext

In [70]:
opinion = [preprocessing_data(sentence) for sentence in train.Opinion]

In [6]:
sentences = []
for index, row in train.iterrows():
    sentences.append(preprocessing_data(row['Opinion']))
new_sentences = [line.rstrip().split() for line in sentences]

In [7]:
ftmodel = FastText(vector_size=4, window=3, min_count=1,workers=4,sg=0,hs=0)  # instantiate
ftmodel.build_vocab(new_sentences)
ftmodel.train(new_sentences, total_examples=len(new_sentences), epochs=1000)  # train

(7649795, 13717000)

In [7]:
# ftmodel.wv.most_similar("sertif")
'covid' in ftmodel.wv.key_to_index

True

In [8]:
'pedulilindungi' in ftmodel.wv.key_to_index

True

In [85]:
vec = ftmodel.wv['pedulilindungi']
vec

array([ 1.6861366 ,  5.658009  , -0.10122687, -0.2918915 ], dtype=float32)

In [86]:
ftmodel.save("ft_model_K.fasttext")

In [None]:
ft = FastText.load("ft_model_K.fasttext").wv

In [89]:
'pedulilindungi' in ft.key_to_index

True

### Input to the classification model

In [105]:
def norm_vectorize(sentence):
    sentence = preprocessing_data(sentence)
    vecs = [ftmodel.wv[word] for word in word_tokenize(sentence)]
    norm_vecs=[vec/np.linalg.norm(vec) for vec in vecs if np.linalg.norm(vec) > 0]
    sent_vec=np.mean(norm_vecs, axis=0)
    return abs(sent_vec)

In [106]:
#Training
vecs_train = [norm_vectorize(sentence) for sentence in train.Opinion]
vectors_train = np.array(vecs_train, dtype=object)

#Testing
vecs = [norm_vectorize(sentence) for sentence in test.Opinion]
vectors = np.array(vecs)
vectors

array([[0.00557178, 0.07414879, 0.28766286, 0.75466776],
       [0.546922  , 0.09524092, 0.05875815, 0.3720318 ],
       [0.13127181, 0.16768269, 0.28982255, 0.8181853 ],
       ...,
       [0.03228225, 0.7340293 , 0.25319597, 0.09849533],
       [0.46890587, 0.7073706 , 0.07470596, 0.02498829],
       [0.52211255, 0.6371721 , 0.04989436, 0.19858241]], dtype=float32)

### Sentiment

In [107]:
X_train_sentiment = vectors_train
y_train_sentiment = train.Sentiment

X_test_sentiment = vectors
y_test_sentiment = test.Sentiment

X_train_sentiment.shape, X_test_sentiment.shape, y_train_sentiment.shape, y_test_sentiment.shape

((2678, 4), (558, 4), (2678,), (558,))

### LGBM

In [108]:
LGBMmodel = lgb.LGBMClassifier(boosting_type='dart', max_depth=4,num_leaves=8,n_estimators=200)
LGBMmodel.fit(X_train_sentiment, y_train_sentiment) 

LGBMClassifier(boosting_type='dart', max_depth=4, n_estimators=200,
               num_leaves=8)

In [109]:
y_pred=LGBMmodel.predict(X_test_sentiment)

print("Accuracy:",metrics.accuracy_score(y_test_sentiment, y_pred))
print("Precision:",metrics.precision_score(y_test_sentiment, y_pred, average='micro'))
print("Recall:",metrics.recall_score(y_test_sentiment, y_pred, average='micro'))
print("F-1Score:",metrics.f1_score(y_test_sentiment, y_pred, average='micro'))

tn, fp, fn, tp = confusion_matrix(y_test_sentiment, y_pred).ravel()
(tn, fp, fn, tp)

Accuracy: 0.7544802867383512
Precision: 0.37859712230215825
Recall: 0.49763593380614657
F-1Score: 0.43003064351378956


(0, 135, 2, 421)

### RF

In [110]:
#Import Random Forest Model
RFModel=RandomForestClassifier(max_depth=40, min_samples_split=4)
RFModel.fit(X_train_sentiment,y_train_sentiment)

RandomForestClassifier(max_depth=40, min_samples_split=4)

In [111]:
#Predict the response for test dataset
y_pred=RFModel.predict(X_test_sentiment)

In [112]:
print("Accuracy:",metrics.accuracy_score(y_test_sentiment, y_pred))
print("Precision:",metrics.precision_score(y_test_sentiment, y_pred, average='micro'))
print("Recall:",metrics.recall_score(y_test_sentiment, y_pred, average='micro'))
print("F-1Score:",metrics.f1_score(y_test_sentiment, y_pred, average='micro'))

tn, fp, fn, tp = confusion_matrix(y_test_sentiment, y_pred).ravel()
(tn, fp, fn, tp)

Accuracy: 0.7473118279569892
Precision: 0.5835227272727272
Recall: 0.5231678486997636
F-1Score: 0.4985947806137081


(12, 123, 18, 405)

### SVM

In [113]:
SVMmodel = svm.SVC(probability=True)
SVMmodel.fit(X_train_sentiment, y_train_sentiment)

SVC(probability=True)

In [114]:
y_pred = SVMmodel.predict(X_test_sentiment)

In [115]:
print("Accuracy:",metrics.accuracy_score(y_test_sentiment, y_pred))
print("Precision:",metrics.precision_score(y_test_sentiment, y_pred, average='micro'))
print("Recall:",metrics.recall_score(y_test_sentiment, y_pred, average='micro'))
print("F-1Score:",metrics.f1_score(y_test_sentiment, y_pred, average='micro'))

tn, fp, fn, tp = confusion_matrix(y_test_sentiment, y_pred).ravel()
(tn, fp, fn, tp)

Accuracy: 0.7580645161290323
Precision: 0.3790322580645161
Recall: 0.5
F-1Score: 0.43119266055045874


  _warn_prf(average, modifier, msg_start, len(result))


(0, 135, 0, 423)

### NB

In [116]:
NBModel = MultinomialNB()
NBModel.fit(X_train_sentiment, y_train_sentiment)

MultinomialNB()

In [117]:
y_pred_nb = NBModel.predict(X_test_sentiment)

In [118]:
print("Accuracy:",metrics.accuracy_score(y_test_sentiment, y_pred_nb))
print("Precision:",metrics.precision_score(y_test_sentiment, y_pred_nb, average='micro'))
print("Recall:",metrics.recall_score(y_test_sentiment, y_pred_nb, average='micro'))
print("F-1Score:",metrics.f1_score(y_test_sentiment, y_pred_nb, average='micro'))

tn, fp, fn, tp = confusion_matrix(y_test_sentiment, y_pred).ravel()
(tn, fp, fn, tp)

Accuracy: 0.7580645161290323
Precision: 0.3790322580645161
Recall: 0.5
F-1Score: 0.43119266055045874


  _warn_prf(average, modifier, msg_start, len(result))


(0, 135, 0, 423)

### Aspect

In [119]:
X_train_aspect = vectors_train
y_train_aspect = train.Aspect

X_test_aspect = vectors
y_test_aspect = test.Aspect

X_train_aspect.shape, X_test_aspect.shape, y_train_aspect.shape, y_test_aspect.shape

((2678, 4), (558, 4), (2678,), (558,))

### LGBM

In [120]:
LGBAspectMmodel = lgb.LGBMClassifier(boosting_type='dart', max_depth=4,num_leaves=8,n_estimators=200)
LGBAspectMmodel.fit(X_train_aspect, y_train_aspect)

LGBMClassifier(boosting_type='dart', max_depth=4, n_estimators=200,
               num_leaves=8)

In [121]:
y_aspect_pred=LGBAspectMmodel.predict(X_test_aspect)

In [122]:
print("Accuracy:",metrics.accuracy_score(y_test_aspect, y_aspect_pred))
print("Precision:",metrics.precision_score(y_test_aspect, y_aspect_pred, average='micro'))
print("Recall:",metrics.recall_score(y_test_aspect, y_aspect_pred, average='micro'))
print("F-1Score:",metrics.f1_score(y_test_aspect, y_aspect_pred, average='micro'))


print(confusion_matrix(y_test_aspect,y_aspect_pred))

Accuracy: 0.5752688172043011
Precision: 0.5787988716661033
Recall: 0.5703269213091183
F-1Score: 0.5699771922118981
[[95  1 14  9  3]
 [ 2 43 16 11 35]
 [12  5 68  3 12]
 [ 2 14  9 44 35]
 [ 1 18 11 24 71]]


### RF

In [123]:
RFAspectModel=RandomForestClassifier(max_depth=40, min_samples_split=4)
RFAspectModel.fit(X_train_aspect,y_train_aspect)

RandomForestClassifier(max_depth=40, min_samples_split=4)

In [124]:
y_aspect_pred=RFAspectModel.predict(X_test_aspect)

In [125]:
print("Accuracy:",metrics.accuracy_score(y_test_aspect, y_aspect_pred))
print("Precision:",metrics.precision_score(y_test_aspect, y_aspect_pred, average='micro'))
print("Recall:",metrics.recall_score(y_test_aspect, y_aspect_pred, average='micro'))
print("F-1Score:",metrics.f1_score(y_test_aspect, y_aspect_pred, average='micro'))

print(confusion_matrix(y_test_aspect,y_aspect_pred))

Accuracy: 0.5734767025089605
Precision: 0.5719923543187767
Recall: 0.5721742654771306
F-1Score: 0.571988420947909
[[97  3 12  4  6]
 [ 4 51 10 16 26]
 [12  7 67  7  7]
 [ 4 19  5 49 27]
 [ 3 32  8 26 56]]


### SVM

In [126]:
SVMAspectModel = svm.SVC(probability=True)
SVMAspectModel.fit(X_train_aspect, y_train_aspect)

SVC(probability=True)

In [127]:
y_aspect_pred = SVMAspectModel.predict(X_test_aspect)

In [128]:
print("Accuracy:",metrics.accuracy_score(y_test_aspect, y_aspect_pred))
print("Precision:",metrics.precision_score(y_test_aspect, y_aspect_pred, average='micro'))
print("Recall:",metrics.recall_score(y_test_aspect, y_aspect_pred, average='micro'))
print("F-1Score:",metrics.f1_score(y_test_aspect, y_aspect_pred, average='micro'))

print(confusion_matrix(y_test_aspect,y_aspect_pred))

Accuracy: 0.578853046594982
Precision: 0.5880874997963605
Recall: 0.5771117724010324
F-1Score: 0.5794394941933981
[[96  2 12  4  8]
 [ 3 52 11  8 33]
 [ 9  8 70  2 11]
 [ 2 18  7 45 32]
 [ 1 32 12 20 60]]


### NB

In [129]:
NBAspectModel = MultinomialNB()
NBAspectModel.fit(X_train_aspect, y_train_aspect)

MultinomialNB()

In [130]:
y_aspect_pred = NBAspectModel.predict(X_test_aspect)

In [131]:
print("Accuracy:",metrics.accuracy_score(y_test_aspect, y_aspect_pred))
print("Precision:",metrics.precision_score(y_test_aspect, y_aspect_pred, average='micro'))
print("Recall:",metrics.recall_score(y_test_aspect, y_aspect_pred, average='micro'))
print("F-1Score:",metrics.f1_score(y_test_aspect, y_aspect_pred, average='micro'))

print(confusion_matrix(y_test_aspect,y_aspect_pred))

Accuracy: 0.514336917562724
Precision: 0.4753905294830264
Recall: 0.49913442622950815
F-1Score: 0.46675593568507984
[[100   0   3   9  10]
 [  6   0   6  12  83]
 [ 12   0  48   7  33]
 [  3   0   1  52  48]
 [ 12   0   3  23  87]]


  _warn_prf(average, modifier, msg_start, len(result))


# Pretrained FastText

### Just load the pretrained but can't extend train

In [5]:
from gensim.models import FastText
ftpremodel = fasttext.load_facebook_vectors("cc.id.300.bin.gz")

In [None]:
# cek kata OOV (Out of Vocab)
'pedulilindungi' in ftpremodel.key_to_index

### Extend Training

In [50]:
from gensim.models import fasttext

In [25]:
fb_model = fasttext.load_facebook_model("cc.id.300.bin.gz")

In [52]:
'covid' in fb_model.wv.key_to_index

False

In [26]:
# Load the data
new_df=test
sentences = [preprocessing_data(r['Opinion']) for i,r in new_df.iterrows()]
new_sentences= [line.rstrip().split() for line in sentences]

In [27]:
fb_model.build_vocab(new_sentences, update=True)
fb_model.train(new_sentences, total_examples=len(new_sentences), epochs=fb_model.epochs)

(657, 3110)

In [45]:
'pedulilindungi' in fb_model.wv.key_to_index

True

In [73]:
iv_term = fb_model.wv['pedulilindungi']
iv_term

array([ 6.35805307e-03,  2.02784175e-03,  3.36957537e-02,  4.61445712e-02,
       -6.47531776e-03, -2.52631977e-02, -9.50048736e-04,  3.95988766e-03,
       -1.03563005e-02, -7.88780153e-02, -6.19595777e-03,  5.14757540e-03,
       -1.20040271e-02, -2.71068346e-02,  9.76053067e-03,  1.34323891e-02,
        1.16706146e-02, -3.95376701e-03,  2.39801016e-02,  9.34991986e-03,
       -4.92930152e-02, -1.03336032e-02,  1.03500271e-02,  6.14447566e-03,
       -7.61731202e-03, -8.82517826e-03,  4.00772411e-03, -7.30433362e-03,
        1.53391045e-02, -6.56296127e-03,  1.64408851e-02, -3.30553181e-03,
        2.27560308e-02, -4.63086274e-03,  1.77889094e-02,  8.45407369e-04,
       -4.96898359e-03,  9.92843974e-03,  2.00049742e-03,  1.35488641e-02,
        2.61881948e-02, -8.92075337e-03, -5.07425703e-03,  6.21333194e-04,
        1.80275366e-02,  1.02109034e-02,  2.80487002e-03, -3.77324224e-03,
        9.48821846e-03, -1.31758712e-02, -7.63672870e-03,  2.99243513e-03,
       -6.04465837e-03, -

In [None]:
fb_model.save("extended_pretrain_ft_model.fasttext")

In [None]:
ft = FastText.load("extended_pretrain_ft_model.fasttext").wv

### Input to classification model

In [132]:
# def norm_vectorize_pretrained(sentence):
#     sentence = preprocessing_data(sentence)
#     vecs = [fb_model.wv[word] for word in sentence]
#     norm_vecs=[vec/np.linalg.norm(vec) for vec in vecs if np.linalg.norm(vec) > 0]
#     sent_vec=np.mean(norm_vecs, axis=0)
#     return sent_vec

def norm_vectorize_pretrained(sentence):
    sentence = preprocessing_data(sentence)
    vecs = [fb_model.wv[word] for word in sentence]
    norm_vecs=[vec/np.linalg.norm(vec) for vec in vecs if np.linalg.norm(vec) > 0]
    sent_vec=abs(np.mean(norm_vecs, axis=0))
    return sent_vec

In [133]:
#Training
vecs_train = [norm_vectorize_pretrained(sentence) for sentence in train.Opinion]
vectors_train = np.array(vecs_train)

#Testing
vecs = [norm_vectorize_pretrained(sentence) for sentence in test.Opinion]
vectors = np.array(vecs)
vectors

array([[0.03218957, 0.05214153, 0.04191422, ..., 0.05862442, 0.00639089,
        0.06410597],
       [0.02933587, 0.05753453, 0.04074798, ..., 0.06371552, 0.00401158,
        0.0466414 ],
       [0.03429855, 0.05343883, 0.03938835, ..., 0.06169917, 0.00141222,
        0.06059257],
       ...,
       [0.03610053, 0.05608287, 0.03832653, ..., 0.05851021, 0.00088633,
        0.05546058],
       [0.0281581 , 0.05824663, 0.03507378, ..., 0.06291854, 0.0031361 ,
        0.05602774],
       [0.03203309, 0.05211883, 0.0370527 , ..., 0.05664467, 0.00704691,
        0.0699681 ]], dtype=float32)

### Sentiment

In [134]:
X_train_sentiment_pretrained = vectors_train
y_train_sentiment_pretrained = train.Sentiment

X_test_sentiment_pretrained = vectors
y_test_sentiment_pretrained = test.Sentiment

# X_train_sentiment_pretrained, X_test_sentiment_pretrained, y_train_sentiment_pretrained, y_test_sentiment_pretrained = train_test_split(X,y,test_size=0.2,stratify=y,random_state=42)
X_train_sentiment_pretrained.shape, X_test_sentiment_pretrained.shape, y_train_sentiment_pretrained.shape, y_test_sentiment_pretrained.shape

((2678, 300), (558, 300), (2678,), (558,))

### LGBM

In [135]:
LGBMmodel = lgb.LGBMClassifier(boosting_type='dart', learning_rate=0.8, max_depth=10,num_leaves=10)
LGBMmodel.fit(X_train_sentiment_pretrained, y_train_sentiment_pretrained)

LGBMClassifier(boosting_type='dart', learning_rate=0.8, max_depth=10,
               num_leaves=10)

In [136]:
y_pred=LGBMmodel.predict(X_test_sentiment_pretrained)

In [137]:
print("Accuracy:",metrics.accuracy_score(y_test_sentiment_pretrained, y_pred))
print("Precision:",metrics.precision_score(y_test_sentiment_pretrained, y_pred, average='macro'))
print("Recall:",metrics.recall_score(y_test_sentiment_pretrained, y_pred, average='macro'))
print("F-1 Score:",metrics.f1_score(y_test_sentiment_pretrained, y_pred, average='macro'))

tn, fp, fn, tp = confusion_matrix(y_test_sentiment_pretrained, y_pred).ravel()
(tn, fp, fn, tp)


Accuracy: 0.7526881720430108
Precision: 0.6258266620257571
Recall: 0.5569739952718676
F-1 Score: 0.5548387096774194


(24, 111, 27, 396)

### RF

In [138]:
RFModel=RandomForestClassifier(max_depth=40, min_samples_split=4)
RFModel.fit(X_train_sentiment_pretrained,y_train_sentiment_pretrained)

RandomForestClassifier(max_depth=40, min_samples_split=4)

In [139]:
y_pred=RFModel.predict(X_test_sentiment_pretrained)

In [140]:
print("Accuracy:",metrics.accuracy_score(y_test_sentiment_pretrained, y_pred))
print("Precision:",metrics.precision_score(y_test_sentiment_pretrained, y_pred, average='macro'))
print("Recall:",metrics.recall_score(y_test_sentiment_pretrained, y_pred, average='macro'))
print("F-1 Score:",metrics.f1_score(y_test_sentiment_pretrained, y_pred, average='macro'))

tn, fp, fn, tp = confusion_matrix(y_test_sentiment_pretrained, y_pred).ravel()
(tn, fp, fn, tp)

Accuracy: 0.7670250896057348
Precision: 0.7305575158786168
Recall: 0.5286052009456265
F-1 Score: 0.49366205048023226


(9, 126, 4, 419)

### SVM

In [141]:
SVMmodel = svm.SVC(probability=True)
SVMmodel.fit(X_train_sentiment_pretrained, y_train_sentiment_pretrained)

SVC(probability=True)

In [142]:
y_pred = SVMmodel.predict(X_test_sentiment_pretrained)

In [143]:
print("Accuracy:",metrics.accuracy_score(y_test_sentiment_pretrained, y_pred))
print("Precision:",metrics.precision_score(y_test_sentiment_pretrained, y_pred, average='macro'))
print("Recall:",metrics.recall_score(y_test_sentiment_pretrained, y_pred, average='macro'))
print("F-1 Score:",metrics.f1_score(y_test_sentiment_pretrained, y_pred, average='macro'))

tn, fp, fn, tp = confusion_matrix(y_test_sentiment_pretrained, y_pred).ravel()
(tn, fp, fn, tp)

Accuracy: 0.7580645161290323
Precision: 0.3790322580645161
Recall: 0.5
F-1 Score: 0.43119266055045874


  _warn_prf(average, modifier, msg_start, len(result))


(0, 135, 0, 423)

### NB

In [144]:
NBModel = MultinomialNB()
NBModel.fit(X_train_sentiment_pretrained, y_train_sentiment_pretrained)

MultinomialNB()

In [145]:
y_pred = NBModel.predict(X_test_sentiment_pretrained)

In [146]:
print("Accuracy:",metrics.accuracy_score(y_test_sentiment_pretrained, y_pred))
print("Precision:",metrics.precision_score(y_test_sentiment_pretrained, y_pred, average='macro'))
print("Recall:",metrics.recall_score(y_test_sentiment_pretrained, y_pred, average='macro'))
print("F-1 Score:",metrics.f1_score(y_test_sentiment_pretrained, y_pred, average='macro'))

tn, fp, fn, tp = confusion_matrix(y_test_sentiment_pretrained, y_pred).ravel()
(tn, fp, fn, tp)

Accuracy: 0.7580645161290323
Precision: 0.3790322580645161
Recall: 0.5
F-1 Score: 0.43119266055045874


  _warn_prf(average, modifier, msg_start, len(result))


(0, 135, 0, 423)

### Aspect

In [147]:
X_train_aspect_pretrained = vectors_train
y_train_aspect_pretrained =  train.Aspect

X_test_aspect_pretrained = vectors
y_test_aspect_pretrained = test.Aspect

# X_train_aspect_pretrained, X_test_aspect_pretrained, y_train_aspect_pretrained, y_test_aspect_pretrained = train_test_split(X,y,test_size=0.2,stratify=y,random_state=42)
X_train_aspect_pretrained.shape, X_test_aspect_pretrained.shape, y_train_aspect_pretrained.shape, y_test_aspect_pretrained.shape

((2678, 300), (558, 300), (2678,), (558,))

### LGBM

In [148]:
LGBAspectMmodel = lgb.LGBMClassifier(boosting_type='dart', max_depth=4,num_leaves=8,n_estimators=200)
LGBAspectMmodel.fit(X_train_aspect_pretrained, y_train_aspect_pretrained)

LGBMClassifier(boosting_type='dart', max_depth=4, n_estimators=200,
               num_leaves=8)

In [149]:
y_aspect_pred=LGBAspectMmodel.predict(X_test_aspect_pretrained)

In [150]:
print("Accuracy:",metrics.accuracy_score(y_test_aspect_pretrained, y_aspect_pred))
print("Precision:",metrics.precision_score(y_test_aspect_pretrained, y_aspect_pred, average='macro'))
print("Recall:",metrics.recall_score(y_test_aspect_pretrained, y_aspect_pred, average='macro'))
print("F-1Score:",metrics.f1_score(y_test_aspect_pretrained, y_aspect_pred, average='macro'))
print("Precision per class:",metrics.precision_score(y_test_aspect_pretrained, y_aspect_pred, average=None))

print(confusion_matrix(y_test_aspect_pretrained,y_aspect_pred))

Accuracy: 0.6200716845878136
Precision: 0.6251301207934691
Recall: 0.6205257899140848
F-1Score: 0.619942974456033
Precision per class: [0.8125     0.59813084 0.66956522 0.44545455 0.6       ]
[[78 10 11 10 13]
 [ 4 64  4 27  8]
 [ 2  0 77  8 13]
 [ 6 22  9 49 18]
 [ 6 11 14 16 78]]


### RF

In [151]:
#Create a Aspect Classifier
RFAspectModel=RandomForestClassifier(max_depth=40)
RFAspectModel.fit(X_train_aspect_pretrained,y_train_aspect_pretrained)

RandomForestClassifier(max_depth=40)

In [152]:
y_aspect_pred=RFAspectModel.predict(X_test_aspect_pretrained)

In [153]:
print("Accuracy:",metrics.accuracy_score(y_test_aspect_pretrained, y_aspect_pred))
print("Precision:",metrics.precision_score(y_test_aspect_pretrained, y_aspect_pred, average='macro'))
print("Recall:",metrics.recall_score(y_test_aspect_pretrained, y_aspect_pred, average='macro'))
print("F-1Score:",metrics.f1_score(y_test_aspect_pretrained, y_aspect_pred, average='macro'))
print("Precision per class:",metrics.precision_score(y_test_aspect_pretrained, y_aspect_pred, average=None))

print(confusion_matrix(y_test_aspect_pretrained,y_aspect_pred))

Accuracy: 0.6236559139784946
Precision: 0.6278641297218372
Recall: 0.6230353914508963
F-1Score: 0.6231700491010874
Precision per class: [0.80808081 0.53913043 0.66666667 0.46428571 0.66115702]
[[80 13  8 11 10]
 [ 5 62  7 26  7]
 [ 5  3 74  6 12]
 [ 3 27 10 52 12]
 [ 6 10 12 17 80]]


### SVM

In [154]:
SVMAspectModel = svm.SVC(probability=True)
SVMAspectModel.fit(X_train_aspect_pretrained, y_train_aspect_pretrained)

SVC(probability=True)

In [155]:
y_aspect_pred = SVMAspectModel.predict(X_test_aspect_pretrained)

In [156]:
print("Accuracy:",metrics.accuracy_score(y_test_aspect_pretrained, y_aspect_pred))
print("Precision:",metrics.precision_score(y_test_aspect_pretrained, y_aspect_pred, average='macro'))
print("Recall:",metrics.recall_score(y_test_aspect_pretrained, y_aspect_pred, average='macro'))
print("F-1Score:",metrics.f1_score(y_test_aspect_pretrained, y_aspect_pred, average='macro'))
print("Precision per class:",metrics.precision_score(y_test_aspect_pretrained, y_aspect_pred, average=None))

print(confusion_matrix(y_test_aspect_pretrained,y_aspect_pred))

Accuracy: 0.5770609318996416
Precision: 0.5821552608971158
Recall: 0.5750881191736102
F-1Score: 0.5754624762644891
Precision per class: [0.75       0.56989247 0.54545455 0.40983607 0.63559322]
[[78  6 12 17  9]
 [ 7 53 12 28  7]
 [ 3  6 66  9 16]
 [ 6 24 13 50 11]
 [10  4 18 18 75]]


### NB

In [157]:
NBAspectModel = MultinomialNB()
NBAspectModel.fit(X_train_aspect_pretrained, y_train_aspect_pretrained)

MultinomialNB()

In [158]:
y_aspect_pred = NBAspectModel.predict(X_test_aspect_pretrained)

In [159]:
print("Accuracy:",metrics.accuracy_score(y_test_aspect_pretrained, y_aspect_pred))
print("Precision:",metrics.precision_score(y_test_aspect_pretrained, y_aspect_pred, average='macro'))
print("Recall:",metrics.recall_score(y_test_aspect_pretrained, y_aspect_pred, average='macro'))
print("F-1Score:",metrics.f1_score(y_test_aspect_pretrained, y_aspect_pred, average='macro'))
print("Precision per class:",metrics.precision_score(y_test_aspect_pretrained, y_aspect_pred, average=None))

print(confusion_matrix(y_test_aspect_pretrained,y_aspect_pred))

Accuracy: 0.4014336917562724
Precision: 0.432216888038013
Recall: 0.3742715371651483
F-1Score: 0.3090408022005871
Precision per class: [0.48387097 0.42857143 0.53846154 0.36       0.35018051]
[[90  0  1  5 26]
 [25  3  2 29 48]
 [30  1  7  9 53]
 [19  3  2 27 53]
 [22  0  1  5 97]]
