In [2]:
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors
from sklearn.impute import KNNImputer
from sklearn.metrics import log_loss
from sklearn.model_selection import RepeatedKFold
%matplotlib inline

In [3]:
df = pd.read_csv('train.csv')
df

Unnamed: 0,reviews_content,category
0,airplane ! is considered among many to be the ...,positive
1,you've got to love disney . \nno matter what t...,positive
2,""" the tailor of panama "" is a different kind ...",positive
3,"the characters in jonathan lynn's "" the whole ...",negative
4,"vikings v . bears ? \nno , this isn't the line...",negative
...,...,...
1495,"trekkies , roger nygard's energetic and hilari...",positive
1496,""" dangerous beauty "" is a really nothing more...",positive
1497,starring shawnee smith ; donovan leitch ; rick...,negative
1498,"man , this was one wierd movie . \nsimilar to ...",negative


In [4]:
df['reviews_content'][0]

'airplane ! is considered among many to be the epitome of satire film-making . \nafter all , it\'s brought to us by one of the best known satire writing/directing teams . \neven if most people don\'t recognize the names behind the films , they are bound to recognize the titles : airplane ! , top secret , the naked gun , and hot shots to name a few . \nbut although the zucker/abrahams/zucker team was first introduced with the kentucky fried movie in 1977 , airplane ! \nremains the true cornerstone of their work , and their directorial debuts . \nin the seventies , disaster films seemed to be at an all time high . \nfilms like earthquake , the towering inferno , and the poseidon adventure were big hits . \nthere was also a series about the disasters that can arise when traveling by plane - a series that spanned the entire decade . \nand so , in 1980 , we were introduced to a new airplane disaster film . \nthis time the disaster had nothing to do with a maniacal hijacker or crashing into 

In [5]:
df.isnull().sum()

reviews_content    0
category           0
dtype: int64

In [6]:
df['category'].value_counts()

positive    752
negative    748
Name: category, dtype: int64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 2 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   reviews_content  1500 non-null   object
 1   category         1500 non-null   object
dtypes: object(2)
memory usage: 23.6+ KB


In [12]:
import re

def clean_text(text):
    text = text.lower()
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    text = re.sub('\W', ' ', text)
    text = re.sub('\s+', ' ', text)
    text = re.sub(r'\n', '', text)
    text = text.strip(' ')
    return text

In [13]:
df['reviews_content'] = df['reviews_content'].map(lambda com : clean_text(com))
df['reviews_content'][0]

'airplane is considered among many to be the epitome of satire film making after all it brought to us by one of the best known satire writing directing teams even if most people do not recognize the names behind the films they are bound to recognize the titles airplane top secret the naked gun and hot shots to name a few but although the zucker abrahams zucker team was first introduced with the kentucky fried movie in 1977 airplane remains the true cornerstone of their work and their directorial debuts in the seventies disaster films seemed to be at an all time high films like earthquake the towering inferno and the poseidon adventure were big hits there was also a series about the disasters that can arise when traveling by plane a series that spanned the entire decade and so in 1980 we were introduced to a new airplane disaster film this time the disaster had nothing to do with a maniacal hijacker or crashing into the ocean it had to do with bad fish airplane is the story of ted strik

In [16]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

stop_words = stopwords.words('english')
df['reviews_content'] = df['reviews_content'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
df['reviews_content'][0]

'airplane considered among many epitome satire film making brought us one best known satire writing directing teams even people recognize names behind films bound recognize titles airplane top secret naked gun hot shots name although zucker abrahams zucker team first introduced kentucky fried movie 1977 airplane remains true cornerstone work directorial debuts seventies disaster films seemed time high films like earthquake towering inferno poseidon adventure big hits also series disasters arise traveling plane series spanned entire decade 1980 introduced new airplane disaster film time disaster nothing maniacal hijacker crashing ocean bad fish airplane story ted striker robert hays ex fighter pilot never gotten fact decision make midst war led death six er seven comrades unable stop living past ted world fell apart spent time moving city city without ever stable job eventually begin film getting dumped lover flight attendant elaine dickinson julie hagerty desperate attempt lose ted buy

In [22]:
y = np.array(df['category'])
X = df['reviews_content']

X.shape, y.shape

((1500,), (1500,))

In [23]:
X

0       airplane considered among many epitome satire ...
1       got love disney matter serve guaranteed succes...
2       tailor panama different kind spy movie despite...
3       characters jonathan lynn whole nine yards yet ...
4       vikings v bears lineup monday night football r...
                              ...                        
1495    trekkies roger nygard energetic hilarious docu...
1496    dangerous beauty really nothing grandiose soap...
1497    starring shawnee smith donovan leitch ricky pa...
1498    man one wierd movie similar conspiracy theory ...
1499    review ghost dog way samurai 1999 cast forest ...
Name: reviews_content, Length: 1500, dtype: object

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from keras import layers
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential

In [26]:
# Inisialisasi Tokenizer
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")

# Fitting tokenizer pada teks
tokenizer.fit_on_texts(X)

# Mengubah teks menjadi sequence
sequences = tokenizer.texts_to_sequences(X)

# Padding sequences untuk memastikan panjang yang seragam
padded_sequences = pad_sequences(sequences, padding='post')

In [27]:
padded_sequences

array([[2474, 1299,  499, ...,    0,    0,    0],
       [ 181,   44,  318, ...,    0,    0,    0],
       [   1,    1,  193, ...,    0,    0,    0],
       ...,
       [ 680,    1,  496, ...,    0,    0,    0],
       [  31,    3,    1, ...,    0,    0,    0],
       [ 342, 1319,  401, ...,    0,    0,    0]])

In [28]:
labels_encoded = [1 if label == 'positive' else 0 for label in y]

In [40]:
labels_encoded

array([1, 1, 1, ..., 0, 0, 1])

In [39]:
labels_encoded = np.array(labels_encoded)

In [41]:
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels_encoded, test_size=0.2, random_state=42)

X_train.shape, X_test.shape, len(y_train), len(y_test)

((1200, 1173), (300, 1173), 1200, 300)

In [45]:
from keras.layers import Dropout

# Create model
# Create model
model = Sequential([
    Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=16, input_length=padded_sequences.shape[1]),
    GlobalAveragePooling1D(),
    Dense(32, activation='relu'),
    Dropout(0.2),
    Dense(16, activation='relu'),
    Dropout(0.2),
    Dense(1, activation='sigmoid')
])

# Kompilasi model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Ringkasan model
model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 1173, 16)          558304    
                                                                 
 global_average_pooling1d_5   (None, 16)               0         
 (GlobalAveragePooling1D)                                        
                                                                 
 dense_13 (Dense)            (None, 32)                544       
                                                                 
 dropout_8 (Dropout)         (None, 32)                0         
                                                                 
 dense_14 (Dense)            (None, 16)                528       
                                                                 
 dropout_9 (Dropout)         (None, 16)                0         
                                                      

In [47]:
# training the model

model.fit(X_train, y_train, epochs=26)

# evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print('Accuracy: %2f' % (accuracy*100))

Epoch 1/26
Epoch 2/26
Epoch 3/26
Epoch 4/26
Epoch 5/26
Epoch 6/26
Epoch 7/26
Epoch 8/26
Epoch 9/26
Epoch 10/26
Epoch 11/26
Epoch 12/26
Epoch 13/26
Epoch 14/26
Epoch 15/26
Epoch 16/26
Epoch 17/26
Epoch 18/26
Epoch 19/26
Epoch 20/26
Epoch 21/26
Epoch 22/26
Epoch 23/26
Epoch 24/26
Epoch 25/26
Epoch 26/26
Accuracy: 83.999997


In [52]:
# Teks baru untuk prediksi (sudah di-stem)
test = pd.read_csv('test.csv')
test.shape
# shape = 500,1

(500, 1)

In [53]:
test.isna().sum()

reviews_content    0
dtype: int64

In [56]:
test['reviews_content'] = test['reviews_content'].map(lambda com : clean_text(com))
test['reviews_content'] = test['reviews_content'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
test_sequences = tokenizer.texts_to_sequences(test['reviews_content'])
test_padded = pad_sequences(test_sequences, padding='post', maxlen=padded_sequences.shape[1])

In [57]:
predictions = model.predict(test_padded)



In [58]:
predictions

array([[9.70628023e-01],
       [9.99269783e-01],
       [9.92076993e-01],
       [5.56714892e-01],
       [9.53499496e-01],
       [9.95880127e-01],
       [4.20221277e-02],
       [7.99202025e-01],
       [9.99635339e-01],
       [6.25783727e-02],
       [9.92637157e-01],
       [9.99719322e-01],
       [8.96980643e-01],
       [2.03000905e-04],
       [1.04675512e-03],
       [4.08483446e-01],
       [9.99995053e-01],
       [3.19874100e-03],
       [8.01676154e-01],
       [6.65995240e-01],
       [9.99802709e-01],
       [6.55419251e-04],
       [3.95284005e-05],
       [1.98510970e-05],
       [9.99932051e-01],
       [9.61971641e-01],
       [3.94105524e-01],
       [9.88274753e-01],
       [8.59084502e-02],
       [9.99925852e-01],
       [9.99429405e-01],
       [1.13307615e-03],
       [9.87504027e-04],
       [9.99999881e-01],
       [8.86806461e-04],
       [6.29561394e-02],
       [9.99932051e-01],
       [3.10528667e-05],
       [1.52554393e-01],
       [9.86973524e-01],


In [59]:
# Ambang batas
threshold = 0.5

# Mengonversi probabilitas menjadi label kelas
predicted_labels = ['positive' if prob > threshold else 'negative' for prob in predictions]
predictions = pd.DataFrame(predicted_labels, columns=['category'])
predictions


Unnamed: 0,category
0,positive
1,positive
2,positive
3,positive
4,positive
...,...
495,positive
496,positive
497,positive
498,positive


In [60]:
predictions.value_counts()

category
negative    255
positive    245
dtype: int64

In [61]:
predictions.to_csv('BDC007_submission.csv', index=False)