In [1]:
import pandas as pd
import nltk
import sys

### Load CSV

In [2]:
data_train = pd.read_csv('data_train.csv', encoding='latin-1')

In [3]:
data_test = pd.read_csv('data_test.csv',  encoding='latin-1')

In [4]:
data_test_w_sent = pd.read_csv('data_test_with_sentiment.csv',  encoding='latin-1')

### Sampling 
##### Because the data is not balanced

In [5]:
train_sample = data_train.loc[data_train['reviews.sentiment'] == 'negative'].sample(n=1942, random_state=1234)
train_sample_positive = data_train.loc[data_train['reviews.sentiment'] == 'positive'].sample(n=1942, random_state=1234)
train_sample = train_sample.append(train_sample_positive)

### Word Preparation

#### 1. Remove stopwords

In [0]:
from nltk.corpus import stopwords

In [0]:
stop_words = set(stopwords.words('english'))

In [0]:
filtered_sentences = []
for i,words in enumerate(train_sample['reviews.text']):
    filtered = [word for word in words.split() if word not in stop_words]
    filtered_sentences.append(' '.join(filtered))

# add new column to store the sentences which stopwords has been removed
train_sample['filtered.reviews'] = pd.Series(filtered_sentences, index=train_sample.index)

#save filtered sentence data to csv
train_sample.to_csv('filtered_train.csv', index=False)

In [0]:
filtered_sentences_test = []
for i,words in enumerate(data_test['reviews.text']):
    filtered = [word for word in words.split() if word not in stop_words]
    filtered_sentences_test.append(' '.join(filtered))

# add new column to store the sentences which stopwords has been removed
data_test['filtered.reviews'] = pd.Series(filtered_sentences_test, index=data_test.index)

#save filtered sentence data to csv
data_test.to_csv('filtered_test.csv', index=False)

In [0]:
filtered_sentences_test_sent = []
for i,words in enumerate(data_test_w_sent['reviews.text']):
    filtered = [word for word in words.split() if word not in stop_words]
    filtered_sentences_test_sent.append(' '.join(filtered))

# add new column to store the sentences which stopwords has been removed
data_test_w_sent['filtered.reviews'] = pd.Series(filtered_sentences_test_sent, index=data_test_w_sent.index)

#save filtered sentence data to csv
data_test_w_sent.to_csv('filtered_test_sent.csv', index=False)

#### 2. Remove punctuation + lemmatization

In [0]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import re

lem = WordNetLemmatizer()

In [0]:
filtered_train = train_sample['reviews.text']
clean = []
for i in filtered_train:
    clean_str = re.sub(r'[^\w\s]',' ', i) #punctual removal
    clean_str = re.sub('\s+', ' ', clean_str) #remove extra space
    words = word_tokenize(clean_str)
    for w in range(len(words)):
        words[w] = lem.lemmatize(words[w])
    sentence = " ".join(words)

    clean.append(sentence)
train_sample['lemma'] = clean

In [0]:
filtered_test = data_test['reviews.text']
clean_test = []
for i in filtered_test:
    clean_str = re.sub(r'[^\w\s]',' ', i) #punctual removal
    clean_str = re.sub('\s+', ' ', clean_str) #remove extra space
    words = word_tokenize(clean_str)
    for w in range(len(words)):
        words[w] = lem.lemmatize(words[w])
    sentence = " ".join(words)

    clean_test.append(sentence)
data_test['lemma'] = clean_test

In [0]:
filtered_test_sent = data_test_w_sent['reviews.text']
clean_test = []
for i in filtered_test_sent:
    clean_str = re.sub(r'[^\w\s]',' ', i) #punctual removal
    clean_str = re.sub('\s+', ' ', clean_str) #remove extra space
    words = word_tokenize(clean_str)
    for w in range(len(words)):
        words[w] = lem.lemmatize(words[w])
    sentence = " ".join(words)

    clean_test.append(sentence)
data_test_w_sent['lemma'] = clean_test

#### 3. Stemming

In [6]:
# snowball stemmer
from nltk.stem.snowball import SnowballStemmer

In [7]:
stemmer = SnowballStemmer("english")

In [8]:
stemmed_train = []
for words in train_sample['reviews.text']:
    #store each words that have been stemmed in an array of stems
    stems = []
    for word in words.split():
        stem = stemmer.stem(word)
        stems.append(stem)
    #store each sentences from stems in an array of stemmed_train
    stemmed_train.append(' '.join(stems))

train_sample['stemmed.reviews'] = pd.Series(stemmed_train, index=train_sample.index)
#train_sample.to_csv('stem_train.csv', index=False)

In [9]:
stemmed_test = []
for words in data_test['reviews.text']:
    #store each words that have been stemmed in an array of stems
    stems = []
    for word in words.split():
        stem = stemmer.stem(word)
        stems.append(stem)
    #store each sentences from stems in an array of stemmed_train
    stemmed_test.append(' '.join(stems))
data_test['stemmed_reviews'] = pd.Series(stemmed_test, index=data_test.index)
#data_test.to_csv('stem_test.csv', index=False)

In [10]:
stemmed_test_sent = []
for words in data_test_w_sent['reviews.text']:
    #store each words that have been stemmed in an array of stems
    stems = []
    for word in words.split():
        stem = stemmer.stem(word)
        stems.append(stem)
    #store each sentences from stems in an array of stemmed_train
    stemmed_test_sent.append(' '.join(stems))
data_test_w_sent['stemmed.reviews'] = pd.Series(stemmed_test_sent, index=data_test_w_sent.index)
#data_test_w_sent.to_csv('stem_test_w_sent.csv', index=False)

### Feature Extraction

#### 1. TFIDF

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
vectorizer = TfidfVectorizer(ngram_range=(1,1))
X = vectorizer.fit_transform(train_sample['stemmed.reviews'])
X_test = vectorizer.transform(data_test['stemmed_reviews'])

In [0]:
print(vectorizer.vocabulary_)

#### 2. Count Vectorizer

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

In [0]:
cv = CountVectorizer(analyzer='word', ngram_range=(1,1))
cv.fit(train_sample['filtered.reviews'])
X_cv = cv.transform(train_sample['filtered.reviews'])
X_test_cv = cv.transform(data_test_w_sent['filtered.reviews'])

### Training
##### Training data is splitted into train & test data for validation before we test the model with the real testing data

In [15]:
train_y = train_sample['stemmed.reviews']
test_y = data_test['stemmed_reviews']

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
validation_size = 0.30
seed = 1234 #generate same sample
X_train, X_validation, Y_train, Y_validation = train_test_split(X, train_y, test_size=validation_size, random_state=seed)

#### Naive Bayes

In [18]:
from sklearn import naive_bayes
import sklearn.metrics

In [19]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support, classification_report 

nb = naive_bayes.MultinomialNB()
nb.fit(X_train, Y_train)

pred_y_train = nb.predict(X_train)
pred_y_test = nb.predict(X_validation)

accuracy_train = accuracy_score(Y_train, pred_y_train)
accuracy_test = accuracy_score(Y_validation, pred_y_test)
print("Accuracy training :" + str(accuracy_train))
print("Accuracy testing :" + str(accuracy_test))
print("Confusion Matriks Data Training :\n" + str(sklearn.metrics.confusion_matrix(Y_train, pred_y_train)))
print("Confusion Matriks Data Testing :\n" + str(sklearn.metrics.confusion_matrix(Y_validation, pred_y_test)))

Accuracy training :0.8727005150846211
Accuracy testing :0.0008576329331046312
Confusion Matriks Data Training :
[[1 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 1]]


Confusion Matriks Data Testing :
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


#### Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression

In [21]:
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
y_train_pred = logreg.predict(X_train)
y_val_pred = logreg.predict(X_validation)
print("Akurasi Data Training : " + str(logreg.score(X_train, Y_train)))
print("Akurasi Data Testing : " + str(logreg.score(X_validation, Y_validation)))
print("Confusion Matriks Data Training :\n" + str(sklearn.metrics.confusion_matrix(Y_train, y_train_pred)))
print("Confusion Matriks Data Testing :\n" + str(sklearn.metrics.confusion_matrix(Y_validation, y_val_pred)))



Akurasi Data Training : 0.9988962472406181
Akurasi Data Testing : 0.0008576329331046312


Confusion Matriks Data Training :
[[1 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 1]]
Confusion Matriks Data Testing :
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


#### Random Forest

In [0]:
from sklearn.ensemble import RandomForestClassifier
randomforest = RandomForestClassifier(random_state=seed)
randomforest.fit(X_train, Y_train)

pred_y_train = randomforest.predict(X_train)
pred_y_test = randomforest.predict(X_validation)

accuracy_train = accuracy_score(Y_train, pred_y_train)
accuracy_test = accuracy_score(Y_validation, pred_y_test)
print("Accuracy training :" + str(accuracy_train))
print("Accuracy testing :" + str(accuracy_test))
print("Confusion Matriks Data Training :\n" + str(sklearn.metrics.confusion_matrix(Y_train, y_train_pred)))
print("Confusion Matriks Data Testing :\n" + str(sklearn.metrics.confusion_matrix(Y_validation, y_val_pred)))

#### Decision Tree

In [0]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train, Y_train)

pred_y_train = dt.predict(X_train)
pred_y_test = dt.predict(X_validation)

accuracy_train = accuracy_score(Y_train, pred_y_train)
accuracy_test = accuracy_score(Y_validation, pred_y_test)
print("Accuracy training :" + str(accuracy_train))
print("Accuracy testing :" + str(accuracy_test))
print("Confusion Matriks Data Training :\n" + str(sklearn.metrics.confusion_matrix(Y_train, y_train_pred)))
print("Confusion Matriks Data Testing :\n" + str(sklearn.metrics.confusion_matrix(Y_validation, y_val_pred)))

#### SVM

##### Linear Kernel

In [22]:
from sklearn.svm import SVC
svc = SVC(kernel="linear")
svc.fit(X_train, Y_train)
pred_y_train = svc.predict(X_train)
pred_y_test = svc.predict(X_validation)

accuracy_train = accuracy_score(Y_train, pred_y_train)
accuracy_test = accuracy_score(Y_validation, pred_y_test)
print("Accuracy training :" + str(accuracy_train))
print("Accuracy testing :" + str(accuracy_test))
print("Confusion Matriks Data Training :\n" + str(sklearn.metrics.confusion_matrix(Y_train, y_train_pred)))
print("Confusion Matriks Data Testing :\n" + str(sklearn.metrics.confusion_matrix(Y_validation, y_val_pred)))

Accuracy training :0.9992641648270787
Accuracy testing :0.0008576329331046312
Confusion Matriks Data Training :
[[1 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 1]]
Confusion Matriks Data Testing :
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


##### Radial Basis Function (RBF) Kernel

In [0]:
from sklearn.svm import SVC
svc_rbf = SVC()
svc_rbf.fit(X_train, Y_train)
pred_y_train = svc_rbf.predict(X_train)
pred_y_test = svc_rbf.predict(X_validation)

accuracy_train = accuracy_score(Y_train, pred_y_train)
accuracy_test = accuracy_score(Y_validation, pred_y_test)
print("Accuracy training :" + str(accuracy_train))
print("Accuracy testing :" + str(accuracy_test))
print("Confusion Matriks Data Training :\n" + str(sklearn.metrics.confusion_matrix(Y_train, y_train_pred)))
print("Confusion Matriks Data Testing :\n" + str(sklearn.metrics.confusion_matrix(Y_validation, y_val_pred)))

##### Polynomial Kernel

In [0]:
from sklearn.svm import SVC
svc_poly = SVC(kernel='poly')
svc_poly.fit(X_train, Y_train)
pred_y_train = svc_poly.predict(X_train)
pred_y_test = svc_poly.predict(X_validation)

accuracy_train = accuracy_score(Y_train, pred_y_train)
accuracy_test = accuracy_score(Y_validation, pred_y_test)
print("Accuracy training :" + str(accuracy_train))
print("Accuracy testing :" + str(accuracy_test))
print("Confusion Matriks Data Training :\n" + str(sklearn.metrics.confusion_matrix(Y_train, y_train_pred)))
print("Confusion Matriks Data Testing :\n" + str(sklearn.metrics.confusion_matrix(Y_validation, y_val_pred)))

##### Sigmoid Kernel

In [0]:
from sklearn.svm import SVC
svc_sig = SVC(kernel='sigmoid')
svc_sig.fit(X_train, Y_train)
pred_y_train = svc_sig.predict(X_train)
pred_y_test = svc_sig.predict(X_validation)

accuracy_train = accuracy_score(Y_train, pred_y_train)
accuracy_test = accuracy_score(Y_validation, pred_y_test)
print("Accuracy training :" + str(accuracy_train))
print("Accuracy testing :" + str(accuracy_test))
print("Confusion Matriks Data Training :\n" + str(sklearn.metrics.confusion_matrix(Y_train, y_train_pred)))
print("Confusion Matriks Data Testing :\n" + str(sklearn.metrics.confusion_matrix(Y_validation, y_val_pred)))

#### Testing with real data (data_test)

In [0]:
import pickle

In [0]:
saved_model = pickle.dumps(logreg)

In [0]:
# Load the pickled model
classifier_from_pickle = pickle.loads(saved_model)
# Use the loaded pickled model to make predictions of data_test
x_test_pred = classifier_from_pickle.predict(X_test_cv)
#accuracy
result = classifier_from_pickle.score(X_test_cv, test_y)
print(result)

### Word Cloud Representation

In [0]:
positif = train_sample["filtered.reviews"][train_sample["reviews.sentiment"]=='positive'] 
negatif = train_sample["filtered.reviews"][train_sample["reviews.sentiment"]=='negative'] 

In [0]:
from wordcloud import WordCloud 
import matplotlib.pyplot as plt 
wordcloud = WordCloud(random_state=seed).generate(" ".join(positif)) 
plt.figure(figsize = (15,7)) 
plt.imshow(wordcloud,interpolation = 'bilinear') 
plt.axis("off") 
plt.show()

In [0]:
from wordcloud import WordCloud 
import matplotlib.pyplot as plt 
wordcloud = WordCloud(random_state=seed).generate(" ".join(negatif)) 
plt.figure(figsize = (15,7)) 
plt.imshow(wordcloud,interpolation = 'bilinear') 
plt.axis("off") 
plt.show()