In [1]:
import pandas as pd
import nltk
import sys

### Load CSV

In [3]:
data_train = pd.read_csv('data_train.csv', encoding='latin-1')

In [4]:
data_test = pd.read_csv('data_test.csv',  encoding='latin-1')

In [5]:
data_test_w_sent = pd.read_csv('data_test_with_sentiment.csv',  encoding='latin-1')

### Sampling 
##### Because the data is not balanced

In [6]:
train_sample = data_train.loc[data_train['reviews.sentiment'] == 'negative'].sample(n=1942, random_state=1234)
train_sample_positive = data_train.loc[data_train['reviews.sentiment'] == 'positive'].sample(n=1942, random_state=1234)
train_sample = train_sample.append(train_sample_positive)

### Word Preparation

#### 1. Remove stopwords

In [0]:
from nltk.corpus import stopwords

In [0]:
stop_words = set(stopwords.words('english'))

In [0]:
filtered_sentences = []
for i,words in enumerate(train_sample['reviews.text']):
    filtered = [word for word in words.split() if word not in stop_words]
    filtered_sentences.append(' '.join(filtered))

# add new column to store the sentences which stopwords has been removed
train_sample['filtered.reviews'] = pd.Series(filtered_sentences, index=train_sample.index)

#save filtered sentence data to csv
train_sample.to_csv('filtered_train.csv', index=False)

In [0]:
filtered_sentences_test = []
for i,words in enumerate(data_test['reviews.text']):
    filtered = [word for word in words.split() if word not in stop_words]
    filtered_sentences_test.append(' '.join(filtered))

# add new column to store the sentences which stopwords has been removed
data_test['filtered.reviews'] = pd.Series(filtered_sentences_test, index=data_test.index)

#save filtered sentence data to csv
data_test.to_csv('filtered_test.csv', index=False)

In [0]:
filtered_sentences_test_sent = []
for i,words in enumerate(data_test_w_sent['reviews.text']):
    filtered = [word for word in words.split() if word not in stop_words]
    filtered_sentences_test_sent.append(' '.join(filtered))

# add new column to store the sentences which stopwords has been removed
data_test_w_sent['filtered.reviews'] = pd.Series(filtered_sentences_test_sent, index=data_test_w_sent.index)

#save filtered sentence data to csv
data_test_w_sent.to_csv('filtered_test_sent.csv', index=False)

#### 2. Remove punctuation + lemmatization

In [0]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import re

lem = WordNetLemmatizer()

In [0]:
filtered_train = train_sample['filtered.reviews']
clean = []
for i in filtered_train:
    clean_str = re.sub(r'[^\w\s]',' ', i) #punctual removal
    clean_str = re.sub('\s+', ' ', clean_str) #remove extra space
    words = word_tokenize(clean_str)
    for w in range(len(words)):
        words[w] = lem.lemmatize(words[w])
    sentence = " ".join(words)

    clean.append(sentence)
train_sample['lemma'] = clean

In [0]:
filtered_test = data_test['filtered.reviews']
clean_test = []
for i in filtered_test:
    clean_str = re.sub(r'[^\w\s]',' ', i) #punctual removal
    clean_str = re.sub('\s+', ' ', clean_str) #remove extra space
    words = word_tokenize(clean_str)
    for w in range(len(words)):
        words[w] = lem.lemmatize(words[w])
    sentence = " ".join(words)

    clean_test.append(sentence)
data_test['lemma'] = clean_test

In [0]:
filtered_test_sent = data_test_w_sent['filtered.reviews']
clean_test = []
for i in filtered_test_sent:
    clean_str = re.sub(r'[^\w\s]',' ', i) #punctual removal
    clean_str = re.sub('\s+', ' ', clean_str) #remove extra space
    words = word_tokenize(clean_str)
    for w in range(len(words)):
        words[w] = lem.lemmatize(words[w])
    sentence = " ".join(words)

    clean_test.append(sentence)
data_test_w_sent['lemma'] = clean_test

#### 3. Stemming

In [0]:
# snowball stemmer
from nltk.stem.snowball import SnowballStemmer

In [0]:
stemmer = SnowballStemmer("english")

In [0]:
stemmed_train = []
for words in train_Sample['filtered.reviews']:
    #store each words that have been stemmed in an array of stems
    stems = []
    for word in words.split():
        stem = stemmer.stem(word)
        stems.append(stem)
    #store each sentences from stems in an array of stemmed_train
    stemmed_train.append(' '.join(stems))

data_train['stemmed.reviews'] = pd.Series(stemmed_train, index=data_train.index)
#train_sample.to_csv('stem_train.csv', index=False)

In [0]:
stemmed_test = []
for words in data_test['filtered_reviews']:
    #store each words that have been stemmed in an array of stems
    stems = []
    for word in words.split():
        stem = stemmer.stem(word)
        stems.append(stem)
    #store each sentences from stems in an array of stemmed_train
    stemmed_test.append(' '.join(stems))
data_test['stemmed_reviews'] = pd.Series(stemmed_test, index=data_test.index)
#data_test.to_csv('stem_test.csv', index=False)

In [0]:
stemmed_test_sent = []
for words in data_test_w_sent['filtered_reviews']:
    #store each words that have been stemmed in an array of stems
    stems = []
    for word in words.split():
        stem = stemmer.stem(word)
        stems.append(stem)
    #store each sentences from stems in an array of stemmed_train
    stemmed_test_sent.append(' '.join(stems))
data_test_w_sent['stemmed_reviews'] = pd.Series(stemmed_test_sent, index=data_test_w_sent.index)
#data_test_w_sent.to_csv('stem_test_w_sent.csv', index=False)

### Feature Extraction

#### 1. TFIDF

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
vectorizer = TfidfVectorizer(ngram_range=(1,1))
X = vectorizer.fit_transform(train_sample['reviews.text'])
X_test = vectorizer.transform(data_test_w_sent['reviews.text'])

#### 2. Count Vectorizer

In [29]:
from sklearn.feature_extraction.text import CountVectorizer

In [31]:
cv = CountVectorizer(analyzer='word', ngram_range=(1,1))
cv.fit(train_sample['reviews.text'])
X_cv = cv.transform(train_sample['reviews.text'])
X_test_cv = cv.fit_transform(data_test_w_sent['reviews.text'])

### Training
##### Training data is splitted into train & test data for validation before we test the model with the real testing data

In [10]:
train_y = train_sample['reviews.sentiment']
test_y = data_test_w_sent['reviews.sentiment']

In [12]:
from sklearn.model_selection import train_test_split

In [32]:
validation_size = 0.30
seed = 1234 #generate same sample
X_train, X_validation, Y_train, Y_validation = train_test_split(X_cv, train_y, test_size=validation_size, random_state=seed)

#### Naive Bayes

In [16]:
from sklearn import naive_bayes
import sklearn.metrics

In [33]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support, classification_report 

nb = naive_bayes.MultinomialNB()
nb.fit(X_train, Y_train)

pred_y_train = nb.predict(X_train)
pred_y_test = nb.predict(X_validation)

accuracy_train = accuracy_score(Y_train, pred_y_train)
accuracy_test = accuracy_score(Y_validation, pred_y_test)
print("Accuracy training :" + str(accuracy_train))
print("Accuracy testing :" + str(accuracy_test))
print("Confusion Matriks Data Training :\n" + str(sklearn.metrics.confusion_matrix(Y_train, pred_y_train)))
print("Confusion Matriks Data Testing :\n" + str(sklearn.metrics.confusion_matrix(Y_validation, pred_y_test)))

Accuracy training :0.8719646799116998
Accuracy testing :0.8018867924528302
Confusion Matriks Data Training :
[[1175  196]
 [ 152 1195]]
Confusion Matriks Data Testing :
[[452 119]
 [112 483]]


#### Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression

In [34]:
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)
y_train_pred = logreg.predict(X_train)
y_val_pred = logreg.predict(X_validation)
print("Akurasi Data Training : " + str(logreg.score(X_train, Y_train)))
print("Akurasi Data Testing : " + str(logreg.score(X_validation, Y_validation)))
print("Confusion Matriks Data Training :\n" + str(sklearn.metrics.confusion_matrix(Y_train, y_train_pred)))
print("Confusion Matriks Data Testing :\n" + str(sklearn.metrics.confusion_matrix(Y_validation, y_val_pred)))

Akurasi Data Training : 0.9521707137601177
Akurasi Data Testing : 0.7770154373927959
Confusion Matriks Data Training :
[[1291   80]
 [  50 1297]]
Confusion Matriks Data Testing :
[[443 128]
 [132 463]]


#### Random Forest

In [0]:
from sklearn.ensemble import RandomForestClassifier
randomforest = RandomForestClassifier(random_state=seed)
randomforest.fit(X_train, Y_train)

pred_y_train = randomforest.predict(X_train)
pred_y_test = randomforest.predict(X_validation)

accuracy_train = accuracy_score(Y_train, pred_y_train)
accuracy_test = accuracy_score(Y_validation, pred_y_test)
print("Accuracy training :" + str(accuracy_train))
print("Accuracy testing :" + str(accuracy_test))
print("Confusion Matriks Data Training :\n" + str(sklearn.metrics.confusion_matrix(Y_train, y_train_pred)))
print("Confusion Matriks Data Testing :\n" + str(sklearn.metrics.confusion_matrix(Y_validation, y_val_pred)))

#### Decision Tree

In [0]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train, Y_train)

pred_y_train = dt.predict(X_train)
pred_y_test = dt.predict(X_validation)

accuracy_train = accuracy_score(Y_train, pred_y_train)
accuracy_test = accuracy_score(Y_validation, pred_y_test)
print("Accuracy training :" + str(accuracy_train))
print("Accuracy testing :" + str(accuracy_test))
print("Confusion Matriks Data Training :\n" + str(sklearn.metrics.confusion_matrix(Y_train, y_train_pred)))
print("Confusion Matriks Data Testing :\n" + str(sklearn.metrics.confusion_matrix(Y_validation, y_val_pred)))

#### SVM

##### Linear Kernel

In [35]:
from sklearn.svm import SVC
svc = SVC(kernel="linear")
svc.fit(X_train, Y_train)
pred_y_train = svc.predict(X_train)
pred_y_test = svc.predict(X_validation)

accuracy_train = accuracy_score(Y_train, pred_y_train)
accuracy_test = accuracy_score(Y_validation, pred_y_test)
print("Accuracy training :" + str(accuracy_train))
print("Accuracy testing :" + str(accuracy_test))
print("Confusion Matriks Data Training :\n" + str(sklearn.metrics.confusion_matrix(Y_train, y_train_pred)))
print("Confusion Matriks Data Testing :\n" + str(sklearn.metrics.confusion_matrix(Y_validation, y_val_pred)))

Accuracy training :0.9738778513612951
Accuracy testing :0.7392795883361921
Confusion Matriks Data Training :
[[1291   80]
 [  50 1297]]
Confusion Matriks Data Testing :
[[443 128]
 [132 463]]


##### Radial Basis Function (RBF) Kernel

In [0]:
from sklearn.svm import SVC
svc_rbf = SVC()
svc_rbf.fit(X_train, Y_train)
pred_y_train = svc_rbf.predict(X_train)
pred_y_test = svc_rbf.predict(X_validation)

accuracy_train = accuracy_score(Y_train, pred_y_train)
accuracy_test = accuracy_score(Y_validation, pred_y_test)
print("Accuracy training :" + str(accuracy_train))
print("Accuracy testing :" + str(accuracy_test))
print("Confusion Matriks Data Training :\n" + str(sklearn.metrics.confusion_matrix(Y_train, y_train_pred)))
print("Confusion Matriks Data Testing :\n" + str(sklearn.metrics.confusion_matrix(Y_validation, y_val_pred)))

##### Polynomial Kernel

In [0]:
from sklearn.svm import SVC
svc_poly = SVC(kernel='poly')
svc_poly.fit(X_train, Y_train)
pred_y_train = svc_poly.predict(X_train)
pred_y_test = svc_poly.predict(X_validation)

accuracy_train = accuracy_score(Y_train, pred_y_train)
accuracy_test = accuracy_score(Y_validation, pred_y_test)
print("Accuracy training :" + str(accuracy_train))
print("Accuracy testing :" + str(accuracy_test))
print("Confusion Matriks Data Training :\n" + str(sklearn.metrics.confusion_matrix(Y_train, y_train_pred)))
print("Confusion Matriks Data Testing :\n" + str(sklearn.metrics.confusion_matrix(Y_validation, y_val_pred)))

##### Sigmoid Kernel

In [0]:
from sklearn.svm import SVC
svc_sig = SVC(kernel='sigmoid')
svc_sig.fit(X_train, Y_train)
pred_y_train = svc_sig.predict(X_train)
pred_y_test = svc_sig.predict(X_validation)

accuracy_train = accuracy_score(Y_train, pred_y_train)
accuracy_test = accuracy_score(Y_validation, pred_y_test)
print("Accuracy training :" + str(accuracy_train))
print("Accuracy testing :" + str(accuracy_test))
print("Confusion Matriks Data Training :\n" + str(sklearn.metrics.confusion_matrix(Y_train, y_train_pred)))
print("Confusion Matriks Data Testing :\n" + str(sklearn.metrics.confusion_matrix(Y_validation, y_val_pred)))

#### Testing with real data (data_test)

In [22]:
import pickle

In [36]:
saved_model = pickle.dumps(svc)

In [39]:
# Load the pickled model
classifier_from_pickle = pickle.loads(saved_model)
# Use the loaded pickled model to make predictions of data_test
x_test_pred = classifier_from_pickle.predict(X_test_cv)
#accuracy
result = classifier_from_pickle.score(X_test_cv, test_y)
print(result)

ValueError: X.shape[1] = 8411 should be equal to 5834, the number of features at training time

### Word Cloud Representation

In [0]:
positif = train_sample["filtered.reviews"][train_sample["reviews.sentiment"]=='positive'] 
negatif = train_sample["filtered.reviews"][train_sample["reviews.sentiment"]=='negative'] 

In [0]:
from wordcloud import WordCloud 
import matplotlib.pyplot as plt 
wordcloud = WordCloud(random_state=seed).generate(" ".join(positif)) 
plt.figure(figsize = (15,7)) 
plt.imshow(wordcloud,interpolation = 'bilinear') 
plt.axis("off") 
plt.show()

In [0]:
from wordcloud import WordCloud 
import matplotlib.pyplot as plt 
wordcloud = WordCloud(random_state=seed).generate(" ".join(negatif)) 
plt.figure(figsize = (15,7)) 
plt.imshow(wordcloud,interpolation = 'bilinear') 
plt.axis("off") 
plt.show()

## Lemmatization + Remove Punctuation + Without Removing Stopwords 

In [0]:
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer
import re
from nltk.corpus import stopwords

ps = PorterStemmer()
lem = WordNetLemmatizer()

In [0]:
filtered = train_sample['reviews.text']
clean = []
for i in filtered:
    clean_str = re.sub(r'[^\w\s]',' ', i) #punctual removal
    clean_str = re.sub('\s+', ' ', clean_str) #remove extra space
    words = word_tokenize(clean_str)
    for w in range(len(words)):
        #print(ps.stem(words[w]))
        words[w] = lem.lemmatize(words[w])
    sentence = " ".join(words)

    clean.append(sentence)
clean

In [0]:
train_sample['stemmed'] = clean

In [0]:
train_sample

In [0]:
positive = train_sample["stemmed"][train_sample["reviews.sentiment"]=='positive'] 
negative = train_sample["stemmed"][train_sample["reviews.sentiment"]=='negative'] 

In [0]:
from wordcloud import WordCloud 
import matplotlib.pyplot as plt 
wordcloud = WordCloud(random_state=21).generate(" ".join(positive)) 
plt.figure(figsize = (15,7)) 
plt.imshow(wordcloud,interpolation = 'bilinear') 
plt.axis("off") 
plt.show()

In [0]:
from wordcloud import WordCloud 
import matplotlib.pyplot as plt 
wordcloud = WordCloud(random_state=21).generate(" ".join(negative)) 
plt.figure(figsize = (15,7)) 
plt.imshow(wordcloud,interpolation = 'bilinear') 
plt.axis("off") 
plt.show()

In [0]:
test = data_test_w_sent.sample(n=3884, random_state=1234)

In [0]:
train_x = train_sample['stemmed']
train_y = train_sample['reviews.sentiment']

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(analyzer='word', ngram_range=(1,1))

In [0]:
#count_vect.fit(train_x) 
train_x_feat = count_vect.transform(train_x)
train_x_feat.shape

In [0]:
test_x = test['filtered_reviews']
test_y = test['reviews.sentiment']

In [0]:
count_vect.fit(test_x) 
test_x_feat = count_vect.fit_transform(test_x) 
test_x_feat.shape

In [0]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support, classification_report 
from sklearn.svm import SVC
svc = SVC(kernel="linear")
svc.fit(train_x_feat, train_y)
pred_y_train = svc.predict(train_x_feat)
pred_y_test = svc.predict(test_x_feat)

accuracy_train = accuracy_score(train_y, pred_y_train)
accuracy_test = accuracy_score(test_y, pred_y_test) 
print("Accuracy training :" + str(accuracy_train)) 
print("Accuracy testing :" + str(accuracy_test))
cnf_matrix = confusion_matrix(test_y, pred_y_test)
cnf_matrix

In [0]:
from sklearn import naive_bayes
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support, classification_report 
classifier = naive_bayes.MultinomialNB()
classifier.fit(train_x_feat, train_y)

pred_y_train = classifier.predict(train_x_feat)
pred_y_test = classifier.predict(test_x_feat)

#print(pred_y[0:30]) 
#print(test_y[0:30]) 
accuracy_train = accuracy_score(train_y, pred_y_train)
accuracy_test = accuracy_score(test_y, pred_y_test) 
print("Accuracy training :" + str(accuracy_train)) 
print("Accuracy testing :" + str(accuracy_test))
cnf_matrix = confusion_matrix(test_y, pred_y_test)
cnf_matrix

## Lemmatization + Remove Stopwords + Remove Punctuation

In [0]:
filtered = train_sample['filtered_reviews']
clean = []
for i in filtered:
    clean_str = re.sub(r'[^\w\s]',' ', i) #punctual removal
    clean_str = re.sub('\s+', ' ', clean_str) #remove extra space
    words = word_tokenize(clean_str)
    for w in range(len(words)):
        words[w] = lem.lemmatize(words[w])
    sentence = " ".join(words)

    clean.append(sentence)
clean

In [0]:
train_sample['stemmed'] = clean

In [0]:
positive = train_sample["stemmed"][train_sample["reviews.sentiment"]=='positive'] 
negative = train_sample["stemmed"][train_sample["reviews.sentiment"]=='negative'] 

In [0]:
from wordcloud import WordCloud 
import matplotlib.pyplot as plt 
wordcloud = WordCloud(random_state=21).generate(" ".join(positive)) 
plt.figure(figsize = (15,7)) 
plt.imshow(wordcloud,interpolation = 'bilinear') 
plt.axis("off") 
plt.show()

In [0]:
from wordcloud import WordCloud 
import matplotlib.pyplot as plt 
wordcloud = WordCloud(random_state=21).generate(" ".join(negative)) 
plt.figure(figsize = (15,7)) 
plt.imshow(wordcloud,interpolation = 'bilinear') 
plt.axis("off") 
plt.show()