In [1]:
import pandas as pd
import nltk
import sys
from nltk.corpus import stopwords

In [2]:
data_train = pd.read_csv('data_train.csv', encoding='latin-1')

In [3]:
#REMOVE STOPWORDS
stop_words = set(stopwords.words('english')) 

In [4]:
# define rejected stopwords bc it has sentiment info
rejected_sw = ["aren't", "couldn", "couldn't", "didn", "didn't", "doesn", "doesn't", "don't", "hadn", "hadn't", "hasn", "hasn't", "haven't", "isn", "isn't", "mightn", "mightn't", "mustn", "mustn't", "needn", "needn't", "no", "nor", "not", "shan't", "shouldn", "shouldn't", "wasn", "wasn't", "weren't", "won't", "wouldn", "wouldn't",  ]

In [5]:
acc_stopwords = []
for i, value in enumerate(stop_words):
    if value not in rejected_sw:
        acc_stopwords.append(value)

In [6]:
# remove stopwords
filtered_sentences = []
for i,words in enumerate(data_train['reviews.text']):
    filtered = [word for word in words.split() if word not in acc_stopwords]
    filtered_sentences.append(' '.join(filtered))

In [7]:
# add new column to store the sentences which stopwords has been removed
data_train['filtered_reviews'] = pd.Series(filtered_sentences, index=data_train.index)

In [None]:
#data_train.to_csv('filtered_train.csv', index=False)

In [None]:
data_test = pd.read_csv('data_test.csv',  encoding='latin-1')

In [None]:
# remove stopwords
filtered_sentences_test = []
for i,words in enumerate(data_test['reviews.text']):
    filtered = [word for word in words.split() if word not in acc_stopwords]
    filtered_sentences_test.append(' '.join(filtered))

In [None]:
# add new column to store the sentences which stopwords has been removed
data_test['filtered_reviews'] = pd.Series(filtered_sentences_test, index=data_test.index)
data_test.shape

In [None]:
#data_test.to_csv('filtered_test.csv', index=False)

In [None]:
data_test_w_sent = pd.read_csv('data_test_with_sentiment.csv',  encoding='latin-1')

In [None]:
# remove stopwords
filtered_sentences_test_sent = []
for i,words in enumerate(data_test_w_sent['reviews.text']):
    filtered = [word for word in words.split() if word not in acc_stopwords]
    filtered_sentences_test_sent.append(' '.join(filtered))

In [None]:
# add new column to store the sentences which stopwords has been removed
data_test_w_sent['filtered_reviews'] = pd.Series(filtered_sentences_test_sent, index=data_test_w_sent.index)

In [None]:
#data_test_w_sent.to_csv('filtered_test_w_sentiment.csv', index=False)

In [8]:
# STEMMING
from nltk.stem.snowball import SnowballStemmer

In [9]:
stemmer = SnowballStemmer("english")

In [10]:
stemmed_train = []
for words in data_train['filtered_reviews']:
    #store each words that have been stemmed in an array of stems
    stems = []
    for word in words.split():
        stem = stemmer.stem(word)
        stems.append(stem)
    #store each sentences from stems in an array of stemmed_train
    stemmed_train.append(' '.join(stems))
stemmed_train

['this product far not disappointed. my children love use i like abil monitor control content see ease.',
 'great beginn experienc person. bought gift love',
 'inexpens tablet use learn on, step nabi. he thrill it, learn skype already...',
 "i'v fire hd 8 two week i love it. this tablet great value.w prime member tablet shines. i love abl easili access prime content well movi download watch laterthi 1280/800 screen realli nice look nice crisp bright infact brighter ipad pro cost $900 base model. the build fire insan awesom run 7.7mm thick smooth glossi feel back realli amaz hold like futurist tab ur hands.",
 'i bought grand daughter come visit. i set user, enter age name amazon make sure access site content appropri age. simpl love capabilities. i also bought instal 64gig sd card give littl tablet plenti storage. for price i think tablet best one there. you spend hundr dollar addit speed capac come basic tablet everyth peopl ever need fraction cost.',
 'this amazon fire 8 inch tablet 

In [11]:
data_train['stemmed_reviews'] = pd.Series(stemmed_train, index=data_train.index)

In [None]:
#data_train.to_csv('stem_train.csv', index=False)

In [None]:
stemmed_test = []
for words in data_test['filtered_reviews']:
    #store each words that have been stemmed in an array of stems
    stems = []
    for word in words.split():
        stem = stemmer.stem(word)
        stems.append(stem)
    #store each sentences from stems in an array of stemmed_train
    stemmed_test.append(' '.join(stems))
stemmed_test

In [None]:
data_test['stemmed_reviews'] = pd.Series(stemmed_test, index=data_test.index)

In [None]:
#data_test.to_csv('stem_test.csv', index=False)

In [None]:
stemmed_test_sent = []
for words in data_test_w_sent['filtered_reviews']:
    #store each words that have been stemmed in an array of stems
    stems = []
    for word in words.split():
        stem = stemmer.stem(word)
        stems.append(stem)
    #store each sentences from stems in an array of stemmed_train
    stemmed_test_sent.append(' '.join(stems))
stemmed_test_sent

In [None]:
data_test_w_sent['stemmed_reviews'] = pd.Series(stemmed_test_sent, index=data_test_w_sent.index)

In [None]:
#data_test_w_sent.to_csv('stem_test_w_sent.csv', index=False)

In [None]:
# binary sentiment. 0 for - 1 for +
bin_sent = []
for i in range(len(data_train)):
    bs = []
    if data_train['reviews.sentiment'][i] == 'negative':
        bin_sent.append(0)
    else:
        bin_sent.append(1)
        
        
# store binary sentiments to df        
data_train['binary_sentiment'] = pd.Series(bin_sent, index=data_train.index)

In [12]:
#sampling bc data isn't balanced
train_sample = data_train.loc[data_train['reviews.sentiment'] == 'negative'].sample(n=1942, random_state=1234)

In [13]:
train_sample_positive = data_train.loc[data_train['reviews.sentiment'] == 'positive'].sample(n=1942, random_state=1234)
train_sample = train_sample.append(train_sample_positive)
train_sample.shape

(3884, 6)

In [None]:
# FEATURE EXTRACTION 1ST APPROACH TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
vectorizer = TfidfVectorizer()

In [None]:
X = vectorizer.fit_transform(train_sample['stemmed_reviews'])

In [None]:
X_test = vectorizer.transform(data_test_w_sent['filtered_reviews'])

In [None]:
##2nd Approach of FEATURE EXTRACTION USING CountVectorizer

In [14]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
cv = CountVectorizer(binary=False)

In [16]:
cv.fit(train_sample['stemmed_reviews'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [17]:
X_cv = cv.transform(train_sample['stemmed_reviews'])

In [None]:
X_test_cv = cv.transform(data_test_w_sent['filtered_reviews'])

In [18]:
# TRAINING
from sklearn.linear_model import LogisticRegression

In [19]:
import sklearn.metrics

In [20]:
from sklearn.metrics import accuracy_score

In [21]:
from sklearn.model_selection import train_test_split

In [22]:
validation_size = 0.30 
seed = 1234 #generate same sample
Y = train_sample['reviews.sentiment']
X_train, X_validation, Y_train, Y_validation = train_test_split(X_cv, Y, test_size=validation_size, random_state=seed)

In [23]:
logreg = LogisticRegression()
logreg.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [24]:
y_train_pred = logreg.predict(X_train)
y_val_pred = logreg.predict(X_validation)

In [25]:
print("Akurasi Data Training : " + str(logreg.score(X_train, Y_train)))
print("Akurasi Data Testing : " + str(logreg.score(X_validation, Y_validation)))
print("Confusion Matriks Data Training :\n" + str(sklearn.metrics.confusion_matrix(Y_train, y_train_pred)))
print("Confusion Matriks Data Testing :\n" + str(sklearn.metrics.confusion_matrix(Y_validation, y_val_pred)))

Akurasi Data Training : 0.9484915378955114
Akurasi Data Testing : 0.7804459691252144
Confusion Matriks Data Training :
[[1292   79]
 [  61 1286]]
Confusion Matriks Data Testing :
[[443 128]
 [128 467]]


In [None]:
import pickle

In [None]:
saved_model = pickle.dumps(logreg)

In [None]:
# Load the pickled model
classifier_from_pickle = pickle.loads(saved_model)

In [None]:
# Use the loaded pickled model to make predictions
x_cv_pred = classifier_from_pickle.predict(X_cv)

In [None]:
#accuracy of x_cv
result = classifier_from_pickle.score(X_cv, Y)
print(result)

In [None]:
Y_test = data_test_w_sent['reviews.sentiment']

In [None]:
# Use the loaded pickled model to make predictions of data_test 
x_test_pred = classifier_from_pickle.predict(X_test)
#accuracy of x_cv
result = classifier_from_pickle.score(X_test, Y_test)
print(result)

In [None]:
# Use the loaded pickled model to make predictions of data_test 
x_test_pred = classifier_from_pickle.predict(X_test_cv)
#accuracy of x_cv
result = classifier_from_pickle.score(X_test_cv, Y_test)
print(result)