In [124]:
import json

with open('./json/json_translated.json') as data_file:
    dataTranslatedNormal = json.load(data_file)

In [138]:
len(dataTranslatedNormal)

22445

In [125]:
# load right dataset
dataIteration = dataTranslatedNormal

def filterNonEnglishOut(dataIteration):
    dataIteration_preprocess = []

    words_order = dataIteration
    only_english_words = []

    # to keep only english words
    with open("./wordlist_en.txt") as word_file:
        only_english_words = set(word.strip().lower() for word in word_file)

    for line in words_order:
        english_line = ""
        for w in line.split():
            if w.strip().lower() in english_words_file:
                english_line += str(w.strip().lower()) + " "
            else: 
                pass
        dataIteration_preprocess.append(english_line)    
    return dataIteration_preprocess

In [101]:
print (dataIteration[0].get("plus"))
print (filterNonEnglishOut(dataIteration=dataIteration[0].get("plus")))

['exceeded expectations, great as the first microscope for the youngest children, a gift for a niece four years - needed assistance to adults, older children 5-6 years old will use themselves']
['great as the first microscope for the a gift for a niece four years needed assistance to older children years old will use themselves ']


In [126]:
plusWords = []
minusWords = []

words = []

labels = []

#every review
for x in range(0,len(dataIteration)):
    
    for plus in  dataIteration[x].get("plus"):
        plusWords.append(plus) # only for informative purpose
        
        words.append(plus)
        labels.append(1)

    for minus in dataIteration[x].get("minus"):
        minusWords.append(minus) # only for informative purpose
        
        words.append(minus)
        labels.append(0)

In [127]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(words, labels, test_size=0.2, random_state=0)

from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()

words_train_count = count_vect.fit_transform(X_train)

In [137]:
freqs = [(word, words_train_count.getcol(idx).sum()) for word, idx in count_vect.vocabulary_.items()]
#sort from largest to smallest
print (sorted (freqs, key = lambda x: -x[1])[:10])

[('the', 7260), ('and', 4307), ('of', 3483), ('not', 3398), ('to', 2980), ('price', 2480), ('it', 2375), ('is', 2308), ('nothing', 2182), ('for', 1987)]


In [128]:
from sklearn.feature_extraction.text import TfidfTransformer

tf_transformer = TfidfTransformer(use_idf=False).fit(words_train_count)
words_train_tf = tf_transformer.transform(words_train_count)

In [129]:
tfidf_transformer = TfidfTransformer()
words_train_tfidf = tfidf_transformer.fit_transform(words_train_tf)

In [130]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(words_train_tfidf, y_train)

In [131]:
docs_new = ['clever tool', 'bad experience', 'very poor instruction']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

predicted = clf.predict(X_new_tfidf)

for doc, category in zip(docs_new, predicted):
     print('%r => %s' % (doc, category))

'clever tool' => 1
'bad experience' => 0
'very poor instruction' => 0


In [132]:
docs_new = X_test
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

print("Score " + str(clf.score(X_new_tfidf, y_test)))

from sklearn.metrics import f1_score
predicted = clf.predict(X_new_tfidf)
# F1 = 2 * (precision * recall) / (precision + recall) 
print (f1_score(y_pred=predicted, y_true=y_test, average="binary"))
print (f1_score(y_pred=predicted, y_true=y_test, average="macro"))
print (f1_score(y_pred=predicted, y_true=y_test, average="micro"))
print (f1_score(y_pred=predicted, y_true=y_test, average="weighted"))
print (f1_score(y_pred=predicted, y_true=y_test, average=None))

Score 0.875124448869
0.911971124925
0.848598086924
0.875124448869
0.870870135978
[ 0.78522505  0.91197112]


In [133]:
from sklearn.metrics import roc_auc_score

predicted_proba = clf.predict_proba(X_new_tfidf)
# Compute Area Under the Curve (AUC) from prediction scores
print ("ROC :" + str(roc_auc_score(y_test, predicted_proba[:, 1])))

ROC :0.948830070862


In [134]:
from __future__ import print_function

from pprint import pprint
from time import time
import logging

from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Sample pipeline for text feature extraction and evaluation
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

with open('./english_stop_words.txt') as stop_words:
    stop_word_list = list([f.strip() for f in stop_words])

# uncommenting more parameters will give better exploring power but will
# increase processing time in a combinatorial way
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__stop_words' : (stop_word_list ,'english', None),
    'vect__max_features': (None, 5000, 10000, 50000),
    'vect__ngram_range': ((1, 1), (1, 2)),  # unigrams or bigrams
    'vect__analyzer': ('word', 'char', 'char_wb') ,
    'tfidf__use_idf': (True, False),
    'tfidf__norm': ('l1', 'l2'),
    #'clf__penalty': ('l2', 'elasticnet'),
    #'clf__n_iter': (10, 50, 80),
}

# multiprocessing requires the fork to happen in a __main__ protected
# block

# find the best parameters for both the feature extraction and the
# classifier
grid_search = GridSearchCV(pipeline, parameters, n_jobs=-1, verbose=1)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
grid_search.fit(X_test, y_test)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'tfidf__norm': ('l1', 'l2'),
 'tfidf__use_idf': (True, False),
 'vect__analyzer': ('word', 'char', 'char_wb'),
 'vect__max_df': (0.5, 0.75, 1.0),
 'vect__max_features': (None, 5000, 10000, 50000),
 'vect__ngram_range': ((1, 1), (1, 2)),
 'vect__stop_words': (['a',
                       'about',
                       'above',
                       'after',
                       'again',
                       'against',
                       'all',
                       'am',
                       'an',
                       'and',
                       'any',
                       'are',
                       "aren't",
                       'as',
                       'at',
                       'be',
                       'because',
                       'been',
                       'before',
                       'being',
                       'below',
                       'between',
     

[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    5.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   25.1s
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 792 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done 1242 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 1792 tasks      | elapsed:  8.5min
[Parallel(n_jobs=-1)]: Done 2442 tasks      | elapsed: 11.5min
[Parallel(n_jobs=-1)]: Done 2592 out of 2592 | elapsed: 12.4min finished


done in 744.044s

Best score: 0.861
Best parameters set:
	tfidf__norm: 'l2'
	tfidf__use_idf: True
	vect__analyzer: 'word'
	vect__max_df: 0.5
	vect__max_features: 5000
	vect__ngram_range: (1, 2)
	vect__stop_words: None
