# Yelp Reviews

### Load data

In [31]:
import json
def _load_data(filename):
    """Load reviews from a JSON Lines file.

    Every non-blank line of ``filename`` is parsed as one JSON document.

    Args:
        filename: Path to a text file holding one JSON document per line.

    Returns:
        A list with the parsed value of each non-blank line, in file order.
        An empty (or whitespace-only) file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Stream the file line by line rather than reading it whole and splitting:
    # the raw text never has to sit in memory next to the parsed objects, and
    # blank lines can be skipped instead of crashing json.loads on ''.
    # The encoding is pinned so decoding does not depend on the platform locale.
    with open(filename, encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

def _get_texts(reviews):
    """Collect the ``'text'`` value of every review, preserving input order."""
    collected = []
    for entry in reviews:
        collected.append(entry['text'])
    return collected

def get_data_and_labels(filepath, limit=100000):
    """Load reviews and return a per-class-capped sample with binary labels.

    A review with ``stars <= 3`` is labelled 0 (negative); any other review
    is labelled 1 (positive). Reviews are taken in file order and at most
    ``limit`` reviews are kept for each label, so the result is balanced
    only when the file contains at least ``limit`` reviews of both classes.

    Args:
        filepath: Path to the reviews file, forwarded to ``_load_data``.
        limit: Maximum number of reviews kept per label. Grow or shrink it
            to change the dataset size; the default matches the previously
            hard-coded value.

    Returns:
        A 3-tuple ``(texts, labels, None)`` where ``texts`` and ``labels``
        are parallel lists. The third element is always ``None``; it is
        kept so callers that unpack three values keep working.
    """
    reviews = _load_data(filepath)
    texts = _get_texts(reviews)

    binstars = [0 if review['stars'] <= 3 else 1 for review in reviews]
    balanced_texts = []
    balanced_labels = []
    neg_pos_counts = [0, 0]  # index 0: negatives kept, index 1: positives kept
    for text, polarity in zip(texts, binstars):
        # Once both classes are full nothing further can be added, so stop
        # instead of scanning the remainder of the file.
        if neg_pos_counts[0] >= limit and neg_pos_counts[1] >= limit:
            break
        if neg_pos_counts[polarity] < limit:
            balanced_texts.append(text)
            balanced_labels.append(polarity)
            neg_pos_counts[polarity] += 1

    return balanced_texts, balanced_labels, None

In [59]:
# Load the capped review sample. The third return value is always None, so it is discarded.
texts, labels, _ = get_data_and_labels('../../email-classification/data/yelp/dataset/reviews_400K.json')

In [60]:
from sklearn.model_selection import train_test_split

# Hold out part of the sample for validation. No split size is passed, so
# scikit-learn's default applies; random_state=0 fixes the shuffle seed.
data_train, data_val, labels_train, labels_val = train_test_split(texts, labels, random_state=0)

### Train model

In [61]:
from sklearn.ensemble import RandomForestClassifier
# CountVectorizer and TfidfTransformer were used below without being imported,
# which raises NameError when this cell runs on a fresh kernel.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

# Raw text -> token counts -> TF-IDF weights -> random forest classifier.
# Keeping the steps in one Pipeline means predict() accepts raw strings.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier()),
])
text_clf = text_clf.fit(data_train, labels_train)

### Evaluate model

In [62]:
from sklearn.metrics import precision_score, recall_score

# Score the fitted pipeline on the held-out split.
y_true = labels_val
y_pred = text_clf.predict(data_val)
p, r = precision_score(y_true, y_pred), recall_score(y_true, y_pred)
print('precision score: {}'.format(p))
print('recall score: {}'.format(r))

precision score: 0.820450885668277
recall score: 0.6928091505359143


In [63]:
# Smoke test: classify one hand-written snippet; the notebook displays the returned label array.
text_clf.predict(["No no no no no "])

array([0])

In [65]:
import pickle

# Persist the fitted pipeline. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way through.
# NOTE: only unpickle model files from a trusted source; loading a pickle
# can execute arbitrary code.
with open('./model.pkl', 'wb') as model_file:
    pickle.dump(text_clf, model_file)