#Instructions for feature development
* [Load up dependencies](#Load-dependencies)
* [Load the matrices and vectors from temp file](#Load-the-matrices-and-vectors)
* [Add your own feature(s)](#Featurization)

# To-do:
* Better+more feature engineering (check)
* Try different ML models (check)
* Clean up code, function definitions, naming
* ONLY AT VERY END: run model on `test_set` and `test_set_labels`

#Load dependencies

In [1]:
import json
from pprint import pprint
import nltk, re
from nltk import word_tokenize
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')

#Parsing & Structuring Data

In [6]:
udata = []

with open('yelp_academic_dataset_user.json') as user_data:    
    for line in user_data:
        udata.append(json.loads(line))

In [7]:
# Remove friends so display isn't terrible (this user has a lot of friends)
udata[0]['friends'] = []
udata[0]

{'average_stars': 4.14,
 'compliments': {'cool': 78,
  'cute': 15,
  'funny': 11,
  'hot': 48,
  'more': 3,
  'note': 20,
  'photos': 15,
  'plain': 25,
  'profile': 8,
  'writer': 9},
 'elite': [2005, 2006],
 'fans': 69,
 'friends': [],
 'name': 'Russel',
 'review_count': 108,
 'type': 'user',
 'user_id': '18kPq7GPye-YQ3LyKyAZPw',
 'votes': {'cool': 245, 'funny': 166, 'useful': 278},
 'yelping_since': '2004-10'}

In [8]:
rdata = []
with open('yelp_academic_dataset_review.json') as review_data:
    for line in review_data:
        rdata.append(json.loads(line))

In [9]:
rdata[0]

{'business_id': 'vcNAWiLM4dR7D2nwwJ7nCA',
 'date': '2007-05-17',
 'review_id': '15SdjuK7DmYqUAj6rjGowg',
 'stars': 5,
 'text': "dr. goldberg offers everything i look for in a general practitioner.  he's nice and easy to talk to without being patronizing; he's always on time in seeing his patients; he's affiliated with a top-notch hospital (nyu) which my parents have explained to me is very important in case something happens and you need surgery; and you can get referrals to see specialists without having to see him first.  really, what more do you need?  i'm sitting here trying to think of any complaints i have about him, but i'm really drawing a blank.",
 'type': 'review',
 'user_id': 'Xqd0DzHaiyRqVH3WRG7hzg',
 'votes': {'cool': 1, 'funny': 0, 'useful': 2}}

In [10]:
bdata = []
with open('yelp_academic_dataset_business.json') as business_data:
    for line in business_data:
        bdata.append(json.loads(line))

In [11]:
bdata[0]

{'attributes': {'By Appointment Only': True},
 'business_id': 'vcNAWiLM4dR7D2nwwJ7nCA',
 'categories': ['Doctors', 'Health & Medical'],
 'city': 'Phoenix',
 'full_address': '4840 E Indian School Rd\nSte 101\nPhoenix, AZ 85018',
 'hours': {'Friday': {'close': '17:00', 'open': '08:00'},
  'Monday': {'close': '17:00', 'open': '08:00'},
  'Thursday': {'close': '17:00', 'open': '08:00'},
  'Tuesday': {'close': '17:00', 'open': '08:00'},
  'Wednesday': {'close': '17:00', 'open': '08:00'}},
 'latitude': 33.499313,
 'longitude': -111.983758,
 'name': 'Eric Goldberg, MD',
 'neighborhoods': [],
 'open': True,
 'review_count': 9,
 'stars': 3.5,
 'state': 'AZ',
 'type': 'business'}

In [12]:
def businesses_set(bdata):
    biz_set = set([])
    US_states = ['AZ', 'NV', 'WI', 'IL', 'NC', 'PA']
    
    for biz in bdata:
        if biz['state'] in US_states:
            biz_set.add(biz['business_id'])
    return biz_set

In [13]:
biz_set = businesses_set(bdata)

In [16]:
def reviews_hash(rdata, biz_set):
    reviews_dict = {}
    
    for review in rdata:
        if review['business_id'] in biz_set:
            user_id = review['user_id']
            year = int(review['date'][:4])
            if year != 2015:
                if (user_id, year) in reviews_dict:
                    reviews_dict[(user_id, year)].append(review)
                else:
                    reviews_dict[(user_id, year)] = [review]
    return reviews_dict

In [17]:
reviews_dict = reviews_hash(rdata, biz_set)

In [18]:
def elite_review_years_hash(udata):
    elite_review_years_dict = {}
    
    udata_subset = [user for user in udata if user['review_count']>=20]
    for user in udata_subset:
        user_id = user['user_id']
        elite_years = user['elite']
        elite_review_years_dict[user_id] = [year-1 for year in elite_years]
    return elite_review_years_dict

In [19]:
elite_review_years_dict = elite_review_years_hash(udata)

In [20]:
# returns 2 arrays
# 1) each entry is array of all of a given user's reviews in a given year
# 2) each entry is corresponding elite (1) or non-elite (0) status in next year
# i.e. whether reviews are "elite-worthy"
def final_raw_data_and_vector(reviews_dict, elite_review_years_dict):
    user_years_in_review = []
    elite_labels = []
    
    test_user_years_in_review = []
    test_elite_labels = []

    for id_year_tuple, reviews in reviews_dict.items():
        user_id = id_year_tuple[0]
        year = id_year_tuple[1]
        
        if year == 2014:
            if user_id in elite_review_years_dict.keys():        
                test_user_years_in_review.append(reviews)

                if year in elite_review_years_dict[user_id]:
                    test_elite_labels.append(1)
                else:
                    test_elite_labels.append(0)                
        else:
            if user_id in elite_review_years_dict.keys():        
                user_years_in_review.append(reviews)

                if year in elite_review_years_dict[user_id]:
                    elite_labels.append(1)
                else:
                    elite_labels.append(0)
    return user_years_in_review, elite_labels, test_user_years_in_review, test_elite_labels

In [21]:
% time train_dev_set, train_dev_labels, test_set, test_labels = final_raw_data_and_vector(reviews_dict, elite_review_years_dict)

Wall time: 2.76 s


In [22]:
assert len(train_dev_set) == len(train_dev_labels)
train_dev_set_len = len(train_dev_labels)
print(train_dev_set_len)

119469


In [23]:
assert len(test_set) == len(test_labels)
test_set_len = len(test_labels)
print(test_set_len)

34770


In [24]:
total_len = train_dev_set_len + test_set_len
print("Proportion of train/dev set to test set: "+str(train_dev_set_len/total_len))

Proportion of train/dev set to test set: 0.7745706338863712


In [26]:
np_train_dev_set = np.array(train_dev_set)
np_train_dev_labels = np.array(train_dev_labels)

In [27]:
rand_indices = np.random.permutation(len(train_dev_labels))
shuffled_train_dev_set = np_train_dev_set[rand_indices]
shuffled_train_dev_labels = np_train_dev_labels[rand_indices]

In [28]:
subset_length = len(shuffled_train_dev_set)
# 80-20 between training, and dev (within training+dev)
train_set = shuffled_train_dev_set[:subset_length*4//5]
train_set_labels = shuffled_train_dev_labels[:subset_length*4//5]

dev_set = shuffled_train_dev_set[subset_length*4//5:]
dev_set_labels = shuffled_train_dev_labels[subset_length*4//5:]

In [11]:
def reviews_to_text(reviews_arr):
    # Produce just the text of reviews
    reviews_text = []
    for review in reviews_arr:
        reviews_text.append(review['text'])
    return reviews_text

def tokenize_text(reviews_text):
    # Returns list of reviews => list of sentences => list of words
    tokenized_reviews = []
    for review in reviews_text:
        raw_sents = sent_tokenizer.tokenize(review)
        tokenized_sents = [nltk.word_tokenize(word) for word in raw_sents]
        tokenized_reviews.append(tokenized_sents)
    return tokenized_reviews

def tokenize_dataset(dataset):
    tokenized_dataset = []
    
    for reviews_arr in dataset:
        tokenized_reviews = tokenize_text(reviews_to_text(reviews_arr))
        tokenized_dataset.append(tokenized_reviews)
    return tokenized_dataset

In [31]:
tokenized_train_set = tokenize_dataset(train_set)
tokenized_dev_set   = tokenize_dataset(dev_set)

###Save this matrix and vector to a temp file, so info is loadable quickly

In [32]:
data_file = open('data_sets2.npz', 'wb+')
np.savez(data_file, train_set, train_set_labels, dev_set, dev_set_labels, tokenized_train_set, tokenized_dev_set, test_set, test_labels)
data_file.close()

#Load the matrices and vectors

In [2]:
# Remember to import dependencies first
load_data = np.load('data_sets.npz')

In [3]:
# arrays in order are
# train_set, train_set_labels, dev_set, dev_set_labels, tokenized_train_set, tokenized_dev_set, test_set, test_labels
load_data.files

['arr_7', 'arr_1', 'arr_2', 'arr_4', 'arr_5', 'arr_0', 'arr_3', 'arr_6']

In [4]:
train_set = load_data['arr_0']
train_set_labels = load_data['arr_1']
dev_set = load_data['arr_2']
dev_set_labels = load_data['arr_3']
tokenized_train_set = load_data['arr_4']
tokenized_dev_set = load_data['arr_5']
test_set = load_data['arr_6']
test_labels = load_data['arr_7']

# Featurization

Notes before you begin:
* Remember to step through each feature definition below
* Then define your feature function, if necessary
* Set your feature to some variable in the featurize() function
* Make sure feature is appended in featurize(): `feature_entry.append(YOUR_FEATURE_VARIABLE)`

In [5]:
def total_reviews(reviews_arr):
    return len(reviews_arr)

def basic_totals(reviews_arr):
    basic_arr = []
    
    chars = 0
    paragraphs = 0
    cool_votes = 0
    funny_votes = 0
    useful_votes = 0
    
    for review in reviews_arr:
        chars += len(review['text'])
        paragraphs += review['text'].count("\n\n")
        cool_votes += review['votes']['cool']
        funny_votes += review['votes']['funny']
        useful_votes += review['votes']['useful']
    
    basic_arr.append(chars)
    basic_arr.append(paragraphs)
    basic_arr.append(cool_votes)
    basic_arr.append(funny_votes)
    basic_arr.append(useful_votes)
    
    return basic_arr

In [6]:
def get_NLP_features(tokenized_reviews):
    NLP_features = []
    
    total_sents = 0
    total_words = 0
    vocabulary = set([])
    
    for review in tokenized_reviews:
        for sent in review:
            total_sents += 1
            total_words += len(sent) - 1
            for word in sent:
                if word.lower() not in vocabulary:
                    vocabulary.add(word.lower())

    vocabulary_size = len(vocabulary)
    
    return [total_sents, total_words, vocabulary_size]

In [7]:
def featurize(dataset, tokenized_dataset):
    feature_array = []
    index = 0
    
    for reviews_arr in dataset:
        feature_entry = []

        review_count = total_reviews(reviews_arr)
        totals = basic_totals(reviews_arr)
        
        totals.extend(get_NLP_features(tokenized_dataset[index]))
        
        averages = [total/review_count for total in totals]

        feature_entry.append(review_count)
        feature_entry.extend(totals)
        feature_entry.extend(averages)
        
        feature_array.append(feature_entry)
        index += 1
    return feature_array

In [50]:
feature_dict = {
    1: "total reviews",
    2: "total characters",
    3: "total paragraphs",
    4: "total cool votes",
    5: "total funny votes",
    6: "total useful votes",
    7: "total sentences",
    8: "total words",
    9: "total size of vocabulary (unique words)",
    10: "chars per review",
    11: "paragraphs per review",
    12: "cool votes per review",
    13: "funny votes per review",
    14: "useful votes per review",
    15: "sentences per review",
    16: "words per review",
    17: "size of vocabulary per review"
}

In [8]:
% time featurized_train = featurize(train_set, tokenized_train_set)

Wall time: 1min 17s


In [9]:
% time featurized_dev = featurize(dev_set, tokenized_train_set)

Wall time: 18.1 s


In [12]:
tokenized_test_set = tokenize_dataset(test_set)

In [13]:
% time featurized_test = featurize(test_set, tokenized_test_set)

Wall time: 16.4 s


In [14]:
% time normalized_train = normalize(featurized_train, norm='l2')

Wall time: 322 ms


In [15]:
% time normalized_dev = normalize(featurized_dev, norm='l2')

Wall time: 71.6 ms


In [16]:
% time normalized_test = normalize(featurized_test, norm='l2')

Wall time: 119 ms


In [24]:
one_index = np.where(train_set_labels == 1)[0]
zero_index = np.where(train_set_labels == 0)[:len(one_index)][0]

subset_train = normalized_train[list(one_index)+list(zero_index)]
subset_labels = train_set_labels[list(one_index)+list(zero_index)]

In [12]:
# Making sure the percentage of elite users is reasonable
def get_elite_ratio(labels):
    elite = 0
    for item in labels:
        if item == 1:
            elite += 1
    return elite / len(labels)
print("training set percentage elite: " +str(get_elite_ratio(train_set_labels)))
print("development set percentage elite: " +str(get_elite_ratio(dev_set_labels)))

training set percentage elite: 0.23506147004969918
development set percentage elite: 0.23492236219813334


#Building the model

In [19]:
def get_model_stats(dev_set, dev_set_labels, clf):
    true_pos = 0
    false_pos = 0
    true_neg = 0
    false_neg = 0
    total = 0
    
    for element in dev_set:
        correct_label = dev_set_labels[total]
        
        if clf.predict(element) == correct_label:
            if correct_label == 0:
                true_neg += 1
            else:
                true_pos += 1
        else:
            if correct_label == 0:
                false_pos += 1
            else:
                false_neg += 1
        total += 1
        
    accuracy = (true_pos + true_neg) / total
    precision = true_pos / (true_pos + false_pos)
    recall = true_pos / (true_pos + false_neg)
    f1_score = 2 * ((precision * recall) / (precision + recall))
    
    print("T Positive: %s, F Positive: %s" % (true_pos, false_pos))
    print("F Negative: %s, T Negative: %s" % (false_neg, true_neg))
    print()
    print("The following metrics are on a scale of 0 to 1:")
    print("Model accuracy: "+str(accuracy))
    print("Model precision: "+str(precision))
    print("Model recall: "+str(recall))
    print("Model F1 Score: "+str(f1_score))

##Gaussian Naive Bayes Model

We can see that a naive bayes model has quite low accuracy as well as precision. Its accuracy is lower than always guessing non-elite, and its precision of 29.47% means that out of the times it guesses elite, it is only correct that percentage of the time, which is definitely not great.

###Normalized Features Attempt

In [113]:
gaussian_clf = GaussianNB()
% time gaussian_clf.fit(normalized_train, train_set_labels)

Wall time: 71.1 ms


GaussianNB()

In [116]:
% time get_model_stats(normalized_dev, dev_set_labels, gaussian_clf)

T Positive: 2296, F Positive: 5496
F Negative: 3317, T Negative: 12784

The following metrics are on a scale of 0 to 1:
Model accuracy: 0.6311471979240781
Model precision: 0.2946611909650924
Model recall: 0.40905041867094244
Model F1 Score: 0.34255874673629244
Wall time: 2.33 s


###Non-normalized Features Attempt

In [117]:
gaussian_clf = GaussianNB()
% time gaussian_clf.fit(featurized_train, train_set_labels)

Wall time: 393 ms


GaussianNB()

In [118]:
% time get_model_stats(featurized_dev, dev_set_labels, gaussian_clf)

T Positive: 1001, F Positive: 761
F Negative: 4612, T Negative: 17519

The following metrics are on a scale of 0 to 1:
Model accuracy: 0.7751224207927008
Model precision: 0.5681044267877412
Model recall: 0.17833600570105113
Model F1 Score: 0.2714576271186441
Wall time: 3.02 s


##SVM Model

###Non-normalized Features Attempt

An SVM-based model was prohibitivly slow to run and not particularly accurate, so we decided against using an SVM Model. The lack of wall time for the learning machine fitting was a result of us changing the featurization a little bit, which caused the machine to take over 45 minutes to run, at which point we just stopped the kernel and decided not to proceed with this model.

In [None]:
svm_clf = svm.SVC()
% time svm_clf.fit(featurized_train, train_set_labels)

In [25]:
% time get_model_accuracy(featurized_dev, dev_set_labels, svm_clf)

Model accuracy: 76.34453605658561 percent.
Wall time: 57.9 s


##Logistic Regression Model

###Subset Normalized Features Attempt

In [25]:
logreg_clf = linear_model.LogisticRegression(C=1)
% time logreg_clf.fit(subset_train, subset_labels)

Wall time: 626 ms


LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr',
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0)

We can see that a linear logistic regression model performs **very poorly**: it has an accuracy of 77.27 percent _when run on the data on which it was trained_. We know that the dev set has 0.2349 (23.49%) of its data labeled "non-elite", so by guessing "non-elite" every time, it will achieve 76.51% accuracy. As we see that it is a poor match when run on that data on which it was trained, we know then that this model must be lacking.

In [26]:
% time get_model_stats(subset_train, subset_labels, logreg_clf)

T Positive: 1206, F Positive: 456
F Negative: 21260, T Negative: 72653

The following metrics are on a scale of 0 to 1:
Model accuracy: 0.7727857703374313
Model precision: 0.7256317689530686
Model recall: 0.05368111813406926
Model F1 Score: 0.09996684350132626
Wall time: 4.63 s


In [27]:
% time get_model_stats(normalized_dev, dev_set_labels, logreg_clf)

T Positive: 396, F Positive: 270
F Negative: 5217, T Negative: 18010

The following metrics are on a scale of 0 to 1:
Model accuracy: 0.7703511488720546
Model precision: 0.5945945945945946
Model recall: 0.07055050774986638
Model F1 Score: 0.12613473483038698
Wall time: 1.2 s


##Random Forests Model

In the end, we found that a random forests model performed the best on this data. We did not achieve an accuracy that was wildly better than baseline, but given all of the metrics we used here, it seems that, on the whole, random forests performed well for this binary classification application. Running the classifier on features without normalization seemed to perform the best.

###Normalized Features Attempt

In [17]:
randfor_clf = RandomForestClassifier(n_estimators=40, max_depth=5)
% time randfor_clf.fit(normalized_train, train_set_labels)

Wall time: 7.48 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [20]:
% time get_model_stats(normalized_dev, dev_set_labels, randfor_clf)

T Positive: 1618, F Positive: 1546
F Negative: 3995, T Negative: 16734

The following metrics are on a scale of 0 to 1:
Model accuracy: 0.7680910726991169
Model precision: 0.511378002528445
Model recall: 0.28825939782647425
Model F1 Score: 0.3686908966617295
Wall time: 54.5 s


###Non-normalized Features Attempt

In [29]:
randfor_clf = RandomForestClassifier(n_estimators=40, max_depth=5)
% time randfor_clf.fit(featurized_train, train_set_labels)

Wall time: 4.35 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=40, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [30]:
% time get_model_stats(featurized_dev, dev_set_labels, randfor_clf)

T Positive: 842, F Positive: 344
F Negative: 4771, T Negative: 17936

The following metrics are on a scale of 0 to 1:
Model accuracy: 0.7859205625078475
Model precision: 0.7099494097807757
Model recall: 0.15000890789239266
Model F1 Score: 0.24768348286512723
Wall time: 55.9 s


In [135]:
# accuracy = correct predictions / total predictions
# It is the number of correct predictions made divided by the total number of predictions made

# precision = true positives / (true positives + false positives)
# Precision can be thought of as a measure of a classifiers exactness.
# A low precision can also indicate a large number of False Positives.

# recall = true positives / (true positives + false negatives)
# Recall can be thought of as a measure of a classifiers completeness.
# A low recall indicates many False Negatives.
# f1 = 2 * ((precision * recall) / (precision + recall))

#Running Model on Test Set (Non-normalized Features)

In [31]:
% time get_model_stats(featurized_test, test_labels, randfor_clf)

T Positive: 1143, F Positive: 574
F Negative: 4680, T Negative: 28373

The following metrics are on a scale of 0 to 1:
Model accuracy: 0.8488927236123095
Model precision: 0.6656959813628421
Model recall: 0.19629057187017002
Model F1 Score: 0.303183023872679
Wall time: 1min 20s


In [32]:
print(randfor_clf.feature_importances_)

[ 0.0033745   0.01645483  0.13675816  0.233239    0.12038536  0.07016558
  0.00295394  0.03919613  0.03617639  0.0198593   0.09358085  0.12706954
  0.03867808  0.01311949  0.00468971  0.02569395  0.0186052 ]


In [74]:
max_vals = sorted(randfor_clf.feature_importances_, reverse=True)[:6]

In [77]:
print("Top 5 features in order from most to least important:")
index = 1
for item in max_vals:
    item_index = np.where(randfor_clf.feature_importances_== item)[0][0]
    print("Rank: %-3d |  Feature: %-22s |   Importance score: %f" % (index, feature_dict[item_index], item))
    index += 1

Top 5 features in order from most to least important:
Rank: 1   |  Feature: total paragraphs       |   Importance score: 0.233239
Rank: 2   |  Feature: total characters       |   Importance score: 0.136758
Rank: 3   |  Feature: paragraphs per review  |   Importance score: 0.127070
Rank: 4   |  Feature: total cool votes       |   Importance score: 0.120385
Rank: 5   |  Feature: chars per review       |   Importance score: 0.093581
