# In [None]:
import os
import csv
import json

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#==============================================================================
# Load in text data, split into train/test
#==============================================================================

# load in data
# NOTE(review): hard-coded absolute path -- the script only runs where this
# directory exists; consider making it configurable.
os.chdir("/Users/natashal/Projects/consulting/peoplelikeme/analysis/")
reviews = pd.read_json('reviews2_30000.json')

# what's the shape (rows, columns) of the data?
# (inspection values are printed explicitly: a bare expression shows nothing
# when the file is run as a script)
print(reviews.shape)
print(type(reviews))

# get just the body and rating out for now
reviews_br = reviews[["body", "rating"]]
print(reviews_br.head())

# drop entries with null values, check shape afterwards
reviews_br = reviews_br.dropna(how='any')
print(reviews_br.shape)

# split my data into train and test sets
# train_test_split lives in sklearn.model_selection since scikit-learn 0.18;
# sklearn.cross_validation was removed in 0.20, so only fall back to it on
# old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
train, test = train_test_split(reviews_br, test_size=0.2, random_state=42)
print(train.shape, test.shape)
#==============================================================================
# Process description fields of train set
#==============================================================================

# tokenize the text using CountVectorizer (unigram configuration)
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(lowercase=True, stop_words='english', strip_accents='unicode')
print(count_vect.get_stop_words())

# if wanting to use n-grams (this replaces the unigram vectoriser above and is
# the one actually fitted below)
count_vect = CountVectorizer(analyzer='word', ngram_range=(1,2), lowercase=True, stop_words='english', strip_accents='unicode')

# fit the count vectoriser
X_train_counts = count_vect.fit_transform(train.body)
print(X_train_counts.shape)

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# when it exists and fall back on older installations.
if hasattr(count_vect, "get_feature_names_out"):
    feature_names = count_vect.get_feature_names_out()
else:
    feature_names = count_vect.get_feature_names()
print(feature_names[100:1000])

# get term frequencies (tf), scale by inverse document frequencies (idf)
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer(use_idf=True, smooth_idf=True)

X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

# explore the matrix by indexing the sparse result directly: a dense copy
# holds rows x vocabulary floats and can exhaust memory with a bigram
# vocabulary, while the sparse matrix supports the same [row, col] lookup.
print(X_train_tfidf[1, 1])

#==============================================================================
# Training classifiers
#==============================================================================
###################### 1) Start with naive bayes
from sklearn.naive_bayes import MultinomialNB
classifier_NB = MultinomialNB()
# train NB classifier
classifier_NB_fit = classifier_NB.fit(X_train_tfidf, train.rating)

# predict ratings on test set using model; the test text goes through the
# vectoriser and tf-idf transformer that were fitted on the training set
test_counts = count_vect.transform(test.body)
test_tfidf = tfidf_transformer.transform(test_counts)
predicted_nb = classifier_NB_fit.predict(test_tfidf)

## get accuracy i.e. how often the predicted value equals the target value
print("NB accuracy", np.mean(predicted_nb == test.rating))

####################### 2) train the linear SVM
from sklearn.linear_model import SGDClassifier
svm_params = dict(loss='hinge', penalty='l2', alpha=1e-3, random_state=42)
try:
    # n_iter was replaced by max_iter/tol (removed in scikit-learn 0.21);
    # max_iter=5 with tol=None runs exactly five epochs, like n_iter=5 did.
    classifier_svm = SGDClassifier(max_iter=5, tol=None, **svm_params)
except TypeError:
    # older scikit-learn that only knows n_iter
    classifier_svm = SGDClassifier(n_iter=5, **svm_params)
classifier_svm_fit = classifier_svm.fit(X_train_tfidf, train.rating)

# predict ratings on test set using model
predicted_svm = classifier_svm_fit.predict(test_tfidf)

# get accuracy again, to compare to NB
print("SVM accuracy", np.mean(predicted_svm == test.rating))

#==============================================================================
# Detailed performance metrics
#==============================================================================
# classification report for the SVM predictions against the true ratings
from sklearn import metrics
report = metrics.classification_report(y_true=test.rating, y_pred=predicted_svm)
print(report)

# confusion matrix for the same predictions
confusion = metrics.confusion_matrix(y_true=test.rating, y_pred=predicted_svm)
print(confusion)

def plot_confusion_matrix(confusion_matrix, title="Confusion matrix", labels=(1, 2, 3, 4, 5)):
    """Draw a confusion matrix as a colour-coded grid with a colour bar.

    Args:
        confusion_matrix: 2-D array-like handed to ``plt.matshow``.
        title: Text used as the plot title.
        labels: Class labels written along both axes, in matrix order.
            Defaults to the ratings 1-5 that were previously hard-coded;
            pass the actual classes when the matrix is not 5 x 5.
    """
    tick_labels = list(labels)
    # one tick per class, centred on each row/column of the grid
    tick_positions = list(range(len(tick_labels)))

    plt.matshow(confusion_matrix)
    plt.xticks(tick_positions, tick_labels)
    plt.yticks(tick_positions, tick_labels)
    plt.colorbar()
    plt.title(title)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

plot_confusion_matrix(confusion)

# Normalize the confusion matrix by row (i.e by the number of samples
# in each class). A row that sums to zero is left as zeros rather than
# being divided by zero, which would yield NaN and a runtime warning.
row_totals = confusion.sum(axis=1, keepdims=True)
confusion_normalized = np.divide(
    confusion.astype('float'),
    row_totals,
    out=np.zeros(confusion.shape, dtype=float),
    where=row_totals != 0,
)
print(confusion_normalized)
plot_confusion_matrix(confusion_normalized, title="Normalised confusion matrix")

















