## Enable inline plotting

In [None]:
# this is a so-called 'magic function' allowing plotting within your notebook
% matplotlib notebook

## Import relevant general modules

In [None]:
import sklearn
from sklearn import datasets
from sklearn import metrics
import skimage
from skimage import io
import re
import numpy as np
import os.path
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.colors import ListedColormap

## Import our local modules

In [None]:
import cs4125_util
import stupid_systems
# you will similarly import groupXX if you write your code in groupXX.py

## For the pattern recognition people

In [None]:
# load a famous classification dataset (the Iris data that ships with scikit-learn)
# and list the fields of the returned dictionary-like object
iris_data = datasets.load_iris()
iris_data.keys()

In [None]:
# what features does this dataset have?
# (these names are reused as the axis labels of the scatter plot further down)
iris_data['feature_names']

In [None]:
# what does the data look like?
# (the plot further down indexes it as [:, 0] and [:, 1], i.e. one column per feature)
iris_data['data']

In [None]:
# scatter-plot the first feature against the second, coloured by class label

# discrete colours for class (0, 1, 2): red, blue, green
cm_bright = ListedColormap(['#FF0000', '#0000FF', '#00FF00'])
# a continuous red-blue map is kept available as well (the scatter call below uses cm_bright)
colormap = cm.RdBu

plt.figure()
plt.scatter(
    iris_data['data'][:, 0],
    iris_data['data'][:, 1],
    c=iris_data['target'],
    cmap=cm_bright,
)
plt.ylabel(iris_data['feature_names'][1])
plt.xlabel(iris_data['feature_names'][0])
plt.show()

## Onwards to our own examples

In [None]:
# set the path to the PhotoTweet dataset folder
# (the value below is a placeholder: replace it with the real location before running the following cells)
data_folder = 'your/data/folder'

In [None]:
# load the dataset using the utility code
# (the result is queried with .keys() and indexed with ['target'] in later cells)
phototweet_dataset = cs4125_util.initialize_phototweet_dataset(data_folder)

In [None]:
# inspect keys to see which fields the loaded dataset provides
phototweet_dataset.keys()

## Consider some bogus features

In [None]:
# compute two deliberately simplistic features with the stupid_systems helpers
# (presumably one entry per dataset item, judging by how they are plotted against
# phototweet_dataset['target'] later on -- confirm against stupid_systems)
red_pixel_data = stupid_systems.get_ten_red_pixels(phototweet_dataset, data_folder)
token_count = stupid_systems.count_tokens(phototweet_dataset, data_folder)

In [None]:
# sum along axis 1 of the red pixel data
# (assumes red_pixel_data is 2-D with one row per item, so this yields one total per item -- confirm)
sum_first_ten_red = np.sum(red_pixel_data, axis=1)

In [None]:
# np.sum can always be explored again on a small dummy array:
# start from a 3x5 array filled with zeros and confirm its shape
test = np.zeros(shape=(3, 5))
test.shape

In [None]:
# set the entry at row 2, column 3 to 5, then display the whole array
test[2,3] = 5
test

In [None]:
# summing along axis=1 adds up the values within each row: one total per row (3 totals for the 3x5 array)
np.sum(test, axis=1)

In [None]:
# can we see any separable information?

# two-colour map for the scatter points: red and blue
cm_bright = ListedColormap(['#FF0000', '#0000FF'])

plt.figure()
plt.scatter(
    token_count,
    sum_first_ten_red,
    c=phototweet_dataset['target'],
    cmap=cm_bright,
)
plt.ylabel('sum of first ten red pixels')
plt.xlabel('token count')
plt.show()

In [None]:
# ...not really, but let's make a local function that makes a stupid label prediction based on this info

def give_bogus_heuristic_answers(token_count, sum_first_ten_red):
    """Predict a label for every item with a deliberately naive threshold rule.

    An item is labelled 0 when its token count is below 200 AND the sum of
    its first ten red pixel values is below 1000; every other item is
    labelled 1.

    Parameters
    ----------
    token_count : sequence of numbers
        One value per item; its length determines the length of the output.
    sum_first_ten_red : sequence of numbers
        One value per item, indexed in parallel with ``token_count``.

    Returns
    -------
    numpy.ndarray
        Float array with ``len(token_count)`` entries, each 0 or 1.
    """
    # Allocate the output up front; without this the assignments in the loop
    # refer to an undefined name and raise NameError.
    bogus_heuristic_answers = np.zeros(len(token_count))
    for i in range(len(token_count)):
        if token_count[i] < 200 and sum_first_ten_red[i] < 1000:
            bogus_heuristic_answers[i] = 0
        else:
            bogus_heuristic_answers[i] = 1
    return bogus_heuristic_answers

In [None]:
# apply the threshold heuristic to the two bogus features computed earlier
bogus_answers = give_bogus_heuristic_answers(token_count, sum_first_ten_red)

## Evaluate your results

In [None]:
# If you used a heuristic without training: directly compare the predicted system output to the actual output.
# Note that we actually use 'training' (observed) and validation data intermingled here.
# If you ever implement a heuristic in the future, it is better to hold out validation data to increase generalizability.

predicted_labels = stupid_systems.always_say_one(phototweet_dataset)

# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare print statement is a SyntaxError on Python 3
print(metrics.confusion_matrix(phototweet_dataset['target'], predicted_labels))
print(metrics.accuracy_score(phototweet_dataset['target'], predicted_labels))

In [None]:
# Training example: a Naive Bayesian classifier applied to the 10-D red pixel data.

# bring in the classifier class (the sklearn documentation describes many more options)
from sklearn.naive_bayes import GaussianNB

# features first ...
data = stupid_systems.get_ten_red_pixels(phototweet_dataset, data_folder)

# ... then a freshly created classifier, handed to the k-fold validation utility together with the labels
gnb = GaussianNB()
cs4125_util.validate_kfold(data, phototweet_dataset['target'], gnb)