In [1]:
import cv2
import numpy as np
import os
from sklearn.svm import LinearSVC
from sklearn.externals import joblib
from scipy.cluster.vq import *
import seaborn as sns

from sklearn.preprocessing import StandardScaler
import glob
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import pandas as pd
from skimage import data
from skimage.feature import hog

%matplotlib inline
#from helper import *
import helper
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.cross_validation import StratifiedShuffleSplit
from functools import partial
import datetime
import pickle

In [2]:
# sklearn models
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import scale

# sklearn metrics
from sklearn.metrics import *
from sklearn.learning_curve import learning_curve
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split

import helper

In [3]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [4]:
train_folder= 'data/train'
train_label='data/trainLabels.csv'

In [5]:
pwd

u'/home/jshoun01/Data/kaggle/cifar10'

In [6]:
ls

[0m[01;34manalysis[0m/
bag_of_words-checkpoint.ipynb
cookies.txt
[01;34mdata[0m/
fiinal_model.ipynb
helper.pyc
model-checkpoint.ipynb
OCR for Street View Text - Research-checkpoint.ipynb
real_model.ipynb
[01;34mresults[0m/
Untitled3-checkpoint.ipynb
Xtest.pkl
Xtrain.pkl


In [7]:
all_train_df= helper.get_cifar_training_data(train_folder,train_label)
all_train_df.head()

Unnamed: 0,id,label,filename,full_filename
0,1,frog,1.png,data/train/1.png
1,2,truck,2.png,data/train/2.png
2,3,truck,3.png,data/train/3.png
3,4,deer,4.png,data/train/4.png
4,5,automobile,5.png,data/train/5.png


In [8]:
split_idxes=next(iter(StratifiedShuffleSplit(all_train_df['label'],n_iter =1,test_size=0.30,random_state=20)))
train_index, test_index = split_idxes

In [9]:
features=all_train_df[['id','filename','full_filename']]
labels=all_train_df['label']


In [10]:
train_img_gen=helper.load_images(features['full_filename'])

In [11]:
X_train_df,X_test_df,y_train,y_test= helper.get_stratified_train_test_split(
    features,labels)

In [12]:
X_train_df.head()

Unnamed: 0,id,filename,full_filename
1401,1402,1402.png,data/train/1402.png
168,169,169.png,data/train/169.png
46471,46472,46472.png,data/train/46472.png
37950,37951,37951.png,data/train/37951.png
26285,26286,26286.png,data/train/26286.png


In [13]:
X_test_df.head()

Unnamed: 0,id,filename,full_filename
39264,39265,39265.png,data/train/39265.png
18869,18870,18870.png,data/train/18870.png
11313,11314,11314.png,data/train/11314.png
26085,26086,26086.png,data/train/26086.png
22976,22977,22977.png,data/train/22977.png


In [14]:
cch_1= helper.ColorChannelStatistics(sub_regions=1)
cch_2= helper.ColorChannelStatistics(sub_regions=2)
cch_4= helper.ColorChannelStatistics(sub_regions=4)

cc_pipeline = FeatureUnion([("cch_1", cch_1),("cch_2", cch_2),("cch_4", cch_4)])
hog_pipeline= Pipeline([("gt", helper.GrayScaleImageTransform())
                        ,("ht", helper.HogStatistics())])

In [15]:
train_img_gen=helper.load_images(X_train_df['full_filename'])
test_img_gen=helper.load_images(X_test_df['full_filename'])


In [16]:
feature_extractors=[cc_pipeline,hog_pipeline]
#X_train=helper.extract_image_features(train_img_gen,feature_extractors)
#X_test=helper.extract_image_features(test_img_gen,feature_extractors)


In [None]:
print(datetime.datetime.now())
X_train=helper.extract_multiple_image_features(feature_extractors,X_train_df['full_filename'])
print(datetime.datetime.now())

In [None]:
print(datetime.datetime.now())
X_test=helper.extract_multiple_image_features(feature_extractors,X_test_df['full_filename'])
print(datetime.datetime.now())

In [None]:
pickle.dump(X_train,open('Xtrain.pkl',"wb"))
pickle.dump(X_test,open('Xtest.pkl',"wb"))

In [None]:
X_train = pickle.load(open('Xtrain.pkl',"rb"))
X_test = pickle.load(open('Xtest.pkl',"rb"))

In [None]:
len(X_train)

In [None]:
X_train[0]

In [None]:
X_train_simp=map(lambda x:x[0], X_train)
X_test_simp=map(lambda x:x[0], X_test)

In [None]:
models = {'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5), 
          'Gaussian Naive Bayes': GaussianNB(),
          'Random Forest Classifier': RandomForestClassifier()
          
         }

In [None]:
ls

In [None]:
knn_model = KNeighborsClassifier(n_neighbors=5).fit(X_train_simp, y_train)


In [None]:
helper

In [None]:
helper.plot_confusion_matrix

In [None]:
helper.plot_confusion_matrix(knn_model,y_test,X_test_simp)

In [None]:
y_predicted=model.predict(X_test_simp)

In [None]:
label_names = model.classes_
cm = confusion_matrix(y_test, y_predicted)
cm = np.round(cm.astype('float') / cm.sum(axis=1)[:, np.newaxis], 2)
sns.heatmap(cm,  annot=True,  fmt='', xticklabels=label_names, yticklabels=label_names);
print(accuracy_score(y_test, y_predicted))


In [None]:
gm_model=GaussianNB().fit(X_train_simp, y_train)
helper.plot_confusion_matrix(y_test,X_train_simp)

In [None]:
gm_model=GaussianNB().fit(X_train_simp, y_train)
helper.plot_confusion_matrix(y_test,X_train_simp)
y_predicted=model.predict(X_test_simp)
label_names = model.classes_
cm = confusion_matrix(y_test, y_predicted)
sns.heatmap(cm, annot=True,fmt='', xticklabels=label_names, yticklabels=label_names);
print(accuracy_score(y_test, y_predicted))
#print(roc_auc_score(y_test, y_predicted))

In [None]:
model=RandomForestClassifier().fit(X_train_simp, y_train)
y_predicted=model.predict(X_test_simp)
label_names = model.  
cm = confusion_matrix(y_test, y_predicted)
sns.heatmap(cm, annot=True,  fmt='', xticklabels=label_names, yticklabels=label_names);
print(accuracy_score(y_test, y_predicted))
#print(roc_auc_score(y_test, y_predicted))

In [None]:
model=BernoulliNB().fit(X_train_simp, y_train)
y_predicted=model.predict(X_test_simp)
label_names = model.classes_
cm = confusion_matrix(y_test, y_predicted)
sns.heatmap(cm, annot=True,  fmt='', xticklabels=label_names, yticklabels=label_names);
print(accuracy_score(y_test, y_predicted))
#print(roc_auc_score(y_test, y_predicted))

In [None]:
model=SVC().fit(X_train_simp, y_train)
y_predicted=model.predict(X_test_simp)
label_names = model.classes_
cm = confusion_matrix(y_test, y_predicted)
sns.heatmap(cm, annot=True,  fmt='', xticklabels=label_names, yticklabels=label_names);
print(accuracy_score(y_test, y_predicted))
#print(roc_auc_score(y_test, y_predicted))