In [None]:
# Run the shared library script before anything else.
# NOTE(review): names used below without a visible definition (np,
# preprocessing, svm, KFold, train_cnn, train_bow, train_labels,
# captions_json, ...) presumably come from this script -- TODO confirm.
%run "../library.py"

## CLUSTERING

In [32]:
# Join the *_cnn and *_bow arrays side by side for each of the three splits.
cluster_train = np.hstack((train_cnn, train_bow))
cluster_test = np.hstack((test_cnn, test_bow))
cluster_tenk = np.hstack((tenk_cnn, tenk_bow))

# Stack the splits in the order train, test, tenk and normalize along axis 0.
# Later cells slice the affinity matrix by this row order.
full_data = preprocessing.normalize(
    np.vstack((cluster_train, cluster_test, cluster_tenk)),
    axis=0,
)

In [33]:
# spectral clustering
from sklearn import cluster

# A fixed random_state makes the cluster labels reproducible across kernel
# restarts; without a seed, repeated runs are not guaranteed to agree.
spectral = cluster.SpectralClustering(n_clusters=200, random_state=0)
spectral_output = spectral.fit(full_data)
spectral_labels = spectral_output.labels_

In [36]:
# we have the 14k by 14k affinity matrix. we can use this to get 
# affinity of training data with tenk data and test data with tenk data
similarity_matrix = spectral.affinity_matrix_[:3000,4000:]
similarity_matrix_test = spectral.affinity_matrix_[3000:4000,4000:]
print similarity_matrix.shape, similarity_matrix_test.shape

(3000, 10000) (1000, 10000)
(3000, 10000) (1000, 10000)


## BAG OF WORDS

In [40]:
# Import the corpus reader under an alias so that the `stopwords` name below
# can hold the word collection without overwriting the imported reader.
from nltk.corpus import stopwords as stopwords_corpus

port = PorterStemmer()
lmtzr = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')  # a token is a run of word characters
# preprocess() tests every token for membership in `stopwords`; a frozenset
# makes that a hash lookup instead of a scan over the whole list.
stopwords = frozenset(stopwords_corpus.words('english'))

def rstrip(l):
    """Blank out non-ASCII characters in ``l`` and drop trailing whitespace.

    Every character whose code point is 128 or above is replaced by a single
    space; the result is then right-stripped.
    """
    ascii_chars = []
    for ch in l:
        if ord(ch) >= 128:
            ch = ' '
        ascii_chars.append(ch)
    return ''.join(ascii_chars).rstrip()


def preprocess(l):
    """Turn an iterable of text pieces into a list of normalized words.

    Each piece of ``l`` is cleaned with ``rstrip``; the pieces are joined
    with single spaces, encoded as ASCII and split by the module-level
    ``tokenizer``. Every token is lowercased, dropped if it appears in
    ``stopwords``, and otherwise lemmatized (``lmtzr``) and then stemmed
    (``port``).

    Returns the surviving words in their original order, repeats included.
    """
    cleaned_pieces = [rstrip(piece) for piece in l]
    line = " ".join(cleaned_pieces).encode("ascii")

    processed_words = []
    for token in tokenizer.tokenize(line):
        lowered = token.lower()
        if lowered in stopwords:
            # stop words carry no signal for the bag of words
            continue
        # lemmatize first, then stem the lemma
        processed_words.append(port.stem(lmtzr.lemmatize(lowered)))

    return processed_words

# One processed word list per entry of captions_json, in .values() order.
# NOTE(review): later cells multiply against rows derived from this list, so
# that order presumably has to match the tenk row order -- confirm.
words_data = [preprocess(entry) for entry in captions_json.values()]

In [41]:
def get_dict_of_words(words_data):
    """Return the vocabulary: every distinct word found in ``words_data``.

    Parameters
    ----------
    words_data : iterable of iterables
        One inner iterable of (hashable) words per document.

    Returns
    -------
    list
        Each distinct word exactly once, in sorted order, so that the
        position of a word is the same on every run.
    """
    # The built-in set replaces the deprecated ``sets.Set``.
    vocabulary = set()
    for words in words_data:
        vocabulary.update(words)
    return sorted(vocabulary)

# Vocabulary: every distinct processed word across all of words_data.
words_dict = get_dict_of_words(words_data)

In [42]:
# Vocabulary size; each bag-of-words count row has this many entries.
print "total number of words/features:", len(words_dict)

total number of words/features: 7509
total number of words/features: 7509


In [43]:
def get_word_count_data(words_data, words_dict, ignore=True):
    """Build a bag-of-words count matrix.

    Parameters
    ----------
    words_data : sequence of word lists
        One list of words per document.
    words_dict : list
        Vocabulary; the position of a word in this list is its column.
    ignore : bool, default True
        When True, no pruning is done and every word is expected to be in
        ``words_dict``. When False, words that are not in ``words_dict``
        are dropped before counting.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(len(words_data), len(words_dict))`` where
        entry ``[i, j]`` is how often ``words_dict[j]`` occurs in
        ``words_data[i]``.

    Raises
    ------
    ValueError
        If ``ignore`` is True and a word is missing from ``words_dict``.
    """
    # Resolve each word's column once instead of scanning words_dict with
    # .index() for every word. setdefault keeps the first position of a
    # repeated vocabulary entry, which is what list.index would return.
    column_of = {}
    for column, word in enumerate(words_dict):
        column_of.setdefault(word, column)

    # Preallocating keeps the result two-dimensional even with no documents.
    word_counts_data = np.zeros((len(words_data), len(words_dict)))

    for row, word_list in enumerate(words_data):
        if not ignore:
            # Filter instead of intersecting as sets: a set would collapse
            # repeated words and turn every count into 1.
            word_list = [w for w in word_list if w in column_of]

        for word, count in Counter(word_list).items():
            if word not in column_of:
                raise ValueError("%r is not in words_dict" % (word,))
            word_counts_data[row, column_of[word]] = count

    return word_counts_data

# bag of words data: one row of vocabulary word counts per entry of words_data
word_counts_data = get_word_count_data(words_data, words_dict)

In [44]:
# L1-normalize the count matrix along axis 1 (one document per row).
bow_data = preprocessing.normalize(word_counts_data, norm='l1', axis=1)

## CLASSIFICATION

In [46]:
# take dot product
print similarity_matrix.shape, bow_data.shape
train_bow_data = similarity_matrix.dot(bow_data)
train_bow_data = preprocessing.scale(train_bow_data, axis=0)

(3000, 10000) (10000, 7509)
(3000, 10000) (10000, 7509)


In [47]:
print similarity_matrix_test.shape, bow_data.shape
test_bow_data = similarity_matrix_test.dot(bow_data)
test_bow_data = preprocessing.scale(test_bow_data, axis=0)

(1000, 10000) (10000, 7509)
(1000, 10000) (10000, 7509)


In [48]:
# we can use this as our feature vector to do a logistic regression
# NOTE(review): the cells that follow fit a linear SVM (svm.SVC) on these
# features, not a logistic regression -- confirm which is intended.
print train_bow_data.shape, test_bow_data.shape

(3000, 7509) (1000, 7509)
(3000, 7509) (1000, 7509)


In [49]:
train_indices, test_indices = [], []
for train_index, test_index in KFold(len(train_bow_data), n_folds=5):
    train_indices = train_index
    test_indices = test_index
x_train, x_test = train_bow_data[train_indices], train_bow_data[test_indices]
y_train, y_test = train_labels[train_indices], train_labels[test_indices]

print x_train.shape, x_test.shape, y_train.shape, y_test.shape

# save the train and test indices so that the same values are used in the classification section
np.savetxt('train_indices.txt', train_indices)
np.savetxt('test_indices.txt', test_indices)

(2400, 7509) (600, 7509) (2400,) (600,)
(2400, 7509) (600, 7509) (2400,) (600,)


In [53]:
# svm classifier on the single fold from training data
svmmodel_bow_data = svm.SVC(kernel='linear', probability=True).fit(x_train,y_train) 
score_bow = svmmodel_bow_data.score(x_test,y_test) 
print score_bow

0.291666666667
0.291666666667


## KAGGLE OUTPUT

In [54]:
# Refit the linear-kernel SVM on every training row, then write the
# predict_proba output for the test rows to predictions_cluster.txt.
svmmodel_bow_data_full = svm.SVC(probability=True, kernel='linear')
svmmodel_bow_data_full.fit(train_bow_data, train_labels)
predictions_cluster_full = svmmodel_bow_data_full.predict_proba(test_bow_data)
np.savetxt('predictions_cluster.txt', predictions_cluster_full)

In [57]:
# Class predictions of the single-fold model (svmmodel_bow_data) on its
# held-out rows x_test; y_test holds the matching true labels.
predicted_labels = svmmodel_bow_data.predict(x_test)
# confusion_matrix(y_test, svmmodel_bow_data.predict(x_test))

In [77]:
# confusion_matrix(y_test, predicted_labels)
# np.savetxt("ytestcluster.txt", y_test, delimiter=" ", fmt="%s")
# np.savetxt('predictedlabelscluster.txt', predicted_labels, delimiter=" ", fmt="%s")
# np.genfromtxt()


In [78]:
# NOTE(review): `words` is not assigned at top level in any cell shown here
# (it only appears as a local inside preprocess), so this display presumably
# relies on library.py or on leftover kernel state -- confirm, and consider
# showing a short slice instead of the whole string.
words

'lawn\ndowntown\nfountain\nfood_court\nfastfood_restaurant\ncoffee_shop\nwindmill\nrainforest\nfort\ncomputer_room\nbus_interior\nkitchen\ntower\ngift_shop\ncastle\nlawn\nlimousine_interior\ncottage_garden\ncomputer_room\ndining_car\nairfield\nrock_arch\nwatering_hole\ncreek\nchapel\nshopfront\nbaseball_field\nnightclub\nfort\nglacier\nnightclub\nfarm\nmanhole\nbanquet_hall\nshower\nfort\noffice\nmountain_road\nauditorium\nairplane_cabin\nlab_classroom\nforest_road\nsubmarine_interior\nharbor\nrestaurant\nocean\ntea_garden\njewelry_shop\nbeach\nski_resort\nshopfront\npiano_store\ndining_room\nfootball_field\nsandbar\nwatering_hole\ntower\nswamp\ngorge\nriver\namusement_park\nkindergarden_classroom\nshoe_shop\npet_shop\nplaza\ngas_station\nart_gallery\nbarn\nranch\nlecture_room\namusement_park\nhayfield\ncanteen\ncomputer_room\nreception\nbar\nclassroom\nmountain_road\nchapel\namphitheater\nmusic_studio\nreading_room\narmory\nlab_classroom\nhospital\nharbor\nsawmill\nfreeway\ncemetery\n

'lawn\ndowntown\nfountain\nfood_court\nfastfood_restaurant\ncoffee_shop\nwindmill\nrainforest\nfort\ncomputer_room\nbus_interior\nkitchen\ntower\ngift_shop\ncastle\nlawn\nlimousine_interior\ncottage_garden\ncomputer_room\ndining_car\nairfield\nrock_arch\nwatering_hole\ncreek\nchapel\nshopfront\nbaseball_field\nnightclub\nfort\nglacier\nnightclub\nfarm\nmanhole\nbanquet_hall\nshower\nfort\noffice\nmountain_road\nauditorium\nairplane_cabin\nlab_classroom\nforest_road\nsubmarine_interior\nharbor\nrestaurant\nocean\ntea_garden\njewelry_shop\nbeach\nski_resort\nshopfront\npiano_store\ndining_room\nfootball_field\nsandbar\nwatering_hole\ntower\nswamp\ngorge\nriver\namusement_park\nkindergarden_classroom\nshoe_shop\npet_shop\nplaza\ngas_station\nart_gallery\nbarn\nranch\nlecture_room\namusement_park\nhayfield\ncanteen\ncomputer_room\nreception\nbar\nclassroom\nmountain_road\nchapel\namphitheater\nmusic_studio\nreading_room\narmory\nlab_classroom\nhospital\nharbor\nsawmill\nfreeway\ncemetery\n

In [51]:
# imports
import numpy as np
import matplotlib.pyplot as plt
import csv
import nltk
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sets import Set
from collections import Counter
from sklearn.preprocessing import normalize
import random
from scipy.spatial import distance
import scipy
import math
import sklearn
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import heapq
import string
from sklearn.decomposition import PCA
from sklearn.cross_validation import KFold
from sklearn import preprocessing
from collections import Counter
from sklearn import svm