In [None]:
# Execute the shared helper script and load its globals into this kernel.
# NOTE(review): presumably this is what defines train_cnn, train_bow, test_cnn,
# test_bow, tenk_cnn, tenk_bow, train_labels and captions_json — confirm.
%run "../library.py"

## CLUSTERING

In [2]:
from sklearn import preprocessing
from collections import Counter

In [170]:
def _stack_and_normalize(cnn_features, bow_features):
    """Join the two feature blocks side by side, then normalize along axis 0.

    With axis=0, sklearn's normalize scales each feature column (default L2
    norm) rather than each sample row.
    """
    stacked = np.hstack([cnn_features, bow_features])
    return preprocessing.normalize(stacked, axis=0)


# NOTE(review): each set is normalized on its own, so column scales depend on
# how many rows the set has — confirm this is intended before comparing rows
# from different sets in one affinity matrix.
cluster_train = _stack_and_normalize(train_cnn, train_bow)
cluster_test = _stack_and_normalize(test_cnn, test_bow)
cluster_tenk = _stack_and_normalize(tenk_cnn, tenk_bow)

# Row layout matters downstream: the train/test rows come first, tenk rows after.
full_data = np.vstack([cluster_train, cluster_tenk])
full_data_test = np.vstack([cluster_test, cluster_tenk])

In [4]:
# Spectral clustering over the training rows stacked on top of the tenk rows.
from sklearn import cluster

# random_state pins the stochastic parts of the estimator (k-means
# initialisation) so the cluster labels are the same on every run.
spectral = cluster.SpectralClustering(n_clusters=200, random_state=0)
spectral_output = spectral.fit(full_data)
spectral_labels = spectral_output.labels_

In [171]:
# Spectral clustering over the test rows stacked on top of the tenk rows.

# random_state pins the stochastic parts of the estimator (k-means
# initialisation) so the cluster labels are the same on every run.
spectral_test = cluster.SpectralClustering(n_clusters=200, random_state=0)
spectral_output_test = spectral_test.fit(full_data_test)
spectral_labels_test = spectral_output_test.labels_

In [20]:
# spectral.affinity_matrix_ is square over the rows of full_data, i.e. the
# training rows followed by the tenk rows (13k by 13k here). Its top-right
# block therefore holds the affinity of every training row to every tenk row.
# The split point is taken from cluster_train instead of a hard-coded 3000 so
# the slice stays correct if the training set size changes.
n_train_rows = cluster_train.shape[0]
similarity_matrix = spectral.affinity_matrix_[:n_train_rows, n_train_rows:]

In [173]:
# spectral_test.affinity_matrix_ is square over the rows of full_data_test
# (test rows first, tenk rows after); keep the test-vs-tenk block. The split
# point is taken from cluster_test instead of a hard-coded 1000.
n_test_rows = cluster_test.shape[0]
similarity_matrix_test = spectral_test.affinity_matrix_[:n_test_rows, n_test_rows:]

## BAG OF WORDS

In [102]:
from nltk.corpus import stopwords as stopwords_corpus

# Shared text-processing helpers used by preprocess().
port = PorterStemmer()
lmtzr = WordNetLemmatizer()
tokenizer = RegexpTokenizer(r'\w+')
# English stop words, bound to the name `stopwords` that preprocess() reads.
# The corpus reader is imported under its own alias so this name is not reused
# for two different kinds of object, and the words are kept in a set so the
# per-token membership test is a hash lookup rather than a list scan.
stopwords = set(stopwords_corpus.words('english'))

def rstrip(l):
    """Blank out non-ASCII characters in `l`, then drop trailing whitespace.

    Every character whose code point is 128 or above is replaced by a single
    space; the result is right-stripped before being returned.
    """
    ascii_chars = []
    for ch in l:
        ascii_chars.append(ch if ord(ch) < 128 else ' ')
    return ''.join(ascii_chars).rstrip()


def preprocess(l):
    """Turn a group of text lines into a flat list of normalized tokens.

    Each element of `l` is cleaned with rstrip(), the pieces are joined with
    single spaces and ASCII-encoded, and the result is split by the
    module-level `tokenizer`. Tokens are lowercased; those found in
    `stopwords` are dropped, the rest are lemmatized and then stemmed.

    Returns the surviving tokens in their original order (duplicates kept).
    """
    cleaned_parts = [rstrip(part) for part in l]
    line = " ".join(cleaned_parts).encode("ascii")

    processed_words = []
    for token in tokenizer.tokenize(line):
        lowered = token.lower()
        if lowered in stopwords:
            # stop words carry no signal for the bag of words
            continue
        # lemmatize first, then stem the lemma
        processed_words.append(port.stem(lmtzr.lemmatize(lowered)))

    return processed_words

# One token list per entry of captions_json.
# NOTE(review): rows follow the iteration order of captions_json.values();
# verify that this matches the row order of the tenk feature matrices.
words_data = [preprocess(caption_group) for caption_group in captions_json.values()]

In [106]:
def get_dict_of_words(words_data):
    """Collect the vocabulary of a tokenized corpus.

    Parameters
    ----------
    words_data : iterable of iterables of str
        One token collection per document.

    Returns
    -------
    list of str
        Every distinct token exactly once, in sorted order. Sorting makes the
        vocabulary (and therefore the column order of any matrix built from
        it) reproducible instead of depending on set iteration order.
    """
    # Built-in set replaces the deprecated sets.Set.
    vocabulary = set()
    for word_list in words_data:
        vocabulary.update(word_list)

    return sorted(vocabulary)

# Vocabulary list: the distinct tokens found across all rows of words_data.
words_dict = get_dict_of_words(words_data)

In [109]:
# Vocabulary size (number of distinct tokens).
len(words_dict)

7509

In [169]:
def get_word_count_data(words_data, words_dict, ignore = True):
    """Build a bag-of-words count matrix.

    Parameters
    ----------
    words_data : iterable of iterables of str
        One token collection per document.
    words_dict : list of str
        Vocabulary; column j of the result counts occurrences of words_dict[j].
        If a word is repeated in the list, its first position is used.
    ignore : bool, optional
        True (default): no pruning is done, so every token must be present in
        words_dict; an unknown token raises ValueError.
        False: tokens missing from words_dict are dropped, and the remaining
        tokens keep their true occurrence counts.

    Returns
    -------
    numpy.ndarray
        Float array of shape (number of documents, len(words_dict)).

    Raises
    ------
    ValueError
        If ignore is True and a token is not in words_dict.
    """
    # Resolve word -> column once, instead of a linear list.index per word.
    column_of = {}
    for column, word in enumerate(words_dict):
        column_of.setdefault(word, column)

    rows = list(words_data)
    word_counts_data = np.zeros((len(rows), len(words_dict)))

    for row_index, word_list in enumerate(rows):
        for word, count in Counter(word_list).items():
            column = column_of.get(word)
            if column is None:
                if ignore:
                    raise ValueError("%r is not in words_dict" % (word,))
                # pruning requested: skip out-of-vocabulary tokens
                continue
            word_counts_data[row_index, column] = count

    return word_counts_data

# Bag-of-words count matrix: one row per entry of words_data, one column per
# word of words_dict (default ignore=True, i.e. no pruning).
word_counts_data = get_word_count_data(words_data, words_dict)

In [151]:
# L1-normalize each row (axis=1) so a row's word counts sum to 1.
bow_data = normalize(word_counts_data, axis=1, norm='l1')

1.0

## CLASSIFICATION

In [160]:
# Matrix product of the train-vs-tenk affinities with the tenk bag-of-words
# rows: each training row becomes an affinity-weighted sum of bow_data rows.
# NOTE(review): assumes bow_data rows are in the same order as the tenk rows
# behind similarity_matrix's columns — confirm.
print similarity_matrix.shape, bow_data.shape
train_bow_data = similarity_matrix.dot(bow_data)

(3000, 10000) (10000, 7509)


In [None]:
# Same projection for the test rows: test-vs-tenk affinities times bow_data.
print similarity_matrix_test.shape, bow_data.shape
test_bow_data = similarity_matrix_test.dot(bow_data)

In [177]:
# These projected bag-of-words rows are the feature vectors fed to the
# logistic regression below; print both shapes as a sanity check.
print train_bow_data.shape, test_bow_data.shape

(3000, 7509) (1000, 7509)


In [165]:
train_indices, test_indices = [], []
for train_index, test_index in KFold(len(train_bow_data), n_folds=5):
    train_indices = train_index
    test_indices = test_index
x_train, x_test = train_bow_data[train_indices], train_bow_data[test_indices]
y_train, y_test = train_labels[train_indices], train_labels[test_indices]

print x_train.shape, x_test.shape, y_train.shape, y_test.shape

(2400, 7509) (600, 7509) (2400,) (600,)


In [166]:
# logistic regression classifier
lr = LogisticRegression()

# Fit on the training part of the hold-out split only (x_train / y_train)
lr.fit(x_train, y_train)

# Mean accuracy on the held-out part (x_test / y_test), not on the training set
lr.score(x_test, y_test)

0.15833333333333333

## KAGGLE OUTPUT

In [179]:
# logistic regression classifier
lr_cluster = LogisticRegression()

# Train on all labelled rows (no hold-out this time)
lr_cluster.fit(train_bow_data, train_labels)

# Per-class probability estimates for the test rows (a prediction, not an accuracy)
predictions_cluster = lr_cluster.predict_proba(test_bow_data)

In [183]:
# Write the probability matrix to a plain-text file in the working directory.
np.savetxt('predictions_cluster.txt', predictions_cluster)

In [185]:
# Read the file back to check that the saved predictions load cleanly.
pctest = np.loadtxt('predictions_cluster.txt')

In [186]:
# Shape of the reloaded predictions: (rows, columns) of the saved matrix.
pctest.shape

(1000, 200)

In [163]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import csv
import nltk
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from sets import Set
from collections import Counter
from sklearn.preprocessing import normalize
import random
from scipy.spatial import distance
import scipy
import math
import sklearn
from sklearn import cross_validation
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import heapq
import string
from sklearn.decomposition import PCA
from sklearn.cross_validation import KFold