In [2]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import gensim as gs
import nltk.data
from nltk.corpus import stopwords
from nltk.stem.porter import *
import string
import re
from collections import Counter
import scipy as scp
import matplotlib.cm as cm
from sklearn.feature_extraction import DictVectorizer
from sklearn.svm import SVC
from sklearn.decomposition import PCA, IncrementalPCA, KernelPCA
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor, RandomForestRegressor, AdaBoostRegressor, RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn import decomposition, pipeline, metrics, grid_search
from sklearn import cross_validation
from sklearn.preprocessing import PolynomialFeatures
import itertools
from sklearn.cross_validation import StratifiedKFold
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge, ElasticNet 
import sys
import os
from sklearn.naive_bayes import GaussianNB
from gensim.models import Word2Vec
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics.pairwise import pairwise_distances
from HTMLParser import HTMLParser

In [None]:
# basic preprocessing
# Builtin int/str/float are used instead of np.int/np.str/np.float: the NumPy
# names were plain aliases of the builtins and no longer exist in recent NumPy.
col_types = {u'id': int, u'query': str, u'product_title': str,
             u'product_description': str, u'median_relevance': int, u'relevance_variance': float}
srt_lowerizer = lambda s: s.lower()
# The three free-text columns are lower-cased while the CSV is parsed.
convertors = {u'query': srt_lowerizer, u'product_title': srt_lowerizer, u'product_description': srt_lowerizer}


def _read_raw_csv(csv_path):
    """Read one raw CSV with the shared dtypes/converters, indexed by `id`."""
    return pd.read_csv(csv_path, dtype=col_types, converters=convertors, index_col=u'id')


# Train and test rows are stacked into one frame so every feature below is
# computed for both at once.
df = pd.concat([_read_raw_csv('./../data/raw/train.csv'),
                _read_raw_csv('./../data/raw/test.csv')])

# Rows with a missing median_relevance are treated as the test rows;
# both index arrays are positional.
relevance_missing = df.median_relevance.isnull().values
idx_row_train = np.where(~relevance_missing)[0]
idx_row_test = np.where(relevance_missing)[0]

y_train = df['median_relevance'].values[idx_row_train].astype(int)

df.product_description = df.product_description.fillna('')

# Character-length features of the raw (lower-cased) texts.
df['q_len'] = df['query'].apply(len)
df['t_len'] = df['product_title'].apply(len)
df['d_len'] = df['product_description'].apply(len)
df['d_loglen'] = df['product_description'].apply(lambda s: np.log(len(s) + 1))
df['d_exist'] = df['product_description'].apply(lambda s: 1.0 if len(s) > 0 else 0.0)


# remove html and non-ascii
class MLStripper(HTMLParser):
    """HTML parser that keeps only the text nodes of the markup fed to it.

    Every chunk of character data seen by the parser is appended to `fed`;
    `get_data` returns the chunks joined by single spaces.
    """

    def __init__(self):
        # Run the base initializer instead of calling reset() directly, so any
        # parser state the base class sets up in __init__ exists before feed()
        # is called. The explicit-class call (not super()) also works when
        # HTMLParser is an old-style class.
        HTMLParser.__init__(self)
        self.fed = []

    def handle_data(self, d):
        """Collect one chunk of text found between tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all collected text chunks joined by a single space."""
        return ' '.join(self.fed)

def strip_tags(html):
    """Return the text content of `html` with all markup removed.

    A fresh MLStripper is used per call; the text chunks it collects are
    joined by single spaces.
    """
    stripper = MLStripper()
    stripper.feed(html)
    text_only = stripper.get_data()
    return text_only

# For each raw text column: drop bytes that are not ASCII, then strip the HTML
# markup. Results go to the short-named columns q / t / d.
for raw_col, stripped_col in (('query', 'q'),
                              ('product_title', 't'),
                              ('product_description', 'd')):
    df[stripped_col] = df[raw_col].apply(
        lambda s: strip_tags(s.decode('ascii', 'ignore')))


# remove punctuation
# c = clean
# `puncts` holds every ASCII punctuation character plus tab and newline;
# `r` matches a run of one or more whitespace/punctuation characters.
punct_chars = set(string.punctuation + '\t\n')
puncts = ''.join(punct_chars)
r = re.compile('[\\s%s]+' % re.escape(puncts))

def clean_string(s):
    """Replace every run of whitespace/punctuation in `s` with one space.

    The text is split on the module-level pattern `r`, the pieces are joined
    by single spaces, and leading/trailing spaces are stripped.
    """
    pieces = r.split(s)
    return ' '.join(pieces).strip()

df['qc'] = df['q'].apply(clean_string)
df['tc'] = df['t'].apply(clean_string)
df['dc'] = df['d'].apply(clean_string)

# s = sentences
# A sentence boundary is any run of newlines or of the terminators . ! ?
# This must be a character class: the bare pattern '\n.!?' means
# "newline, then any one character, then an optional '!'", which does not
# split on sentence terminators and swallows the first character of each line.
sentence_boundary = re.compile(r'[\n.!?]+')

# Each description becomes a list of cleaned, non-empty sentences.
df['dcs'] = df['d'].apply(
    lambda d: [s for s in map(clean_string, sentence_boundary.split(d)) if len(s) > 0])
df['d_numsent'] = df['dcs'].apply(len)
df['d_lognumsent'] = df['d_numsent'].apply(lambda x: np.log(x + 1))


# remove stop words
# ns = no stop words
# NLTK's English stop-word list, stored as a set so the per-token membership
# test in remove_stop_words is a hash lookup.
stopword_set = set(stopwords.words('english'))

def remove_stop_words(s):
    """Return `s` without the tokens found in `stopword_set`.

    Tokens are obtained by splitting on single spaces and are re-joined
    with single spaces.
    """
    kept = []
    for token in s.split(' '):
        if token in stopword_set:
            continue
        kept.append(token)
    return ' '.join(kept)

# Stop-word-free versions of the cleaned query / title / description.
for cleaned_col, no_stop_col in (('qc', 'qcns'), ('tc', 'tcns'), ('dc', 'dcns')):
    df[no_stop_col] = df[cleaned_col].apply(remove_stop_words)

# Same treatment for every sentence of the description.
df['dcsns'] = df['dcs'].apply(
    lambda sentences: [remove_stop_words(sentence) for sentence in sentences])


# stemming
# nss = no stop words and stemmed
# One shared Porter stemmer instance, used by stem() for every token.
stemmer = PorterStemmer()

def stem(s):
    """Stem every single-space-separated token of `s` with the shared stemmer.

    Returns the stems joined by single spaces.
    """
    stems = []
    for token in s.split(' '):
        stems.append(stemmer.stem(token))
    return ' '.join(stems)

# Stemmed versions of the stop-word-free query / title / description.
for no_stop_col, stemmed_col in (('qcns', 'qcnss'), ('tcns', 'tcnss'), ('dcns', 'dcnss')):
    df[stemmed_col] = df[no_stop_col].apply(stem)

# Stem each description sentence as well.
df['dcsnss'] = df['dcsns'].apply(
    lambda sentences: [stem(sentence) for sentence in sentences])


#sop - Simpson Overlap coefficient: f(A, B) = n(A ∩ B) / min( n(A), n(B))

def overlap_coefficient(A, B):
    """Overlap coefficient of two sets: |A & B| / min(|A|, |B|).

    Returns 0 when either set is empty.
    """
    smaller_size = min(len(A), len(B))
    if smaller_size == 0:
        return 0
    shared = A.intersection(B)
    return len(shared) / float(smaller_size)

# Query vs. whole title / whole description.
df['sop_qt_cnss'] = df.apply(
    lambda row: overlap_coefficient(set(row['qcnss'].split(' ')),
                                    set(row['tcnss'].split(' '))),
    axis=1)

df['sop_qd_cnss'] = df.apply(
    lambda row: overlap_coefficient(set(row['qcnss'].split(' ')),
                                    set(row['dcnss'].split(' '))),
    axis=1)


def _sop_per_sentence(row):
    """Overlap coefficient between the query and each description sentence."""
    query_tokens = set(row['qcnss'].split(' '))
    return [overlap_coefficient(query_tokens, set(sentence.split(' ')))
            for sentence in row['dcsnss']]


# Scratch column holding the per-sentence scores of each row.
df['tmp_col'] = df.apply(_sop_per_sentence, axis=1)

# Summaries of the per-sentence scores; 0 when a row has no sentences.
for stat_col, stat_fn in (('sop_min_qds_cnss', np.min),
                          ('sop_max_qds_cnss', np.max),
                          ('sop_mean_qds_cnss', np.mean),
                          ('sop_median_qds_cnss', np.median)):
    df[stat_col] = df['tmp_col'].apply(
        lambda v, stat_fn=stat_fn: stat_fn(v) if len(v) > 0 else 0)


#jsc - Jaccard similarity coefficient: f(A, B) = n(A ∩ B) / n(A ∪ B)

def jaccard_coefficient(A, B):
    """Jaccard similarity of two sets: |A & B| / |A | B|.

    Returns 0 when both sets are empty.
    """
    union_size = len(A.union(B))
    if union_size == 0:
        return 0
    shared_size = len(A.intersection(B))
    return shared_size / float(union_size)

# Query vs. whole title / whole description.
df['jsc_qt_cnss'] = df.apply(
    lambda row: jaccard_coefficient(set(row['qcnss'].split(' ')),
                                    set(row['tcnss'].split(' '))),
    axis=1)

df['jsc_qd_cnss'] = df.apply(
    lambda row: jaccard_coefficient(set(row['qcnss'].split(' ')),
                                    set(row['dcnss'].split(' '))),
    axis=1)


def _jsc_per_sentence(row):
    """Jaccard coefficient between the query and each description sentence."""
    query_tokens = set(row['qcnss'].split(' '))
    return [jaccard_coefficient(query_tokens, set(sentence.split(' ')))
            for sentence in row['dcsnss']]


# Scratch column holding the per-sentence scores of each row.
df['tmp_col'] = df.apply(_jsc_per_sentence, axis=1)

# Summaries of the per-sentence scores; 0 when a row has no sentences.
for stat_col, stat_fn in (('jsc_min_qds_cnss', np.min),
                          ('jsc_max_qds_cnss', np.max),
                          ('jsc_mean_qds_cnss', np.mean),
                          ('jsc_median_qds_cnss', np.median)):
    df[stat_col] = df['tmp_col'].apply(
        lambda v, stat_fn=stat_fn: stat_fn(v) if len(v) > 0 else 0)

# The scratch column is no longer needed.
df = df.drop('tmp_col', axis=1)

# LSA
def cos_sim(A, B):
    """Row-wise cosine similarity between two equally shaped 2-D arrays.

    Row i of the result is cos(A[i], B[i]). Rows where the division yields
    NaN (a zero vector on either side) are reported as 0.
    """
    dot_products = np.sum(A * B, axis=1)
    squared_norm_product = np.sum(A ** 2, axis=1) * np.sum(B ** 2, axis=1)
    sims = dot_products / np.sqrt(squared_norm_product)
    sims[np.isnan(sims)] = 0
    return sims

ngram_range_max_grid = [1, 2]
svd_n_components_grid = [100, 200, 300, 400, 500]


def _fit_tfidf(corpus, ngram_range_max):
    """Fit a word-level TF-IDF model over 1..ngram_range_max grams on `corpus`."""
    tfv = TfidfVectorizer(analyzer='word', ngram_range=(1, ngram_range_max),
                          sublinear_tf=1, use_idf=1, smooth_idf=1,
                          token_pattern=r'\w{1,}', min_df=3)
    return tfv.fit(corpus)


def _sentence_cosines(q_vec, sent_tfidf, svd):
    """Cosine similarity between one query vector and every sentence of a row.

    `sent_tfidf` is the TF-IDF matrix of the row's sentences, or None when the
    row has no sentences, in which case a single 0 is returned.
    """
    if sent_tfidf is None:
        return np.array([0])
    sent_vecs = svd.transform(sent_tfidf)
    n_sent = sent_vecs.shape[0]
    # Repeat the query vector so that each sentence row is paired with it.
    q_rows = np.repeat(q_vec, n_sent).reshape((n_sent, q_vec.shape[0]), order='F')
    return cos_sim(q_rows, sent_vecs)


q_texts = df['qcnss'].values.tolist()
t_texts = df['tcnss'].values.tolist()
d_texts = df['dcnss'].values.tolist()
d_sentences = df['dcsnss'].values
q_nonempty = [t for t in q_texts if len(t) > 0]

# tfidf_svd_models[ngram_range_max][svd_n_components]['qt' | 'qd'] = (tfv, svd)
# NOTE(review): TruncatedSVD is created without random_state, so the LSA
# features may differ between runs -- confirm whether a fixed seed is wanted.
tfidf_svd_models = {}
for ngram_range_max in ngram_range_max_grid:
    tfidf_svd_models[ngram_range_max] = {}

    # The TF-IDF models and all TF-IDF matrices depend only on the n-gram
    # range, so they are built once here and reused for every SVD size. The
    # (tfv, svd) tuples stored for one n-gram range therefore share the same
    # vectorizer object.
    qt_corpus = q_nonempty + [t for t in t_texts if len(t) > 0]
    tfv_qt = _fit_tfidf(qt_corpus, ngram_range_max)
    qt_corpus_tfidf = tfv_qt.transform(qt_corpus)
    q_tfidf_qt = tfv_qt.transform(q_texts)
    t_tfidf_qt = tfv_qt.transform(t_texts)

    qd_corpus = q_nonempty + [t for t in d_texts if len(t) > 0]
    tfv_qd = _fit_tfidf(qd_corpus, ngram_range_max)
    qd_corpus_tfidf = tfv_qd.transform(qd_corpus)
    q_tfidf_qd = tfv_qd.transform(q_texts)
    d_tfidf_qd = tfv_qd.transform(d_texts)
    # One TF-IDF matrix per row (one matrix row per description sentence).
    sent_tfidf_qd = [tfv_qd.transform(sentences) if len(sentences) > 0 else None
                     for sentences in d_sentences]

    for svd_n_components in svd_n_components_grid:
        tfidf_svd_models[ngram_range_max][svd_n_components] = {}
        print('%i - %i' % (ngram_range_max, svd_n_components))

        # query vs. title
        svd = TruncatedSVD(n_components=svd_n_components)
        svd = svd.fit(qt_corpus_tfidf)
        tfidf_svd_models[ngram_range_max][svd_n_components]['qt'] = (tfv_qt, svd)
        q_nss_tfidf_svd = svd.transform(q_tfidf_qt)
        t_nss_tfidf_svd = svd.transform(t_tfidf_qt)

        # cos_sim already maps NaN to 0, so no fillna is needed afterwards.
        df['cos_qt_cnss_tfidf%i_svd%i' % (ngram_range_max, svd_n_components)] = \
            cos_sim(q_nss_tfidf_svd, t_nss_tfidf_svd)

        # query vs. description
        svd = TruncatedSVD(n_components=svd_n_components)
        svd = svd.fit(qd_corpus_tfidf)
        tfidf_svd_models[ngram_range_max][svd_n_components]['qd'] = (tfv_qd, svd)
        q_nss_tfidf_svd = svd.transform(q_tfidf_qd)
        d_nss_tfidf_svd = svd.transform(d_tfidf_qd)

        df['cos_qd_cnss_tfidf%i_svd%i' % (ngram_range_max, svd_n_components)] = \
            cos_sim(q_nss_tfidf_svd, d_nss_tfidf_svd)

        # query vs. each description sentence (same 'qd' model), summarised per row
        ds_nss_tfidf_svd_cos = [
            _sentence_cosines(q_nss_tfidf_svd[i, :], sent_tfidf, svd)
            for i, sent_tfidf in enumerate(sent_tfidf_qd)]

        for col_template, stat_fn in (('cos_min_qd_cnss_tfidf%i_svd%i', np.min),
                                      ('cos_max_qd_cnss_tfidf%i_svd%i', np.max),
                                      ('cos_mean_qd_cnss_tfidf%i_svd%i', np.mean),
                                      ('cos_median_qd_cnss_tfidf%i_svd%i', np.median)):
            df[col_template % (ngram_range_max, svd_n_components)] = \
                [stat_fn(v) for v in ds_nss_tfidf_svd_cos]


# Okapi BM25

class OkapiBM25(object):
    """Okapi BM25 scoring of a whitespace-tokenised query against a document.

    `fit` learns the corpus statistics (number of documents, average document
    length, per-term document frequency); `score_query` sums the BM25 score of
    every query token for one document.

    NOTE(review): the vectorizer counts 1..max_n_gram grams, but scoring only
    looks up single whitespace tokens, so for max_n_gram > 1 the n-gram
    entries affect `avgdl` only -- confirm this is intended.
    NOTE(review): the vectorizer uses its own default tokenisation while
    scoring uses str.split(); tokens the vectorizer drops get document
    frequency 0 -- confirm against the CountVectorizer defaults.
    """

    cv = None            # count vectorizer used to collect corpus statistics
    N = 1                # number of documents seen by fit
    word_doc_freq = {}   # term -> number of documents containing the term
    k = 1.5              # BM25 term-frequency saturation parameter
    b = 0.75             # BM25 document-length normalisation parameter
    avgdl = 0            # average document length (in vectorizer terms)

    def __init__(self, max_n_gram=2, k=1.5, b=0.75):
        self.cv = CountVectorizer(analyzer='word', ngram_range=(1, max_n_gram), stop_words=None)
        self.k = k
        self.b = b
        # Per-instance dict so instances never share the class-level default.
        self.word_doc_freq = {}

    def fit(self, X):
        """Collect corpus statistics from the iterable of document strings `X`."""
        counts = self.cv.fit_transform(X)
        n_docs = counts.shape[0]
        self.N = n_docs
        self.avgdl = counts.sum() / float(n_docs) if n_docs else 0.0

        # Document frequency = number of documents in which the term occurs,
        # i.e. the number of non-zero entries per column. Summing the counts
        # instead gives the total number of occurrences, which can exceed N
        # and makes the IDF the log of a negative number.
        by_column = counts.tocsc()
        by_column.eliminate_zeros()
        doc_freq = np.diff(by_column.indptr)

        # Vocabulary terms ordered by their column index.
        terms = sorted(self.cv.vocabulary_, key=self.cv.vocabulary_.get)
        self.word_doc_freq = dict(zip(terms, doc_freq.tolist()))

    def idf(self, w):
        """BM25 inverse document frequency of term `w` (unseen terms have n = 0)."""
        n = self.word_doc_freq.get(w, 0)
        return np.log((self.N - n + 0.5) / (n + 0.5))

    def f(self, w, d):
        """Number of occurrences of token `w` in the whitespace-split document `d`."""
        return d.split().count(w)

    def score_word(self, w, d):
        """BM25 contribution of the single query token `w` for document `d`."""
        fqd = self.f(w, d)
        if fqd == 0:
            # A term absent from the document contributes nothing.
            return 0.0
        doc_len = len(d.split())
        # Without a fitted average length, skip the length normalisation
        # instead of dividing by zero.
        if self.avgdl:
            length_norm = self.b * doc_len / float(self.avgdl)
        else:
            length_norm = 0.0
        denominator = fqd + self.k * (1 - self.b + length_norm)
        return self.idf(w) * (fqd * (self.k + 1)) / denominator

    def score_query(self, q, d):
        """Sum of the BM25 scores of every whitespace token of `q` for `d`."""
        return sum(self.score_word(w, d) for w in q.split())

ngram_range_max_grid = range(1, 4)
for ngram_range_max in ngram_range_max_grid:
    print(ngram_range_max)

    # query vs. title, statistics fitted on the titles
    obm25 = OkapiBM25(max_n_gram=ngram_range_max)
    obm25.fit(df['tcnss'].values)
    df['obm25_%i_qt_cnss' % ngram_range_max] = [
        obm25.score_query(query, title)
        for query, title in zip(df['qcnss'].values, df['tcnss'].values)]

    # query vs. description, statistics fitted on the descriptions
    obm25 = OkapiBM25(max_n_gram=ngram_range_max)
    obm25.fit(df['dcnss'].values)
    df['obm25_%i_qd_cnss' % ngram_range_max] = [
        obm25.score_query(query, description)
        for query, description in zip(df['qcnss'].values, df['dcnss'].values)]

    # query vs. each description sentence, scored with the description model
    sent_scores = [
        np.array([obm25.score_query(query, sentence) for sentence in sentences])
        for query, sentences in zip(df['qcnss'].values, df['dcsnss'].values)]

    # Per-row summaries of the sentence scores; 0 when a row has no sentences.
    for col_template, stat_fn in (('obm25_%i_min_qd_cnss', np.min),
                                  ('obm25_%i_max_qd_cnss', np.max),
                                  ('obm25_%i_mean_qd_cnss', np.mean),
                                  ('obm25_%i_median_qd_cnss', np.median)):
        df[col_template % ngram_range_max] = [
            stat_fn(v) if len(v) > 0 else 0 for v in sent_scores]


# word2vec similarities
# Load one Word2Vec model per entry of the model directory, keyed by file name.
# NOTE(review): every directory entry is passed to Word2Vec.load -- confirm the
# directory contains model files only.
path = './../data/w2v_models/'
w2v_models = dict(
    (name, Word2Vec.load(os.path.join(path, name)))
    for name in os.listdir(path))

# w2v: cos similarities between mean vectors

def _mean_w2v_vector(text, w2v):
    """Mean word vector of the whitespace tokens of `text`.

    Tokens missing from the model contribute a zero vector. The vector size is
    taken from the model's vector for the word 'foot'. For a text without
    tokens the division by zero yields a NaN vector.
    """
    zero = np.zeros(w2v['foot'].shape)
    tokens = text.split()
    total = zero
    for token in tokens:
        total = total + (w2v[token] if token in w2v else zero)
    return total / len(tokens)


for w2v_model_key in w2v_models.keys():
    print(w2v_model_key)

    w2v = w2v_models[w2v_model_key]
    model_name = w2v_model_key.replace('.bin', '')

    Q_mean = np.array([_mean_w2v_vector(s, w2v) for s in df['qcnss'].values])
    T_mean = np.array([_mean_w2v_vector(s, w2v) for s in df['tcnss'].values])
    # (A mean vector over the whole description was computed here at one
    # point but is disabled; descriptions are compared sentence by sentence.)
    D_list_mean = [
        np.array([_mean_w2v_vector(s, w2v) for s in sentences])
        for sentences in df['dcsnss'].values]

    df['cos_qt_cnss_%s' % model_name] = cos_sim(Q_mean, T_mean)

    # Cosine between the query mean vector and each sentence mean vector.
    ds_nss_w2v_cos = []
    for i, sentences in enumerate(df['dcsnss'].values):
        n_sent = len(sentences)
        if n_sent == 0:
            ds_nss_w2v_cos.append(np.array([0]))
            continue
        # Repeat the query vector once per sentence (one row each).
        tmp_q = np.repeat(Q_mean[i, :], n_sent).reshape(
            (n_sent, Q_mean.shape[1]), order='F')
        tmp_d = D_list_mean[i]
        ds_nss_w2v_cos.append(cos_sim(tmp_q, tmp_d))

    for col_template, stat_fn in (('cos_min_qd_cnss_%s', np.min),
                                  ('cos_max_qd_cnss_%s', np.max),
                                  ('cos_mean_qd_cnss_%s', np.mean),
                                  ('cos_median_qd_cnss_%s', np.median)):
        df[col_template % model_name] = [stat_fn(v) for v in ds_nss_w2v_cos]


# w2v: min mean cos

def _w2v_word_matrix(text, w2v):
    """Stack the word vectors of the whitespace tokens of `text`, one row each.

    Tokens missing from the model become zero rows (the vector size is taken
    from the model's vector for 'foot'). A text without tokens gives an
    empty array.
    """
    zero = np.zeros(w2v['foot'].shape)
    return np.array([w2v[w] if w in w2v else zero for w in text.split()])


def _min_mean_cos_distance(left_words, right_words):
    """Mean over left words of the smallest cosine distance to any right word.

    Returns 0 when either side has no words. The same fallback is applied to
    both sides and to both features, so an empty query or title no longer
    reaches pairwise_distances.
    """
    if np.prod(left_words.shape) == 0 or np.prod(right_words.shape) == 0:
        return 0
    distances = pairwise_distances(left_words, right_words, metric='cosine')
    return distances.min(axis=1).mean()


for w2v_model_key in w2v_models.keys():
    print(w2v_model_key)

    w2v = w2v_models[w2v_model_key]
    model_name = w2v_model_key.replace('.bin', '')

    Q_list_words = [_w2v_word_matrix(s, w2v) for s in df['qcnss'].values]
    T_list_words = [_w2v_word_matrix(s, w2v) for s in df['tcnss'].values]
    D_list_words = [_w2v_word_matrix(s, w2v) for s in df['dcnss'].values]

    df['min_mean_cos_qt_cnss_%s' % model_name] = [
        _min_mean_cos_distance(q_words, t_words)
        for q_words, t_words in zip(Q_list_words, T_list_words)]

    df['min_mean_cos_qd_cnss_%s' % model_name] = [
        _min_mean_cos_distance(q_words, d_words)
        for q_words, d_words in zip(Q_list_words, D_list_words)]
        

# w2v: max mean sim

def sent_sim_matrix(s1, s2, w2v):
    """Word-by-word similarity matrix between two whitespace-tokenised texts.

    Entry (i, j) is w2v.similarity(word i of s1, word j of s2), or 0 when
    either word is missing from the model. Returns the integer 0 instead of a
    matrix when either text has no tokens.
    """
    left_words = s1.split()
    right_words = s2.split()
    if not left_words or not right_words:
        return 0
    m = np.zeros((len(left_words), len(right_words)))
    for i, left in enumerate(left_words):
        if left not in w2v:
            # the whole row stays 0
            continue
        for j, right in enumerate(right_words):
            if right in w2v:
                m[i, j] = w2v.similarity(left, right)
    return m

def _max_mean_sim(m):
    """Mean over rows of the row-wise maximum of a similarity matrix.

    sent_sim_matrix returns the integer 0 instead of a matrix when either
    text is empty; that case is scored as 0.
    """
    if not isinstance(m, np.ndarray):
        return 0
    return m.max(axis=1).mean()


for w2v_model_key in w2v_models.keys():
    print(w2v_model_key)

    w2v = w2v_models[w2v_model_key]
    model_name = w2v_model_key.replace('.bin', '')

    # query vs. title
    df['max_mean_sim_qt_cnss_%s' % model_name] = [
        _max_mean_sim(sent_sim_matrix(query, title, w2v))
        for query, title in zip(df['qcnss'].values, df['tcnss'].values)]

    # query vs. description: stored under its own 'qd' column so it does not
    # overwrite the query/title feature above.
    df['max_mean_sim_qd_cnss_%s' % model_name] = [
        _max_mean_sim(sent_sim_matrix(query, description, w2v))
        for query, description in zip(df['qcnss'].values, df['dcnss'].values)]