In [1]:
from ipytables import *

In [2]:
import pylab as pl
%matplotlib inline
pl.rcParams['figure.figsize'] = 10, 5


In [3]:
import itertools
import os.path
import csv
import numpy as np
import scipy.spatial
import pandas as pd
pd.options.display.max_rows=20
pd.options.display.max_seq_items = 100
pd.options.display.max_colwidth = 100

In [4]:
#try:
#os.chdir("prepared_corpora/opinosis/")

#os.chdir("msrp")
os.chdir("prepared_corpora/msrp/")
#except FileNotFoundError:
#    pass

In [5]:
def load_pf_group_index():
    df = pd.read_csv("./phrase_groups.csv")
    df.set_index(["paraphrase_group_index", "phrase_index"], inplace=True)
    return df
    

In [6]:
df = load_pf_group_index()
df["tokenized_phrases"]=[line.strip() for line in open("tokenized_phrases.txt","r")]


In [7]:
df["pvdm"]=list(np.loadtxt("outVectors_wiki_sentence_concat_pvdm.csv", delimiter=","))
df["dbow"]=list(np.loadtxt("outVectors_wiki_sentence_dbow.csv", delimiter=","))
df["urae"]=list(np.loadtxt("outVectors_RAE2011.csv", delimiter=","))
df["mowe"]=list(np.loadtxt("outVectors_mowe.csv", delimiter=","))
#df["sowe"]=list(np.loadtxt("outVectors_sowe.csv", delimiter=","))
#df["howe"]=list(np.loadtxt("outVectors_howe.csv", delimiter=","))
df["bow"]=list(np.loadtxt("outVectors_bow.csv", delimiter=","))


In [8]:
#Just consider paraphrase groups of a certain size
df = df.groupby(level=0).filter(lambda group: len(group)>=3)

In [9]:
def get_true_classes(series):
    return np.asarray([ii[0] for ii in series.index])

In [10]:
from sklearn.decomposition import PCA
#df["pca_bow"] = list(PCA(300, whiten=False).fit_transform(np.row_stack(list(df.bow))))
#df["pca_howe"] = list(PCA(300, whiten=False).fit_transform(np.row_stack(list(df.howe))))

In [None]:
for base in "pvdm dbow urae".split():
    df[base+"_mowe"] = df[base].combine(df.mowe, func=lambda a,b: np.concatenate((a,b)))
#    df["pca_"+base+"_bow"] = list(PCA(300, whiten=False).fit_transform(np.row_stack(df[base+"_bow"])))

In [None]:
for base in "pvdm dbow urae".split():
    df[base+"_bow"] = df[base].combine(df.bow, func=lambda a,b: np.concatenate((a,b)))
#    df["pca_"+base+"_bow"] = list(PCA(300, whiten=False).fit_transform(np.row_stack(df[base+"_bow"])))


In [None]:
#Very Low Dimentionality:
for base in "pvdm dbow urae mowe bow".split():
    df[base] = list(PCA(32, whiten=False).fit_transform(np.row_stack(df[base])))


In [None]:
ax=df.tokenized_phrases.groupby(level=0).agg(len).hist(bins=35)

pl.title("Distribution of Paraphrase Groups in Subcorpus")
pl.xlabel("Number of Paraphrases")
pl.ylabel("Number of Groups")

#Wei's method

0. Normalise all embeddings for each model
1. within each paraphrase group, for each model, work out average pair-wise cosine similarity (or distance)
  1. the ground truth in this case should be horizontal x axis, which means the best values that each model gets should be close to zero (if you use similarity, rather than distance). So the model with an overall trend of low similarity values capture the semantics better.
2. treat each model's average similarity/distance for the paraphrase groups as a time series, do a z-normalisation () the time (horizontal) axis is the paraphrase group index, and the y axis is the similarity value after the z-normalisation
3. measure the similarity between the four model time series, pair-wise using Euclidean distance to see which model agrees with each other. 


In [None]:
import scipy.spatial.distance as distance

def angular_dist(a,b):
    cos_sim = np.dot(a,b)/(np.linalg.norm(a)*np.linalg.norm(b))
    return 1- np.arccos(cos_sim)/np.pi


In [None]:
def get_std(series):
    xs=np.row_stack(series)
    std_dev = xs.std(axis=0)
    std_dev[std_dev==0.0]=1.0
    return list(std_dev)
def get_mean(series):
    xs = np.row_stack(series)
    return list(xs.mean(axis=0))

dfs = (df.iloc[:,1:]-df.iloc[:,1:].apply(get_mean))/df.iloc[:,1:].apply(get_std)



In [None]:
def get_group_sim(group):
    xs = np.row_stack(group)
    #cosine_sim = (1-distance.pdist(xs, 'cosine'))
    dist = distance.pdist(xs,angular_dist)
    return dist.mean()

group_sims = dfs.groupby(level=0).agg(get_group_sim)
group_sims.hist(bins=np.linspace(0,1,50), sharex=True, sharey=True)
group_sims.describe()

In [None]:
import matplotlib.cm as cm
def plot_matrix(cm, labels, cmap=cm.Blues):
    pl.imshow(cm, interpolation='nearest', cmap=cmap)
    pl.colorbar()
    tick_marks = np.arange(len(labels))
    pl.xticks(tick_marks, labels, rotation=90)
    pl.yticks(tick_marks, labels)
    pl.tight_layout()
    

In [None]:

def get_interrater_e_distance(ratings):
    rating_dist = distance.pdist(n_group_sims.as_matrix().T)
    return distance.squareform(rating_dist)

def z_norm(col):
    return (col-col.mean())/col.std()

n_group_sims = group_sims.apply(z_norm)


interrater_e_distance = get_interrater_e_distance(n_group_sims)
print(pd.DataFrame( interrater_e_distance, columns =n_group_sims.columns, index=n_group_sims.columns))
      
plot_matrix(interrater_e_distance, n_group_sims.columns)

In [None]:
def get_centroid(group):
    xs = np.row_stack(group)
    return list(xs.mean(axis=0)) #Euclidean



def get_medoid(group):
    points=np.row_stack(group)
    ingroup_index = distance.cdist(points,points, 'cosine').sum(axis=1).argmin()
    return list(points[ingroup_index,:])


def get_angular_centroid(group):
    # the angular centroid is a line, here we choes an arbitairy point on it.
    # Dicussion of how normalising and adding wroks is at http://math.stackexchange.com/a/925659/1505
    points = np.row_stack(group)
    unit_points = points/np.linalg.norm(points,axis=1)[:,None]  #Normalise ot unit vectors
    return list(np.sum(unit_points,axis=0))
    

def get_group_medoids(series):
    return series.groupby(level=0).agg(get_medoid)


def get_group_centroids(series):
    return series.groupby(level=0).agg(get_centroid)


In [None]:
def dunn_index(series):
    group_intra_distance = series.groupby(level=0).agg(get_group_sim)
    denom = group_intra_distance.max()
    group_ids = np.unique(series.index[0])
    
    for ii in group_ids:
        smallest_inter_dist = np.Inf
        for jj in group_ids:
            if ii==jj:
                continue
            #Now to find the Interclass distance
            #Lets
            Not WRITEN YET

    
    return denom

dunn_index(df.urae)

In [None]:

#Davies-Bouldin Index 
#Modified to use angular centroid, and angular distance

def davied_bouldin_index(series):

    def av_dist_to_centroid(group):
        xs = np.row_stack(group)
        c = get_angular_centroid(group)
        return distance.cdist(xs,[c], angular_dist).mean()

    group_centroids = get_group_centroids(series)
    group_spreads = series.groupby(level=0).agg(av_dist_to_centroid)
    
    n_groups = len(group_spreads)
    
    total = 0.0
    for ii in range(n_groups):
        greatest = -1*np.Inf
        for jj in range(n_groups):
            if ii==jj:
                continue

            numer = group_spreads.iloc[ii] + group_spreads.iloc[jj] 
            denom = angular_dist(group_centroids.iloc[ii],group_centroids.iloc[jj])
            value = numer/denom
            greatest = max(greatest, value)
        total+= greatest
    return total/n_groups

            
for column in df.columns[1:]:
    print(column + " " + str(davied_bouldin_index(df.loc[:,column])))

In [None]:
def get_classes_about_center(series, center_fun):
    group_centers = np.row_stack(series.groupby(level=0).agg(center_fun)) 
    points  = np.row_stack(series)
    class_indexes = distance.cdist(points,group_centers, 'cosine').argmin(axis=1)
    return np.asarray(series.index[class_indexes].labels[0])

def get_classes_by_similarity_to_true_classes(series):
    true_classes = get_true_classes(series)
    
    def av_dist_to_group(point, group):
        xs = np.row_stack(group)
        return distance.cdist(xs, [point], 'cosine').mean()
    
    return np.asarray([series.groupby(level=0).agg(lambda group: av_dist_to_group(point, group)).idxmin()
               for point in series])
    



def recluster_success(series):
    true_classes = get_true_classes(series)

    def get_acc(center_fun):
        classes = get_classes_about_center(series, center_fun)
        return (classes == true_classes).sum()/len(true_classes)
    
    classes = get_classes_by_similarity_to_true_classes(series)
    group_closeness_acc = (classes == true_classes).sum()/len(true_classes)

    return pd.Series([get_acc(get_centroid),
                      get_acc(get_medoid),
                      get_acc(get_angular_centroid),
                      group_closeness_acc
                     ], index="centroid_acc medoid_acc angular_centroid_acc group_closeness_acc".split()) 
    

recluster_results = pd.DataFrame()
for column in df.columns[1:]:
    recluster_results[column] = recluster_success(df[column])

recluster_results

In [None]:
#Lets take a look at what mistakes are being made.
#We can look at the exemplary member of each class as a repressentitive,
#And can compare that for the class it was given to that for the class it should have been given

def get_exemplars(df, col_name, get_center_fun):
    series = df[col_name]
    def get_exemplar(group):
        c = get_center_fun(group)
        points=np.row_stack(group)
        ingroup_index = distance.cdist(points,[c], 'cosine').flatten().argmin()
        df_index = group.index[ingroup_index]
        return df.tokenized_phrases[df_index]
    
    return series.groupby(level=0).agg(get_exemplar)


def get_mistakes(df, col_name, get_center_fun=get_angular_centroid):
    group_exemplars = get_exemplars(df, col_name,get_center_fun)
    
    series = df[col_name]
    
    classes = get_classes(series, get_center_fun)
    
    return show_mistakes(df, classes, col_name, get_center_fun)


def show_mistakes(df, actual_classes, col_for_exemplar="bow", get_center_fun = get_angular_centroid):
    group_exemplars = get_exemplars(df, col_for_exemplar, get_center_fun)
    
    true_classes = get_true_classes(df)
    mistake_indexs = np.nonzero(actual_classes!=true_classes)
    
    ret = pd.DataFrame()
    ret["mistaken_texts"] = list(df.tokenized_phrases.iloc[mistake_indexs])
    ret["true_exemplar"] = list(group_exemplars[true_classes[mistake_indexs]])
    ret["actual_exemplar"] = list(group_exemplars[actual_classes[mistake_indexs]])
    
    return ret


In [None]:
#show_mistakes(df, get_classes_by_similarity_to_true_classes(df.bow))

In [None]:
#get_mistakes(df, "bow",get_angular_centroid)

In [11]:
from sklearn.svm import NuSVC, SVC, LinearSVC
from sklearn import tree
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier

#class_weight = {key: 1/value for (key, value) in Counter(training_data.debate_type).items()}

def classify(column, classifier, train_data, test_data, train_labels, test_labels):
    train_data_X = np.row_stack(train_data[column])
    test_data_X = np.row_stack(test_data[column])

    classifier.fit(train_data_X, train_labels)
    
    return classifier.score(test_data_X, test_labels)


classifiers = [LinearSVC(class_weight="auto",  max_iter=20000, C=0.1),
               LinearSVC(class_weight="auto",  max_iter=20000), 
               LinearSVC(class_weight="auto",  max_iter=20000, C=5),
               LinearSVC(class_weight="auto",  max_iter=20000, C=10),
               LinearSVC(class_weight="auto",  max_iter=20000, C=100),
               LinearSVC(class_weight="auto",  max_iter=20000, C=1000),
               LinearSVC(class_weight="auto",  max_iter=20000, C=10000)
       #SVC(class_weight="auto",  max_iter=20000, kernel='rbf', C=10),
       #SVC(class_weight="auto",  max_iter=20000, kernel='rbf', C=100),        
       #SVC(class_weight="auto",  max_iter=20000, kernel='rbf', C=1000),
       #SVC(class_weight="auto",  max_iter=20000, kernel='rbf', C=10000),
       #SVC(class_weight="auto",  max_iter=20000, kernel='rbf', C=100000),
       #SVC(class_weight="auto",  max_iter=20000, kernel='rbf', C=1000000),
        #OneVsRestClassifier(SVC(class_weight="auto",  max_iter=20000, kernel='rbf')),
        #tree.DecisionTreeClassifier(class_weight="auto")
        
        
       ]


In [12]:
from sklearn import cross_validation
n_folds = df.tokenized_phrases.groupby(level=0).agg(len).min()
print(str(n_folds)+"-fold validation")
classify_results = pd.DataFrame(index=classifiers)
for column in df.columns[1:]:
    for classifier in classifiers:
        cv_res = cross_validation.cross_val_score(classifier, 
                                          X = np.row_stack(df[column]),
                                          y = get_true_classes(df),
                                          cv=df[column].groupby(level=0).agg(len).min(),
                                          n_jobs=-1
                                        
                                          )
        classify_results.loc[classifier,(column)] = cv_res.mean()
        #classify_results.loc[svm,(column+"_std")] = cv_res.std()

classify_results

3-fold validation


Unnamed: 0,pvdm,dbow,urae,mowe,bow
"LinearSVC(C=0.1, class_weight='auto', dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=20000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0)",0.325217,0.880221,0.440641,0.962935,0.983719
"LinearSVC(C=1.0, class_weight='auto', dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=20000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0)",0.603837,0.899325,0.500057,0.974429,0.983719
"LinearSVC(C=5, class_weight='auto', dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=20000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0)",0.738386,0.892589,0.511377,0.977918,0.983719
"LinearSVC(C=10, class_weight='auto', dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=20000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0)",0.774329,0.88787,0.508836,0.979134,0.983719
"LinearSVC(C=100, class_weight='auto', dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=20000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0)",0.779969,0.88545,0.495991,0.977918,0.983719
"LinearSVC(C=1000, class_weight='auto', dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=20000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0)",0.74154,0.88545,0.487757,0.977918,0.983719
"LinearSVC(C=10000, class_weight='auto', dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=20000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0)",0.734536,0.884247,0.487757,0.977918,0.983719


In [13]:
#The corr
from copy import deepcopy
from sklearn.cross_validation import StratifiedKFold
X = np.row_stack(df["bow"])
y = get_true_classes(df)
n_folds = df.tokenized_phrases.groupby(level=0).agg(len).min()

pca_bow_res = pd.DataFrame()
for ii_fold, (test_indexes, train_indexes) in enumerate(StratifiedKFold( y, n_folds)):
    X_train = X[train_indexes]
    y_train = y[train_indexes]
    X_test = X[test_indexes]
    y_test = y[test_indexes]
    
    pca = PCA(300)
    X_train = pca.fit_transform(X_train)
    X_test = pca.transform(X_test)
    
    
    for classifier_index in classifiers:
        classifier = deepcopy(classifier_index)
        classifier.fit(X_train, y_train)
        pca_bow_res.loc[classifier_index,(ii_fold)] = classifier.score(X_test, y_test)
    
    
pca_bow_res



Unnamed: 0,0,1,2
"LinearSVC(C=0.1, class_weight='auto', dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=20000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0)",0.978221,0.975945,0.984615
"LinearSVC(C=1.0, class_weight='auto', dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=20000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0)",0.976407,0.975945,0.982906
"LinearSVC(C=5, class_weight='auto', dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=20000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0)",0.976407,0.975945,0.982906
"LinearSVC(C=10, class_weight='auto', dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=20000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0)",0.976407,0.975945,0.982906
"LinearSVC(C=100, class_weight='auto', dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=20000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0)",0.976407,0.975945,0.982906
"LinearSVC(C=1000, class_weight='auto', dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=20000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0)",0.976407,0.975945,0.982906
"LinearSVC(C=10000, class_weight='auto', dual=True, fit_intercept=True,\n intercept_scaling=1, loss='squared_hinge', max_iter=20000,\n multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,\n verbose=0)",0.976407,0.975945,0.982906


In [14]:
classify_results["pca_bow"] = pca_bow_res.mean(axis=1)

pd.options.display.float_format = lambda x: '{:.2f}%'.format(x*100)
classify_results.max()

pvdm      78.00%
dbow      89.93%
urae      51.14%
mowe      97.91%
bow       98.37%
pca_bow   97.96%
dtype: float64

In [15]:
from sklearn.cross_validation import StratifiedShuffleSplit

best_classifiers = classify_results.idxmax().to_dict()
actual_classes=dict()

y=get_true_classes(df)
train_indexes, test_indexes = list(*StratifiedShuffleSplit(y,1,test_size=1/n_folds))
y_train = y[train_indexes]
y_test = y[test_indexes]

for model in best_classifiers.keys():
    X = np.row_stack(df[model])
    X_train = X[train_indexes]
    X_test = X[test_indexes]
    
    if model == "pca_bow":
        pca = PCA(300)
        X_train = pca.fit_transform(X_train)
        X_test = pca.transform(X_test)
    
    classifier = best_classifiers[model]
    classifier.fit(X_train,y_train)
    actual_classes[model] = classifier.predict(X_test)
actual_classes["ground_truth"] = y_test

KeyError: 'pca_bow'

In [None]:
import sklearn
agreement = pd.DataFrame(columns=actual_classes.keys(), index=actual_classes.keys())
for model1 in actual_classes.keys():
    for model2 in actual_classes.keys():
        agreement.loc[model1,model2] = sklearn.metrics.accuracy_score(actual_classes[model1],actual_classes[model2])
plot_matrix(np.asarray( agreement.values, dtype=np.float32), agreement.columns)

agreement



In [None]:
actual_classes2 = deepcopy(actual_classes)
del actual_classes2['ground_truth']

mistake_agreement = pd.DataFrame(columns=actual_classes2.keys(), index=actual_classes2.keys())


for model1 in actual_classes2.keys():
    for model2 in actual_classes2.keys():
        classes1=actual_classes2[model1]
        classes2=actual_classes2[model2]
        
        mistake_indexes = np.logical_and(classes1!=y_test, classes2!=y_test)
        
        
        mistake_agreement.loc[model1,model2] = sklearn.metrics.jaccard_similarity_score(classes1[mistake_indexes],classes2[mistake_indexes])
plot_matrix(np.asarray( mistake_agreement.values, dtype=np.float32), mistake_agreement.columns)
mistake_agreement

        

In [None]:
difficulty_agreement = pd.DataFrame(columns=actual_classes2.keys(), index=actual_classes2.keys())

for model1 in actual_classes2.keys():
    for model2 in actual_classes2.keys():
        classes1=actual_classes2[model1]
        classes2=actual_classes2[model2]
        
        difficulty_agreement.loc[model1,model2] = sklearn.metrics.jaccard_similarity_score(classes1!=y_test,
                                                                                           classes2!=y_test)
        
plot_matrix(np.asarray( difficulty_agreement.values, dtype=np.float32), difficulty_agreement.columns)
difficulty_agreement

        

In [None]:
def get_symetry_measure(group):
    xs = np.row_stack(group)
    dist=distance.pdist(xs, angular_dist)
    c=get_angular_centroid(xs)
    dist = distance.cdist(xs, [c],angular_dist)
    return dist.std()

df.groupby(level=0).agg(get_symetry_measure).describe()
    

In [16]:
def portion_of_points_on_bounding_box(group):
    xs = np.row_stack(group)
    on_max_border = np.any(xs==xs.max(axis=0), axis=1)
    on_min_border = np.any(xs==xs.min(axis=0), axis=1)
    on_border = on_min_border + on_max_border
    return ( on_border.sum()/len(group))

def all_points_on_bounding_box(group):
    return portion_of_points_on_bounding_box(group)==1

In [17]:
def unit_norms(vectors):
    vecs = np.row_stack(vectors)
    unit_norms = np.linalg.norm(vecs,axis=1)
    return unit_norms#[list(row) for row in unit_vecs]

dfa = pd.DataFrame(index=df.index, columns="bow pvdm dbow mowe urae".split())
for col in dfa.columns:
    dfa.loc[:,col]=df.loc[:,col]/unit_norms(df.loc[:,col])

dfa


Unnamed: 0_level_0,Unnamed: 1_level_0,bow,pvdm,dbow,mowe,urae
paraphrase_group_index,phrase_index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,0,"[0.179605302027, 0.359210604054, 0.179605302027, 0.179605302027, 0.359210604054, 0.179605302027,...","[-0.0117090570115, -0.0587189545721, 0.0244417123033, -0.0267791108917, 0.0385295719216, 0.01680...","[-0.0729846946183, -0.0371926100352, 0.142803608845, 0.0368007142406, -0.0308977245611, 0.073924...","[0.0305474746721, 0.0311183031484, -0.0284627311194, 0.0130811284067, -0.0287902496268, -0.00041...","[0.0379956254845, 0.0790094720625, -0.00654024314252, 0.11341040047, 0.0620675305939, 0.07377024..."
0,1,"[0.0, 0.38490017946, 0.19245008973, 0.19245008973, 0.38490017946, 0.19245008973, 0.0, 0.19245008...","[0.0081831543312, -0.0775741948289, 0.0367460588892, -0.0611784850532, 0.00540528108917, 0.00754...","[0.00828551612384, -0.0276336841116, 0.122585907853, 0.0175703064057, 0.00649675533426, 0.035090...","[0.0138357716552, 0.0256786693164, -0.0329862232433, 0.000318545484942, 0.00968210618705, -0.009...","[-0.0497647871269, 0.214501634203, -0.079424998831, -0.0239483279593, -0.0367859435957, 0.142742..."
0,2,"[0.0, 0.208514414057, 0.208514414057, 0.208514414057, 0.208514414057, 0.208514414057, 0.0, 0.208...","[0.086398914955, 0.0155975029885, 0.0654881347286, -0.0919812137162, -0.0527465792094, -0.050030...","[-0.064749125469, -0.048569826136, 0.0724991165727, 0.00831948023937, -0.0386237513551, 0.096740...","[-0.00554730961616, 0.0259517988153, 0.000677861499887, 0.0230901936227, -0.022341542789, -0.036...","[-0.0583010504666, 0.0206015241569, -0.0756345218889, 0.261240525625, -0.0218611625333, 0.054668..."
1,3,"[0.0, 0.458831467741, 0.229415733871, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...","[-0.121062433155, -0.000899878411466, 0.0189303737617, -0.00279241559814, 0.0327467876893, 0.058...","[-0.032022571248, 0.0178693451484, -0.143285270861, -0.00640923969129, 0.0354890426553, -0.00152...","[0.0219278414138, -0.0461185426273, 0.0770783577792, 0.0138788734381, 0.0243902221851, -0.088196...","[0.059003722651, 0.00952455641027, 0.143333238249, -0.0420982081774, 0.0678742459352, -0.0231624..."
1,4,"[0.0, 0.648885684523, 0.162221421131, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...","[-0.0740762461443, 0.0224820292554, 0.023692226038, -0.0124384560245, 0.0267403049967, 0.0174603...","[-0.0032536445765, 0.0432460555735, -0.122386167617, 0.0347117495019, 0.0573146862673, 0.0516572...","[0.0179497332348, -0.00720630816923, 0.0293081296161, 0.0409595660203, 0.0389701957239, -0.07752...","[0.0421205602652, -0.118670187704, -0.0954316047985, 0.0679992624783, 0.00911875454205, 0.153276..."
1,5,"[0.0, 0.436435780472, 0.218217890236, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...","[-0.0945685676714, 0.0206731991033, -0.00577246427113, 0.0163400202958, 0.0634217515894, 0.02555...","[0.0743759788521, -0.0233479555975, -0.0217173619408, 0.00385161193873, 0.0466486585942, 0.10551...","[-0.0248494477954, -0.03852036406, 0.0493454977553, 0.0149217586032, 0.0444351278317, -0.0762582...","[-0.00720716787582, -0.112276734631, -0.0115876482772, 0.0547049594085, 0.0138767340211, 0.07095..."
2,6,"[0.0, 0.22360679775, 0.22360679775, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.223...","[0.175825715121, 0.0485970052697, -0.0387346283781, 0.0668004239572, -0.0231944394456, -0.102192...","[0.0071169634826, -0.0217367530261, -0.0170878568513, 0.0758748079597, 0.0330822371572, -0.01732...","[0.0943817966211, 0.0322850549455, 0.0702733653709, 0.0314974460724, 0.0372525179256, 0.01573835...","[-0.01098094329, 0.0466796989899, -0.0691152996234, -0.0285033326782, -0.121094726989, 0.1071012..."
2,7,"[0.0, 0.229415733871, 0.229415733871, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.2...","[0.0882271436886, 0.0947497045307, 0.0272477988427, 0.0571814763876, -0.0264408673123, -0.031708...","[0.0380425686153, 0.00529066363126, -0.0638539957007, 0.0605010727352, 0.0289706930328, -0.01123...","[0.113787529105, 0.0145045572005, 0.0624645048711, 0.0332112407402, 0.0425852148816, -0.01331475...","[-0.0243169430039, 0.0502254637008, -0.0454194901044, 0.0353389351557, -0.133422337968, 0.062285..."
2,8,"[0.0, 0.392232270276, 0.196116135138, 0.0, 0.196116135138, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0....","[0.140726604724, 0.0857410214174, -0.0219621090301, -0.0275808728381, -0.0170008813284, -0.13227...","[-0.0593241095074, -0.0226074911107, -0.0216179831732, 0.0713470231245, 0.0180162314564, -0.0975...","[0.105846787081, 0.0400474022933, 0.0878998873338, 0.0281827538889, 0.0435480679583, -0.01630923...","[-0.0999643805467, 0.0136198191044, -0.0449082726924, 0.0555407477567, -0.137593750972, 0.087099..."
3,9,"[0.0, 0.0, 0.179605302027, 0.0, 0.179605302027, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0...","[0.0559544523236, -0.0903036081366, 0.0403970254183, -0.0668524841981, -0.0492208148528, 0.06836...","[0.00663721835037, -0.0186035081668, 0.0682539919807, 0.0326185156584, 0.0636796112851, 0.103125...","[0.00263355318464, 0.0774220268361, 0.0260756625791, 0.073146860475, -0.0464785692009, -0.070716...","[0.0421931516751, -0.0121079501706, 0.00949231701355, 0.0575152798311, 0.171293773533, -0.013089..."


In [20]:
pd.options.display.float_format=None
df.groupby(level=0).agg(all_points_on_bounding_box).sum()/len(df.groupby(level=0))

pvdm    1
dbow    1
urae    1
mowe    1
bow     1
dtype: float64

274

In [28]:
pd.options.display.float_format = lambda x: '{:.2f}%'.format(x*100)
hollowness=pd.Series()
hollowness["pvdm"]=portion_of_points_on_bounding_box(df.pvdm)
hollowness["dbow"]=portion_of_points_on_bounding_box(df.dbow)
hollowness["urae"]=portion_of_points_on_bounding_box(df.urae)
hollowness["mowe"]=portion_of_points_on_bounding_box(df.mowe)
hollowness["bow"]=portion_of_points_on_bounding_box(df.bow)
pd.DataFrame(hollowness)

Unnamed: 0,0
pvdm,17.58%
dbow,33.76%
urae,18.16%
mowe,25.03%
bow,100.00%


In [30]:
521/len(df)

0.6065192083818394

# 

In [None]:
def portion_where_angular_centroid_closer_than_average(group):
    c = get_angular_centroid(group)
    xs = np.row_stack(group)
    distances = distance.pdist(xs, 'cosine')
    av_dist = distances.mean()
    c_dist = distance.cdist(xs, [c],'cosine')
    return (c_dist<av_dist).sum()/len(group)
    
df.groupby(level=0).agg(portion_where_angular_centroid_closer_than_average).hist()

In [None]:
group = [[1,0],[0,1]]
c = get_angular_centroid(group)
xs = np.row_stack(group)

In [None]:
distance.pdist(xs, angular_dist)

In [None]:
distance.cdist(xs, [c],'cosine')

In [None]:
len(df.bow[(1,2)])

In [None]:
dfv = df.groupby(level=0).filter(lambda x: len(x) >22) #20)
from sklearn.cross_validation import StratifiedShuffleSplit
true_classes = get_true_classes(dfv)
train_index, test_index = list(*StratifiedShuffleSplit(true_classes, 1, test_size=1/3))



In [None]:
xs = np.row_stack(dfv.mowe)
classifier = classifiers[1]
classifier.fit(xs[train_index,:], true_classes[train_index] )
classifier.score(xs[test_index,:], true_classes[test_index])

In [None]:

from sklearn.utils import safe_mask
ndims=2
importances = np.abs(classifier.coef_).sum(axis=0)
key_dims = np.argpartition(-importances,ndims)[0:ndims]
threshold = importances[key_dims[-1]]
raw_mask = importances>=threshold
key_dims

In [None]:
from matplotlib import colors as mpl_colors
import random
color_names = [
    'aqua','black','blue','brown','cadetblue','chartreuse','chocolate','coral','crimson','cyan','darkblue','darkcyan','darkgray','darkgreen','darkgrey','darkkhaki','darkolivegreen','darkorange','darkred','darksage','darksalmon','darkseagreen','darkslateblue','darkturquoise','darkviolet','deeppink','deepskyblue','dimgrey','dodgerblue','firebrick','forestgreen','fuchsia','gold','goldenrod','green','greenyellow','grey','hotpink','indigo','lawngreen','lime','limegreen','magenta','maroon','midnightblue','olive','orange','orangered','orchid','peru','pink','plum','purple','red','rosybrown','royalblue','salmon','sandybrown','sienna','silver', 'steelblue','tan','thistle','tomato','violet','wheat','yellow','yellowgreen',
    ]
random.shuffle(color_names)

lbls = true_classes

color_dict = dict(zip(np.unique(lbls), color_names))
colors = [color_dict[lbl] for lbl in lbls]
color_dict

In [None]:
from sklearn.manifold import t_sne
def plot_tsne(series):
    tsne = t_sne.TSNE()
    Ys = tsne.fit_transform(np.row_stack(series))
    
    pl.scatter(Ys[:,0],Ys[:,1], c=colors)
    
plot_tsne(dfv.bow)

In [None]:
w = classifier.coef_[0]
a = -w[key_dims[0]] / w[key_dims[1]]
xx = np.linspace(xs.min(), xs.max())
yy = a * xx - (classifier.intercept_[0])/w[key_dims[1]]

In [None]:
#pl.plot(xx, yy, 'k-')
pl.scatter(x=xs[:,key_dims[0]],y=xs[:,key_dims[1]], c=colors)

In [None]:
print(__doc__)
# Code source: Gaël Varoquaux
# Modified for documentation by Jaques Grobler
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as pl
from sklearn import svm

# we create 40 separable points
np.random.seed(0)
X=xs[:,[key_dims[0],key_dims[1]]]
Y=true_classes
# figure number
fignum = 1

# fit the model
for name, penalty in (('unreg', 1), ('reg', 0.05)):

    clf = svm.LinearSVC(C=penalty)
    clf.fit(X, Y)

    # get the separating hyperplane
    w = clf.coef_[0]
    a = -w[0] / w[1]
    xx = np.linspace(-5, 5)
    yy = a * xx - (clf.intercept_[0]) / w[1]

    # plot the parallels to the separating hyperplane that pass through the
    # support vectors
    margin = 1 / np.sqrt(np.sum(clf.coef_ ** 2))
    yy_down = yy + a * margin
    yy_up = yy - a * margin

    # plot the line, the points, and the nearest vectors to the plane
    pl.figure(fignum, figsize=(4, 3))
    pl.clf()
    pl.plot(xx, yy, 'k-')
    pl.plot(xx, yy_down, 'k--')
    pl.plot(xx, yy_up, 'k--')

    pl.scatter(X[:, 0], X[:, 1], c=Y, zorder=10, cmap=pl.cm.Paired)

    pl.axis('tight')
    x_min = X[:,0].min()-np.std(X[:,0])
    y_min = X[:,1].min()-np.std(X[:,1])
    x_max = X[:,0].max()+np.std(X[:,0])
    y_max = X[:,1].max()+np.std(X[:,1])
    
    
    XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]
    Z = clf.predict(np.c_[XX.ravel(), YY.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(XX.shape)
    pl.figure(fignum, figsize=(4, 3))
    pl.pcolormesh(XX, YY, Z, cmap=pl.cm.Paired)

    pl.xlim(x_min, x_max)
    pl.ylim(y_min, y_max)

    pl.xticks(())
    pl.yticks(())
    fignum = fignum + 1

pl.show()

In [None]:
!git commit -a -m="Corrected calculation of PCA fits"