# Cluster Features Extracted from CQQ Dataset

In [None]:
import os
import sys
from os.path import join
sys.path.append(os.path.dirname(os.getcwd()))
from preprocessing.preprocess import process_features_df

import numpy as np
import scipy as sc
import pandas as pd
import sklearn
from sklearn.decomposition import PCA 
from sklearn.manifold import TSNE
# import umap

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

from bokeh.plotting import figure, show, output_file, output_notebook, ColumnDataSource
from bokeh.models import PanTool, HoverTool, BoxSelectTool, ZoomInTool, ZoomOutTool, WheelZoomTool, SaveTool
from bokeh.io import reset_output
from bokeh.io.saving import save

from styles import *

In [None]:
# Output location for every artefact written by this notebook.
results_dir = '../results'

# Input location: the tab-separated feature table lives in the pilot
# experiment data directory.
experiment_data_directory = '../pilot_experiment_data/'
input_file_name = join(experiment_data_directory, 'dataset_features.tsv')

# Load the raw feature table (one row per dataset; contains a 'corpus' column
# that later cells group by).
features_df = pd.read_csv(
    input_file_name,
    sep='\t',
)

In [None]:
# Number of rows contributed by each corpus; the trailing bare expression
# makes the notebook display the result.
rows_per_corpus = features_df.groupby('corpus').size()
rows_per_corpus

In [None]:
# Options forwarded to the project preprocessing helper. The id fields are
# the identifier columns that are dropped again later, before PCA.
preprocessing_options = dict(
    id_fields=['corpus', 'dataset_id', 'dataset_id_with_combination_number'],
    winsor=99.999,
    scaler='standard',
    drop_duplicates=True,
    return_transforms=True,
    secondary_deduplication=None,
    verbose=False,
)

# With return_transforms=True the helper returns the processed frame together
# with four further values, unpacked here in the order it yields them.
preprocessing_outputs = process_features_df(features_df, **preprocessing_options)
(
    processed_df,
    fit_imputer,
    fit_scaler,
    included_cols,
    numeric_cols,
) = preprocessing_outputs

In [None]:
# Persist the processed (not yet balanced) feature table as TSV, without the
# index column.
processed_features_path = join(
    results_dir,
    'dataset_features_processed_winsor-99999_standard.tsv',
)
processed_df.to_csv(processed_features_path, sep='\t', index=False)

In [None]:
# Size of the smallest corpus: every corpus is down-sampled to this many rows
# so that all corpora are equally represented.
rows_per_corpus_processed = processed_df.groupby(['corpus']).size()
min_per_group = np.min(rows_per_corpus_processed)

# Draw `min_per_group` random rows from each corpus. The sampling is unseeded,
# so the selected rows differ between runs.
processed_df = (
    processed_df
    .groupby(['corpus'], as_index=False)
    .apply(lambda corpus_rows: corpus_rows.sample(n=min_per_group))
)

In [None]:
# Persist the balanced feature table as TSV, without the index column.
balanced_features_path = join(
    results_dir,
    'dataset_features_processed_winsor-99999_standard_balanced.tsv',
)
processed_df.to_csv(balanced_features_path, sep='\t', index=False)

In [None]:
# Keep the corpus label of every row before it is removed from the features.
processed_df_labels = processed_df[['corpus']]

# Remove the label and identifier columns in place so that only feature
# columns remain; columns that are absent are silently skipped.
non_feature_columns = [
    'corpus',
    'dataset_id',
    'dataset_id_with_combination_number',
    'fid',
]
processed_df.drop(columns=non_feature_columns, errors='ignore', inplace=True)

In [None]:
# Replace the index with a fresh 0..n-1 range and keep only the 'corpus'
# column, leaving a Series of labels.
labels_with_flat_index = processed_df_labels.reset_index()
processed_df_labels = labels_with_flat_index['corpus']

In [None]:
# Notebook cell output: display the Series of corpus labels.
processed_df_labels

# Dimension Reduction

In [None]:
# Fit a PCA that keeps as many components as needed to explain more than 85%
# of the variance (a fractional n_components asks scikit-learn to choose the
# count from the data).
pca_model = PCA(n_components=0.85).fit(processed_df)

# Total explained variance of the retained components, then the running total
# per component.
cumulative_explained_variance = pca_model.explained_variance_ratio_.cumsum()
print(sum(pca_model.explained_variance_ratio_))
print(cumulative_explained_variance)

# Number of retained components. The previous np.argmax(cumsum > 0.85)
# returned the zero-based *index* of the first component crossing the
# threshold, i.e. one less than the component count; the fitted attribute
# gives the count directly.
number_of_principal_components = pca_model.n_components_
print(number_of_principal_components)

In [None]:
# Project the features onto the principal components. `pca_model` has already
# been fitted on `processed_df`, so only `transform` is needed; calling
# `fit_transform` would repeat the whole decomposition on the same data.
pca_projection = pca_model.transform(processed_df)

# One column per principal component, labelled 0, 1, 2, ...
pca_df = pd.DataFrame(pca_projection)

In [None]:
def normalize(v):
    """Min-max scale *v* so its smallest value maps to 0 and its largest to 1.

    Args:
        v: Array-like of numbers (list, NumPy array, pandas object).

    Returns:
        ``(v - min(v)) / (max(v) - min(v))``, computed element-wise. When all
        values are equal the range is zero; in that case an all-zero result
        of the same shape is returned instead of dividing by zero (which
        would yield NaN).

    Raises:
        ValueError: If *v* is empty (raised by ``np.min``).
    """
    lowest = np.min(v)
    value_range = np.max(v) - lowest
    # Only guard the scalar case; a non-scalar range (e.g. per-column minima
    # of a DataFrame) keeps the plain element-wise division.
    if np.ndim(value_range) == 0 and value_range == 0:
        return (v - lowest) * 0.0
    return (v - lowest) / value_range

In [None]:
# `pca_df_to_plot` is another name for `pca_df` (no copy is made), so the two
# columns added below also appear on `pca_df`.
pca_df_to_plot = pca_df

# Expose the first two principal components under plotting-friendly names.
for axis_label, component_index in (('x', 0), ('y', 1)):
    pca_df_to_plot[axis_label] = pca_df_to_plot[component_index]

In [None]:
# Display name for each corpus identifier, used for legend labels and for the
# 'corpus' column of the exported t-SNE table.
formatted_names = dict(
    manyeyes='Many Eyes',
    plotly='Plotly',
    webtables='Web Tables',
    opendata='Open Data',
)

In [None]:
# Named hex colours used to build the per-corpus palette.
colors_dict = {
    'brown': '#9d755d',
    'pink': '#ff9da6',
    'green': '#54a24b',
    'blue': '#4c78a8',
    'orange': '#f58518',
    'red': '#e45756',
    'purple': '#b279a2',
    'teal': '#72b7b2'
}

# Colour of each corpus identifier. Defined before the loop because the loop
# body reads it when filling the 'color' column; previously it was assigned
# only further down inside the loop, so the first pass depended on a `colors`
# name already existing from elsewhere (or failed with NameError).
colors = {
    'kim_and_heer': colors_dict['teal'],
    'manyeyes': colors_dict['red'],
    'plotly': colors_dict['purple'],
    'viznet': colors_dict['blue'],
    'webtables': colors_dict['green'],
    'opendata': colors_dict['orange']
}

# Corpora drawn in the scatter plot, in plotting/legend order, and the colour
# for each of their display names.
corpuses = ['webtables', 'plotly', 'manyeyes', 'opendata']
formatted_name_colors = {formatted_names[c]: colors[c] for c in corpuses}

# Upper bound on the number of points drawn per corpus.
max_points_per_corpus = 1000

for perplexity in [75]:
    # Base name shared by every file written for this configuration.
    name = 'tsne_perp-{}_pca-{}_nclass-{}'.format(perplexity, number_of_principal_components, min_per_group)

    # Embed the PCA projection in two dimensions and label the axes x / y.
    t_sne = TSNE(n_components=2, perplexity=perplexity, verbose=1).fit_transform(pca_projection)
    t_sne_df = pd.DataFrame(data=t_sne).rename(columns={0: 'x', 1: 'y'})

    # Attach the corpus label of every row (both frames carry a 0..n-1 index),
    # then add its colour and replace the identifier by its display name.
    # Identifiers missing from the respective mapping become NaN.
    t_sne_df_with_datasets = pd.concat([t_sne_df, processed_df_labels], axis=1)
    t_sne_df_with_datasets['color'] = t_sne_df_with_datasets['corpus'].map(colors)
    t_sne_df_with_datasets['corpus'] = t_sne_df_with_datasets['corpus'].map(formatted_names)

    fig, ax = plt.subplots(figsize=(3.5, 3.5))

    for corpus in corpuses:
        # Work on a copy so that assigning the colour column does not write
        # into a slice of the full table.
        is_corpus = t_sne_df_with_datasets['corpus'] == formatted_names[corpus]
        df = t_sne_df_with_datasets[is_corpus].copy()
        df['color'] = df['corpus'].map(formatted_name_colors)

        # Thin out the points for plotting; cap at the available row count so
        # corpora with fewer rows than the limit do not raise ValueError.
        df = df.sample(min(max_points_per_corpus, len(df)))

        plt.scatter(
            x=df['x'],
            y=df['y'],
            s=1,
            c=df['color'],
            label=formatted_names[corpus]
        )

    # Legend anchored below the axes, three entries per row.
    ax.legend(
        ncol=3,
        loc=9,
        bbox_to_anchor=(0.45, -0.05),
        frameon=False
    )

    # Save the figure in raster and vector formats, then display it.
    plt.savefig(join(results_dir, '{}_checkpoint.png'.format(name)), format='png', pad_inches=0.0, bbox_inches='tight')
    plt.savefig(join(results_dir, '{}_checkpoint.svg'.format(name)), format='svg', pad_inches=0.0, bbox_inches='tight')
    plt.savefig(join(results_dir, '{}_checkpoint.pdf'.format(name)), format='pdf', pad_inches=0.0, bbox_inches='tight')
    plt.show()

    # Export the full (unsampled) embedding with labels and colours.
    t_sne_df_with_datasets.to_csv(join(results_dir, '{}.csv'.format(name)), index=False)

In [None]:
# Compact variant of the t-SNE scatter plot. Uses `name`,
# `t_sne_df_with_datasets`, `colors` and `formatted_names` from earlier cells.
corpuses = ['webtables', 'plotly', 'manyeyes', 'opendata']
formatted_name_colors = {formatted_names[c]: colors[c] for c in corpuses}

# Upper bound on the number of points drawn per corpus.
max_points_per_corpus = 1000

fig, ax = plt.subplots(figsize=(4, 4.25))
for corpus in corpuses:
    # Work on a copy so that assigning the colour column does not write into
    # a slice of the full table.
    is_corpus = t_sne_df_with_datasets['corpus'] == formatted_names[corpus]
    df = t_sne_df_with_datasets[is_corpus].copy()
    df['color'] = df['corpus'].map(formatted_name_colors)

    # Cap the sample size at the available row count so corpora with fewer
    # rows than the limit do not raise ValueError.
    df = df.sample(min(max_points_per_corpus, len(df)))

    plt.scatter(
        x=df['x'],
        y=df['y'],
        s=1,
        c=df['color'],
        label=formatted_names[corpus]
    )

# Build the legend once, after every corpus has been drawn, instead of
# recreating it on each loop iteration.
ax.legend(
    ncol=2,
    loc=9,
    bbox_to_anchor=(0.45, -0.10),
    frameon=False
)

# Save the figure in vector formats, then display it.
plt.savefig(join(results_dir, '{}_mini.svg'.format(name)), format='svg', pad_inches=0.0, bbox_inches='tight')
plt.savefig(join(results_dir, '{}_mini.pdf'.format(name)), format='pdf', pad_inches=0.0, bbox_inches='tight')
plt.show()
