In [143]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression

from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from scipy.sparse import hstack

import time

import matplotlib.pyplot as plt
import matplotlib.colors as colors   # allows use of gamma to tune cmaps

from mpl_toolkits.mplot3d import Axes3D
import matplotlib.gridspec as gridspec
%matplotlib inline

# import plotly.offline as py
# import plotly.graph_objs as go
# py.init_notebook_mode(connected=True)

# import holoviews as hv
# hv.notebook_extension('plotly')

# import plotly

# plotly.tools.set_credentials_file(username='pixelatedbrian', api_key='GbiE5PkSLPv9HjnrU7E8')

#### Load data and light processing

In [2]:
train = pd.read_csv('../data/train.csv').fillna(' ')
test = pd.read_csv('../data/test.csv').fillna(' ')

train_text = train['comment_text']
test_text = test['comment_text']
all_text = pd.concat([train_text, test_text])

#### Vectorize words from both corpuses (corpi?)

In [10]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=25000)    # 10k was initial

word_vectorizer.fit(all_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=25000, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents='unicode', sublinear_tf=True,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=True,
        vocabulary=None)

In [11]:
train_word_features = word_vectorizer.transform(train_text)
test_word_features = word_vectorizer.transform(test_text)

print("train shape:", train_word_features.shape)
print("test shape:", test_word_features.shape)

train shape: (159571, 25000)
test shape: (153164, 25000)


In [12]:
proto = train.iloc[:100, :]

rage = train.loc[np.sum(train.iloc[:,2:], axis=1) >= 1]

# this list of messages is much bigger, and also get unflagged ones, not just random ones
calm = train.loc[np.sum(train.iloc[:,2:], axis=1) == 0]


print(calm.shape)
boink = np.random.permutation(10000)

# boink = boink[:5000]

print(boink.shape)
print(boink[:10])

calm = pd.DataFrame(np.take(calm.values, boink[:5000], axis=0), columns=rage.columns)
rage = pd.DataFrame(np.take(rage.values, boink, axis=0), columns=calm.columns)

print(calm.shape)

(143346, 8)
(10000,)
[3231 4769 5085 8203 3333 5850 8748 5249 3098 6608]
(5000, 8)


In [13]:
rage.info()

print(rage.columns)
calm.info()

calm.keys = rage.keys

rage_vec_words = word_vectorizer.transform(rage.loc[:, "comment_text"])
calm_vec_words = word_vectorizer.transform(calm.loc[:, "comment_text"])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 8 columns):
id               10000 non-null object
comment_text     10000 non-null object
toxic            10000 non-null object
severe_toxic     10000 non-null object
obscene          10000 non-null object
threat           10000 non-null object
insult           10000 non-null object
identity_hate    10000 non-null object
dtypes: object(8)
memory usage: 625.1+ KB
Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 8 columns):
id               5000 non-null object
comment_text     5000 non-null object
toxic            5000 non-null object
severe_toxic     5000 non-null object
obscene          5000 non-null object
threat           5000 non-null object
insult           5000 non-null object
identity_hate    5000 non-null obje

In [14]:
rager = rage_vec_words.todense()
calms = calm_vec_words.todense()

calm_rage = np.vstack((calms, rager))

print(calm_rage.shape)
# pca_snip = calm_rage.todense()
pca = PCA(n_components=250)
pca_result = pca.fit_transform(calm_rage)

(15000, 25000)


In [15]:
np.sum(pca.explained_variance_ratio_)

0.29111845303576656

In [16]:
tsne = TSNE(n_components=3, verbose=1, perplexity=40, n_iter=300)
tsne_results = tsne.fit_transform(pca_result)

[t-SNE] Computing 121 nearest neighbors...
[t-SNE] Indexed 15000 samples in 0.203s...
[t-SNE] Computed neighbors for 15000 samples in 95.662s...
[t-SNE] Computed conditional probabilities for sample 1000 / 15000
[t-SNE] Computed conditional probabilities for sample 2000 / 15000
[t-SNE] Computed conditional probabilities for sample 3000 / 15000
[t-SNE] Computed conditional probabilities for sample 4000 / 15000
[t-SNE] Computed conditional probabilities for sample 5000 / 15000
[t-SNE] Computed conditional probabilities for sample 6000 / 15000
[t-SNE] Computed conditional probabilities for sample 7000 / 15000
[t-SNE] Computed conditional probabilities for sample 8000 / 15000
[t-SNE] Computed conditional probabilities for sample 9000 / 15000
[t-SNE] Computed conditional probabilities for sample 10000 / 15000
[t-SNE] Computed conditional probabilities for sample 11000 / 15000
[t-SNE] Computed conditional probabilities for sample 12000 / 15000
[t-SNE] Computed conditional probabilities for s

In [17]:
x_dim = tsne_results[:, 0]
y_dim = tsne_results[:, 1]
z_dim = tsne_results[:, 2]

In [18]:
calm_x = x_dim[10000:-1]
calm_y = y_dim[10000:-1]
calm_z = z_dim[10000:-1]

rage_x = x_dim[:10000]
rage_y = y_dim[:10000]
rage_z = z_dim[:10000]

In [20]:
rage_x.shape

(10000,)

In [45]:
##################################3
### For non-toxic class     #######
###################################

for aaa, ang in enumerate(range(0, 360, 1)):
    fig = plt.figure(figsize=(20, 20))
    ax = fig.add_subplot(111, projection='3d')

#     fig, ax = plt.subplots(figsize=(10,10))
    
    COUNT = 5000
    
    ax.scatter(calm_x[:COUNT], calm_y[:COUNT], calm_z[:COUNT],
               zdir='z',
               cmap="viridis",
               c=calm_z[:COUNT], s=200, label="Non-Toxic", alpha=0.3)
    
#     for idx in range(2, 8):
#         idy = idx - 2
#         if idy != 0 and idy != 4:
#         #     # for col in colors:
#         #     temp = indices[labels[:,idx]==1]
#         #     temp = temp[:3000]

#             ax.scatter(
#                     rage_x[rage.iloc[:,idx]==1][:counts[idy]],
#                     rage_y[rage.iloc[:,idx]==1][:counts[idy]],
#                     rage_z[rage.iloc[:,idx]==1][:counts[idy]],
#                     zdir='z',
#                     color=colors[idy],
#                     label=class_names[idy],
#                     s=20,
#                     marker="o",
#                     alpha=0.1)
#             ax.set_xticks([])
#             ax.set_yticks([])
#             ax.set_zticks([])
    ax.set_xlim(-3.0, 3.0)
    ax.set_ylim(-4, 3)
    ax.set_zlim(-3, 3)
    ax.w_xaxis.set_pane_color((0.98, 0.98, 0.98, 1.0))
    ax.w_yaxis.set_pane_color((0.98, 0.98, 0.98, 1.0))
    ax.w_zaxis.set_pane_color((0.98, 0.98, 0.98, 1.0))
    ax.set_axis_off()
    ax.autoscale_view(tight=True)

    ax.legend(fontsize=32, loc=1)
    ax.view_init(15 + 15 * np.sin(ang * np.pi / 180), ang)
    
#     fig = plt.gcf()
#     py.plot_mpl(fig, filename="mpl-colormaps-simple")
    
    filename = "../imgs/{:03d}.png".format(aaa)
    plt.savefig(filename, dpi=100)
    plt.gca()
    plt.clf()
    plt.close()

In [53]:
rage[rage.loc[:,'toxic']==1].head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,527218d0d2230e7a,FUCK YOU FUCK YOU FUCK YOU FUCK YOU FUCK YOU,1,0,1,0,1,0
1,7a9ed4958bdf833a,", who blantently privledge shit over quality",1,0,0,0,0,0
2,83a45c3f8fda2e5a,"""\n\n Hey...... I did NOT attack other wikiped...",1,0,0,0,0,0
3,d5aca4e1da09a8e7,"""\n\nIT IS TEH SAME DANCE. IT IS JUST BECAUSE ...",1,0,0,0,0,0
4,55658093e3ad479c,"Welcome, Korean assh*le's page.\nWhy don't you...",1,0,0,0,0,0


In [169]:
###################################
### For toxic type classes  #######
###################################

for aaa, ang in enumerate(range(0, 360, 1)):
    
    fig = plt.figure(figsize=(20, 10))
    gs = gridspec.GridSpec(3, 4)  # allow the merging of plots
#     gs.update(left=-0.5, right=0.05, wspace=0.0, hspace=0.0)
    ax = plt.subplot(gs[:,0:2], projection='3d')
    

#     fig, ax = plt.subplots(figsize=(10,10))
    
    COUNT = 5000
    
    ax.scatter(calm_x[:COUNT], calm_y[:COUNT], calm_z[:COUNT],
               zdir='z',
               cmap="viridis",
               c=calm_z[:COUNT], s=75, label="Non-Toxic", alpha=1.0)
    
    ax.set_xlim(-3.0, 3.0)
    ax.set_ylim(-4, 3)
    ax.set_zlim(-3, 3)
    ax.w_xaxis.set_pane_color((0.98, 0.98, 0.98, 1.0))
    ax.w_yaxis.set_pane_color((0.98, 0.98, 0.98, 1.0))
    ax.w_zaxis.set_pane_color((0.98, 0.98, 0.98, 1.0))
    
#     ax.set_facecolor("black")
    
    ax.set_axis_off()
    ax.autoscale_view(tight=True)
    ax.view_init(15 + 15 * np.sin(ang * np.pi / 180), ang)
    ax.set_title("Non-Toxic Comment")
    
        # help organize different characteristics
    class_names = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
    
    for class_num, _class in enumerate(class_names):
        _colors = ["Greens_r", "summer", "spring", "afmhot", "copper", "cool"]

        sizes = [20, 40, 20, 40, 20, 40]
        alphas = [0.75, 0.9, 0.75, 0.75, 0.75, 0.9]
        markers = ["o", "o", "^", "X", "o", "^"]
        counts = [20, 40, 20, 40, 20, 40]


        COUNT = 5000
        
        ################################
        ### Subplot maneuvering  #######
        ################################        
        
        # could do an alg or could be like hecka lazy
        plt_rows = [0, 0, 1, 1, 2, 2]
        plt_cols = [2, 3, 2, 3, 2, 3]
        
        ax = plt.subplot(gs[plt_rows[class_num], plt_cols[class_num]], projection='3d')


        idx = class_num
        idy = idx + 2

        sampler = np.random.permutation(COUNT)
        class_matches = rage.iloc[:, idy]==1   # big list of booleans to filter 8 columns to the topic wanted

        temp_x = rage_x[rage.iloc[:, idy]==1]
        temp_y = rage_y[rage.iloc[:, idy]==1]
        temp_z = rage_z[rage.iloc[:, idy]==1]
    #     print("shape of temp_x", temp_x.shape)

        # if the size of the matches is greater than the count
        # then subsample using the sampler
        if temp_x.shape[0] > COUNT:
            temp_x = np.take(temp_x, sampler)
            temp_y = np.take(temp_y, sampler)
            temp_z = np.take(temp_z, sampler)


        v_offset = np.min(temp_z)

        c_off = 1.0
        _c = (temp_z - v_offset)/2 + c_off

    #     print("max c", np.max(_c))

        _c[np.argmax(_c)] = 7 + c_off   # set a nail to stretch the cmap to the look that we want
        _c[np.argmin(_c)] = 0   # set a lower one too

        ax.scatter(temp_x,
                   temp_y,
                   temp_z,
                   zdir='z',
                   cmap=_colors[idx],
                   c=_c,
                   s=sizes[idx],
    #                norm=colors.PowerNorm(gamma=3.25/3.5),
#                    label=class_names[idx],
                   alpha=alphas[idx])

        ax.set_title(class_names[idx])
        ax.set_xlim(-3.0, 3.0)
        ax.set_ylim(-4, 3)
        ax.set_zlim(-3, 3)
    #     ax.w_xaxis.set_pane_color((0.98, 0.98, 0.98, 1.0))
    #     ax.w_yaxis.set_pane_color((0.98, 0.98, 0.98, 1.0))
    #     ax.w_zaxis.set_pane_color((0.98, 0.98, 0.98, 1.0))
        ax.set_axis_off()
        ax.autoscale_view(tight=True)

#         ax.set_facecolor("black")
        
#         ax.legend(fontsize=14, loc=1)
        ax.view_init(15 + 15 * np.sin(ang * np.pi / 180), ang)

    
#     ax = fig.add_subplot(1, 2, 2, projection='3d')

#     COUNT = 5000

    plt.subplots_adjust(top=1.0, bottom=0.0, left=0.0, right=1.0, wspace=0.0, hspace=0.0)
    filename = "../imgs/{:03d}.png".format(aaa)
    plt.savefig(filename, dpi=100)
#     plt.gca()
#     plt.clf()
    plt.close()

In [119]:
x_dim[rage.iloc[:,2]==1]
# x_dim[rage.iloc[:,2]==1][:300]

array([-3.8704715 , -0.02555365, -2.241838  , ...,  0.4592499 ,
        0.36266276,  1.9928123 ], dtype=float32)

In [55]:
# %%opts Scatter3D [width=800 height=800 camera_zoom=200 alpha=0.5 color_index=2] (size=10 cmap='viridis')

# hv.Scatter3D(zip(calm_x[:3000],
#             calm_y[:3000],
#             calm_z[:3000]))

# hv.Scatter3D(zip(rage_x[rage.iloc[:,0]==1], rage_y[rage.iloc[:,0]==1], rage_z[rage.iloc[:,0]==1]))

In [121]:
for x in range(6):
    print((x % 3) + 1)

1
2
3
1
2
3
