In [1]:
from wordview.text_analysis import TextStatsPlots, LabelStatsPlots
from wordview.anomaly import NormalDistAnomalies
import pandas as pd
import json
from tabulate import tabulate
from sklearn.feature_extraction.text import TfidfVectorizer
import plotly.figure_factory as ff

In [2]:
# Load the tab-separated IMDB sample; column names are supplied explicitly
# as 'label' and 'text'. Only 100 randomly chosen rows are kept.
# random_state pins the subset so that "Restart & Run All" reproduces the
# same rows (and therefore the same statistics and plots) on every run.
imdb_train = pd.read_csv(
    '../data/imdb_train_sample.tsv', sep='\t', names=['label', 'text']
).sample(n=100, random_state=42)

In [3]:
# Build the text-statistics/plotting helper over the 'text' column of the
# sampled frame. Per the captured log output, construction already processes
# the text and computes the empirical and theoretical Zipf values.
tsp = TextStatsPlots(df=imdb_train, text_column='text')

core            - 258 - INFO - Processing text in text column of the input DataFrame...
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:02<00:00, 39.79it/s]
core            - 299 - INFO - Calculating Empirical and Theoretical Zipf values...
core            - 304 - INFO - Time to measure predicted proportion for 5125 rows: 0.005500316619873047


In [7]:
# Word cloud for the 'NN' tag; wc_settings requests color 'red' and
# max_words 1000, go_plot_settings requests a transparent plot background.
# The dict literal previously listed 'plot_bgcolor' twice; Python keeps only
# the last duplicate key, so a single entry is equivalent.
# NOTE(review): the second key may have been meant to be 'paper_bgcolor' — confirm.
tsp.show_word_clouds('NN',
                    go_plot_settings={'plot_bgcolor': 'rgba(0, 0, 0, 0)'},
                    wc_settings={'color':'red', 'max_words':1000})

In [8]:
# Same word cloud for pos="NN", this time with color 'deepskyblue' and
# max_words 10. The duplicated 'plot_bgcolor' key was collapsed to one entry
# (Python keeps only the last duplicate, so the settings are unchanged).
# NOTE(review): the second key may have been meant to be 'paper_bgcolor' — confirm.
tsp.show_word_clouds(pos="NN",
                     go_plot_settings={'plot_bgcolor': 'rgba(0, 0, 0, 0)'},
                     wc_settings={'color':'deepskyblue', 'max_words':10}
                    )

In [None]:
# Show the distribution plot selected by the 'doc_len' key
# (presumably document lengths — confirm against the wordview docs).
tsp.show_distplot('doc_len')

In [None]:
# Choose the per-token score that is fed to the anomaly detection below.
#
# Option 1 (disabled): IDF weights from scikit-learn's TfidfVectorizer.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() — confirm before re-enabling this variant.
# vectorizer = TfidfVectorizer(min_df=1)
# X = vectorizer.fit_transform(imdb_train["text"])
# idf = vectorizer.idf_
# token_score_dict = dict(zip(vectorizer.get_feature_names(), idf))

# Option 2 (active): the token -> count mapping exposed by the tsp analysis object.
token_score_dict = tsp.analysis.token_to_count_dict

In [None]:
# Build the anomaly helper from the token scores chosen above, requesting
# the 'brute' gaussianization strategy.
nda = NormalDistAnomalies(items=token_score_dict, gaussianization_strategy='brute')

In [None]:
# Preview up to the first 1000 rows of the item/value table held by nda.
nda.item_value_df.head(1000)

In [None]:
# Plot with an explicit type='normal' and a bin size of 1.
nda.show_plot(type='normal', bin_size=1)

In [None]:
# Same plot using the library's default arguments, for comparison.
nda.show_plot()

## Anomaly Experiments

In [None]:
from wordview.anomaly import gaussianize
from numpy.random import randn
# shapiro() is called at the end of this cell; importing it here keeps the
# cell runnable on a fresh kernel instead of raising NameError.
from scipy.stats import shapiro

# Inputs for the experiment: the two value columns of the anomaly table.
# values = data = randn(1000)
values = nda.item_value_df['guassian_values']
values2 = nda.item_value_df['representative_value']

# Fit a Gaussianize transformer with the 'lambert' strategy and apply it
# to the same values it was fitted on.
g = gaussianize.Gaussianize(strategy='lambert')
g.fit(values)
res = g.transform(values)
# res = res.flatten().tolist()


# p-value of the Shapiro-Wilk normality test on the transformed values.
print(shapiro(res).pvalue)

def show_fig(x, curve_type):
    """Draw and display a single-series distribution plot.

    Args:
        x: The values to plot; passed to plotly as the only histogram series.
        curve_type: Forwarded to ``create_distplot`` as ``curve_type``,
            overriding plotly's default of 'kde'.
    """
    distplot_options = {
        'group_labels': ['Value'],
        'bin_size': 1,
        'curve_type': curve_type,
    }
    distplot = ff.create_distplot([x], **distplot_options)
    distplot.show()

# show_fig(x=values, curve_type='normal')
# show_fig(x=values2, curve_type='kde')


# print(values)
# print('==========')
# print(res.flatten().tolist())

### Using the Shapiro-Wilk normality test

In [None]:
# from scipy.stats import norm
from scipy.stats import shapiro

# Shapiro-Wilk normality test on the 'representative_value' column;
# the test result is the cell's displayed output.
my_data = nda.item_value_df['representative_value']
shapiro(my_data.to_list())

In [None]:
# Same Shapiro-Wilk test on the 'guassian_values' column, for comparison
# with the 'representative_value' result.
my_data = nda.item_value_df['guassian_values']
shapiro(my_data.to_list())

In [None]:
from numpy.random import randn
# kstest was never imported anywhere in the notebook, and shapiro only in
# another cell; importing both here makes this cell self-contained.
from scipy.stats import kstest, shapiro

# Sanity check: run both normality tests on 1000 draws from a standard
# normal distribution, as a baseline for reading the results on real data.
my_data = randn(1000)
print(shapiro(my_data))
print(kstest(my_data, 'norm'))

In [None]:
# kstest was never imported anywhere in the notebook; import it (and shapiro)
# here so this cell does not raise NameError.
from scipy.stats import kstest, shapiro

# Run both normality tests on the 'guassian_values' column, then plot the
# same column against a fitted normal curve.
print(shapiro(nda.item_value_df['guassian_values'].tolist()))
print(kstest(nda.item_value_df['guassian_values'].tolist(), 'norm'))
show_fig(x=nda.item_value_df['guassian_values'], curve_type='normal')

### Using Plotly Express (`px`)

In [None]:
import plotly.express as px
# Plotly Express demo on its bundled "tips" dataset: histogram over
# 'total_bill' with 'tip' as y, colored by 'sex', with a rug marginal and
# every column available on hover.
df = px.data.tips()
fig = px.histogram(df, x="total_bill", y="tip", color="sex", marginal="rug",
                   hover_data=df.columns)
fig.show()
# Last expression: preview of the demo frame.
df.head()

In [None]:
# Histograms of the two value columns of the anomaly table, shown one after
# the other for visual comparison. 'guassian_values' is the column name as
# spelled throughout this notebook.
fig = px.histogram(nda.item_value_df, x="guassian_values")
fig.show()
fig2 = px.histogram(nda.item_value_df, x="representative_value")
fig2.show()

## Text Analysis

In [None]:
# Display the summary statistics computed by the text-analysis helper.
tsp.show_stats()

In [None]:
# Distribution plot selected by the 'word_frequency_zipf' key; the 'doc_len'
# variant is kept below, disabled, as an alternative.
# tsp.show_distplot(plot='doc_len')
tsp.show_distplot(plot='word_frequency_zipf')

In [None]:
# Word cloud for the "JJ" tag. The tag is passed as `pos=`, matching the
# executed word-cloud cell earlier in this notebook, rather than `type=`.
# NOTE(review): confirm the keyword name against the installed wordview version.
tsp.show_word_clouds(pos="JJ")

In [None]:
# import plotly.graph_objs as go
# import plotly.figure_factory as ff
# fig_w_freq = go.Figure()
# fig_w_freq.add_trace(go.Scattergl(x=tsp.analysis.zipf_x,
#                                               y=tsp.analysis.zipf_y_emp,
#                                               mode='markers',
#                                               marker=dict(
#                                                     color=tsp.analysis.zipf_x,
#                                                     colorscale='Tealgrn',
#                                                     )
#                                             )
#                                 )
# fig_w_freq.add_trace(go.Scattergl(x=tsp.analysis.zipf_x,
#                                   y=tsp.analysis.zipf_y_theory,
#                                   mode='markers',
#                                   marker=dict(color=tsp.analysis.zipf_x,
#                                               colorscale='Reds'
#                                              )
#                                  )
#                     )

# dist_plot_setup = {
#             # 'paper_bgcolor': '#007A78',
#             'showlegend' : False
#             }
# fig_w_freq.update_layout(dist_plot_setup)
# fig_w_freq.show()

In [None]:
import numpy as np
import random

# Dedicated, seeded generators so the synthetic labels are identical on every
# run without touching the global random state.
SEED = 42
rng = np.random.default_rng(SEED)
py_rng = random.Random(SEED)
n_rows = imdb_train.shape[0]

# In addition to the original label, for illustration purposes, create three
# random label columns: two numerical ones (integers in [1, 500)) and one
# categorical one drawn from 'a'..'d' with relative weights 0.2/0.5/0.8/0.9.
imdb_train['numerical_label'] = rng.integers(1, 500, size=n_rows)
imdb_train['label2'] = py_rng.choices(['a', 'b', 'c', 'd'], weights=[0.2, 0.5, 0.8, 0.9], k=n_rows)
imdb_train['numerical_label2'] = rng.integers(1, 500, size=n_rows)

In [None]:
# Build the label-statistics helper. Each entry pairs a column name with the
# kind of label it holds: two categorical columns and two numerical ones.
lsp = LabelStatsPlots(df=imdb_train, label_columns=[('label', 'categorical'),
                                                    ('label2', 'categorical'),
                                                    ('numerical_label', 'numerical'),
                                                    ('numerical_label2', 'numerical')
                                                   ]
                     )

In [None]:
# Render the plots for the label columns registered on lsp.
lsp.show_label_plots()

In [None]:
# lsp.labels_fig.write_html('tmp.html')

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots
# Histogram of the synthetic 'numerical_label2' column with thin white bar
# outlines. The column name was previously misspelled as 'numerical_labe2',
# which raised KeyError.
t1 = go.Histogram(x=imdb_train['numerical_label2'],
                  marker=dict(line=dict(width=0.8,
                                        color="white")
                             )
                 )
# Single-cell subplot grid; add_trace(row=, col=) replaces the deprecated
# append_trace.
figure = make_subplots(rows=1, cols=1)
figure.add_trace(t1, row=1, col=1)
figure.show()

In [None]:
# Histogram of the synthetic 'numerical_label2' column with thin white bar
# outlines. The column name was previously misspelled as 'numerical_labe2',
# which raised KeyError.
t1 = go.Histogram(x=imdb_train['numerical_label2'],
                  marker=dict(line=dict(width=0.8,
                                        color="white")
                             )
                 )
# Single-cell subplot grid; add_trace(row=, col=) replaces the deprecated
# append_trace.
figure = make_subplots(rows=1, cols=1)
figure.add_trace(t1, row=1, col=1)
figure.show()

In [None]:
import plotly.express as px

# Box plot of how often each label value occurs.
# 'new_label' is not a column created anywhere in this notebook (the visible
# cells create 'label', 'label2', 'numerical_label' and 'numerical_label2'),
# so the lookup raised KeyError.
# NOTE(review): 'label2' is assumed to be the intended categorical column — confirm.
# value_counts() returns a Series, so assigning `.columns` on it did not
# rename anything; build an explicit two-column frame instead.
res = (
    imdb_train['label2']
    .value_counts()
    .rename_axis('label')
    .reset_index(name='count')
)
fig = px.box(res, y='count')

fig.update_traces(quartilemethod="exclusive") # or "inclusive", or "linear" by default
fig.show()

In [None]:
from wordview.mwes import MWE

In [None]:
# Set up multi-word-expression extraction over the 'text' column for the
# "NC" and "JNC" MWE types.
mwe = MWE(df=imdb_train, mwe_types=["NC", "JNC"], text_column='text')

In [None]:
# Build the counts and store them at ../tmp/counts.json; the extraction
# cells below read this file back via counts_filename.
mwe.build_counts(counts_filename='../tmp/counts.json')

In [None]:
# Extract MWEs from the stored counts (results also go to ../tmp/mwes1.json),
# then tabulate the first ten "NC" entries in the order the mapping yields them.
mwes_dict = mwe.extract_mwes(
    counts_filename='../tmp/counts.json',
    mwes_filename='../tmp/mwes1.json',
)
mwes_nc = dict(mwes_dict['NC'].items())
top_mwes_nc = [[name, score] for name, score in list(mwes_nc.items())[:10]]
print(tabulate(top_mwes_nc, tablefmt="double_outline"))

In [None]:
# Run the extraction again, this time writing to ../tmp/mwes.json — the file
# that the hyphenation and JSON-loading cells below read.
mwe.extract_mwes(counts_filename='../tmp/counts.json', mwes_filename='../tmp/mwes.json')

In [None]:
from wordview.mwes import hyphenate_mwes

In [None]:
# Apply hyphenate_mwes to the 'text' column using the stored "NC" and "JNC"
# MWEs; the result is kept in a separate frame.
new_df = hyphenate_mwes(path_to_mwes='../tmp/mwes.json', mwe_types=['NC', 'JNC'], df=imdb_train, text_column='text')

In [None]:
from wordview.preprocessing import RedunTerms
# Build the redundant-terms helper over the raw review texts using the
# 'idf' method.
rt = RedunTerms(imdb_train["text"], method='idf')

In [None]:
# Read back the two JSON artifacts written by the MWE cells above:
# the raw counts first, then the extracted MWEs.
with open('../tmp/counts.json') as counts_handle:
    counts_dict = json.load(counts_handle)

with open('../tmp/mwes.json') as mwes_handle:
    mwes_dict = json.load(mwes_handle)

In [None]:
# "NC" counts ordered from the largest value to the smallest, and a plain
# dict copy of the "NC" entries from the MWE file (order left as loaded).
nc_counts = dict(
    sorted(counts_dict['NC'].items(), key=lambda pair: pair[1], reverse=True)
)
nc_association = dict(mwes_dict['NC'].items())

In [None]:
# First ten [key, value] rows of each mapping, in the mapping's own order,
# ready to be passed to tabulate.
top_nc_table = [[name, count] for name, count in list(nc_counts.items())[:10]]
top_nc_association_table = [[name, score] for name, score in list(nc_association.items())[:10]]

In [None]:
# Print the top "NC" entries by count using tabulate's "simple_grid" format.
print(tabulate(top_nc_table, tablefmt="simple_grid"))

In [None]:
# Print the first "NC" entries from the MWE file; the table format is kept
# in a variable so it can be switched (alternative noted inline).
tformat = "double_outline" #simple_grid"
print(tabulate(top_nc_association_table, tablefmt=tformat))