## Installation and Links

In [None]:
# pip install wordview
# GitHub: https://github.com/meghdadFar/wordview
# Demo: https://github.com/meghdadFar/wordview/blob/main/notebooks/demo.ipynb
# Docs: https://github.com/meghdadFar/wordview/tree/main/docs

## Imports

In [1]:
import pandas as pd
import numpy as np
import random
import json

from wordview.text_analysis import TextStatsPlots, LabelStatsPlots

## Data

In [2]:
# Load the IMDB sample CSV; the path is resolved relative to the current working directory.
imdb_df = pd.read_csv(filepath_or_buffer="../data/IMDB_Dataset_sample_5k.csv")

## General Analysis of Text and Labels

In [3]:
# Analyze a 100-row random sample of the reviews. `random_state` is pinned so the
# same rows are drawn on every run, which keeps the statistics and plots below
# reproducible under Restart & Run All.
tsp = TextStatsPlots(
    df=imdb_df.sample(100, random_state=42),
    text_column='review',
)

core            - 272 - INFO - Processing text in review column of the input DataFrame...
100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 121.06it/s]
core            - 313 - INFO - Calculating Empirical and Theoretical Zipf values...
core            - 318 - INFO - Time to measure predicted proportion for 6140 rows: 0.0021560192108154297


### Distributions

In [14]:
# Distribution plot for the 'doc_len' statistic.
# The plain, unstyled call is: tsp.show_distplot('doc_len')
#
# The layout can be customized through the `layout_settings` argument.
# For a full list of possible options, see:
# https://plotly.com/python/reference/layout/
#
# `layout_settings` must be defined in this cell: it was previously only shown
# inside a comment, so the call below raised NameError on a fresh kernel.
layout_settings = {
    'plot_bgcolor': 'lightblue',
    'paper_bgcolor': 'rgba(170, 248, 246, 1)',
    'hovermode': 'y',
}

tsp.show_distplot('doc_len',
                  layout_settings=layout_settings)

In [None]:
# Distribution plot for the 'word_frequency_zipf' statistic of the sampled reviews.
tsp.show_distplot('word_frequency_zipf')
# Background reading on Zipf's law in language modelling:
# https://medium.com/@_init_/using-zipfs-law-to-improve-neural-language-models-4c3d66e6d2f6

### Word Clouds

In [None]:
# Word cloud for tokens tagged 'VB'.
# NOTE(review): the tags look like Penn Treebank POS tags (VB = verb, NN = noun,
# JJ = adjective) — confirm against the wordview documentation.
tsp.show_word_clouds('VB')
# Other tags to try:
# tsp.show_word_clouds('NN')
# tsp.show_word_clouds('JJ')
# Author's note on optional styling arguments (incomplete snippet, kept verbatim;
# verify the parameter names before using it):
# plot_settings={'plot_bgcolor': 'rgba(0, 0, 0, 0)'},
#                     wc_settings={'color':'blue', 'max_words':300}

### Labels

In [None]:
# Label statistics for a single label column: `sentiment`, declared as categorical.
lsp = LabelStatsPlots(
    df=imdb_df,
    label_columns=[("sentiment", "categorical")],
)
lsp.show_label_plots()

In [None]:
# Add synthetic label columns to demonstrate plotting several labels at once.
# Both random sources are seeded so the generated labels (and the resulting
# plots) are identical on every run.
random.seed(42)
np.random.seed(42)

# Uniform random integers in [1, 500) — the upper bound of randint is exclusive.
imdb_df['numerical_label'] = np.random.randint(1, 500, imdb_df.shape[0])
# `weights` are relative (they need not sum to 1), so 'd' is the most likely
# class and 'a' the least likely. They are NOT cumulative weights.
imdb_df['label2'] = random.choices(['a', 'b', 'c', 'd'],
                                   weights=[0.2, 0.5, 0.8, 0.9],
                                   k=imdb_df.shape[0])
imdb_df['numerical_label2'] = np.random.randint(1, 500, imdb_df.shape[0])

# Each entry is a (column name, label type) pair.
lsp = LabelStatsPlots(df=imdb_df, label_columns=[('sentiment', 'categorical'),
                                                    ('label2', 'categorical'),
                                                    ('numerical_label', 'numerical'),
                                                    ('numerical_label2', 'numerical')
                                                   ]
                     )
lsp.show_label_plots()

In [None]:
# Redraw the label plots, this time passing layout options as keyword arguments.
lsp.show_label_plots(
    plot_bgcolor="rgba(0, 0, 0, 0)",
    coloraxis={"colorscale": "Tealgrn"},
    coloraxis_showscale=False,
    showlegend=False,
)

## Extraction & Analysis of MWEs

In [None]:
from wordview.mwes import MWE
from wordview.preprocessing import NgramExtractor

In [None]:
# MWE extraction requires n-gram counts. If they were not computed earlier,
# build them here from the `review` column; the resulting `ngram_counts` is
# handed to MWE via `ngram_count_source` in the next cell.
extractor = NgramExtractor(imdb_df, "review")
extractor.extract_ngrams()
ngram_counts = extractor.get_ngram_counts()

In [None]:
# Configure MWE extraction over the `review` column, reusing the n-gram counts
# computed above and adding one custom chunk pattern on top of the defaults.
# You can also pass ngram_count_file_path='data/ngram_counts.json' instead of
# `ngram_count_source`.
# If no custom_pattern is passed, LVC, 2-3 word NC, ADJ-NC, and VPCs are returned (up to 4 grams).
mwe_obj = MWE(
    imdb_df,
    "review",
    ngram_count_source=ngram_counts,
    language="EN",
    custom_patterns="NP: {<DT>?<JJ>*<NN>}",
    only_custom_patterns=False,
)

In [None]:
# Run the extraction, requesting sorted output with top_n=10.
mwes = mwe_obj.extract_mwes(sort=True, top_n=10)
# Optionally persist the result, e.g. as JSON (assumes `mwes` is JSON-serializable):
# with open('data/mwes.json', 'w') as f:
#     json.dump(mwes, f, indent=4)
mwe_obj.print_mwe_table()

## Bias Analysis

**Important Note**: Please note that the following examples with biases are constructed using an LLM to illustrate the functionality of the bias detection module. They reflect negative stereotypes or biases and are not to be endorsed or perpetuated. 

In [None]:
# Small illustrative corpus for the bias detector: one sentence per row in a
# single `text` column, grouped below by the category each sentence targets.
biased_df = pd.DataFrame(
    data=[
        # Gender
        "He is always so successful because men are naturally gifted.",
        "She is too emotional to lead; women cannot handle pressure.",
        # Racial
        "Asians are brilliant at mathematics.",
        "Black people are not fit for this kind of job.",
        "Indian food is too spicy, and their traditions are too complex to understand.",
        "Latinos are one of the many ethnic groups in the region.",  # neutral
        # Religious
        "Christians are the only ones who know the true path to salvation.",
        "Muslims cannot be trusted in our community.",
        "Atheists often have a logical and evidence-based approach to understanding the world.",
    ],
    columns=["text"],
)

In [None]:
from wordview.bias_analysis import BiasDetector

# Build a detector over the `text` column of the example frame and run detection;
# the plot and table cells that follow use this same `bias_detector` object.
bias_detector = BiasDetector(biased_df, "text")
bias_detector.detect_bias()

In [None]:
# Plot the bias analysis for `bias_detector`.
# NOTE(review): presumably requires detect_bias() to have been run first — confirm.
bias_detector.show_bias_plot()

In [None]:
# Print the bias analysis for `bias_detector` as a table.
# NOTE(review): presumably requires detect_bias() to have been run first — confirm.
bias_detector.print_bias_table()