## Installtion and Links

In [19]:
# pip install wordview
# GitHub: https://github.com/meghdadFar/wordview
# Demo: https://github.com/meghdadFar/wordview/blob/main/notebooks/demo.ipynb
# Docs: https://github.com/meghdadFar/wordview/tree/main/docs

## Imports

In [1]:
import pandas as pd
import numpy as np
import random
import json
from wordview.text_analysis import TextStatsPlots, LabelStatsPlots

# Ensure plots are properly rendered in Jupyter Lab
import plotly.io as pio
pio.renderers.default = 'iframe'

## Data

In [2]:
imdb_df = pd.read_csv("../data/IMDB_Dataset_sample_5k.csv")

## General Analysis of Text and Labels

In [22]:
tsp = TextStatsPlots(df=imdb_df, text_column='review')

core            - 313 - INFO - Processing text in review column of the input DataFrame...
100%|█████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [00:30<00:00, 162.53it/s]
core            - 354 - INFO - Calculating Empirical and Theoretical Zipf values...
core            - 359 - INFO - Time to measure predicted proportion for 91519 rows: 0.02927112579345703


### Describe

In [23]:
tsp.show_stats()

┌───────────────────┬─────────┐
│ Language/s        │ EN      │
├───────────────────┼─────────┤
│ Unique Words      │ 91,519  │
├───────────────────┼─────────┤
│ All Words         │ 625,022 │
├───────────────────┼─────────┤
│ Documents         │ 5,000   │
├───────────────────┼─────────┤
│ Median Doc Length │ 174.0   │
├───────────────────┼─────────┤
│ Nouns             │ 57,001  │
├───────────────────┼─────────┤
│ Adjectives        │ 32,191  │
├───────────────────┼─────────┤
│ Verbs             │ 21,244  │
└───────────────────┴─────────┘


### Distributions

In [24]:
# tsp.show_distplot('doc_len')

# You can customize the layout using the argument: `layout_settings`.
# E.g.
layout_settings = {'plot_bgcolor':'rgba(245, 245, 245, 1)',
                   'paper_bgcolor': 'rgba(255, 255, 255, 1)',
                   'hovermode': 'y'
                  }
# For a full list of possible options, see:
# https://plotly.com/python/reference/layout/


plot_settings = {'color': 'chartreuse'}
# aliceblue, antiquewhite, aqua, aquamarine, azure,
# beige, bisque, black, blanchedalmond, blue,
# blueviolet, brown, burlywood, cadetblue,
# chartreuse, chocolate, coral, cornflowerblue,
# cornsilk, crimson, cyan, darkblue, darkcyan,
# darkgoldenrod, darkgray, darkgrey, darkgreen,
tsp.show_distplot('doc_len',
                  layout_settings=layout_settings,
                  plot_settings=plot_settings)

In [25]:
plot_settings = {'theoritical_zipf_colorscale': 'Reds',
                 'emperical_zipf_colorscale': 'Greens',
                 'mode': 'markers'}
layout_settings = {'plot_bgcolor':'rgba(245, 245, 245, 1)',
                   'paper_bgcolor': 'rgba(255, 255, 255, 1)',
                   'hovermode': 'y'
                  }
tsp.show_distplot('word_frequency_zipf',
                  layout_settings=layout_settings,
                  plot_settings=plot_settings)
# https://medium.com/@_init_/using-zipfs-law-to-improve-neural-language-models-4c3d66e6d2f6

### Word Clouds

In [26]:
layout_settings = {'plot_bgcolor':'rgba(245, 245, 245, 1)',
                   'paper_bgcolor': 'rgba(255, 255, 255, 1)',
                   'hovermode': 'y'
                  }

plot_settings = {'color': 'darkgreen',
                 'max_words': 200}
tsp.show_word_clouds('VB', layout_settings=layout_settings, plot_settings=plot_settings)
# tsp.show_word_clouds('NN')
# tsp.show_word_clouds('JJ')

### Labels

In [27]:
lsp = LabelStatsPlots(df=imdb_df, label_columns=[('sentiment', 'categorical')])
lsp.show_label_plots()

In [28]:
imdb_df['numerical_label'] = np.random.randint(1, 500, imdb_df.shape[0])
imdb_df['label2'] = random.choices(['a', 'b', 'c', 'd'], [0.2, 0.5, 0.8, 0.9], k=imdb_df.shape[0])
imdb_df['numerical_label2'] = np.random.randint(1, 500, imdb_df.shape[0])
lsp = LabelStatsPlots(df=imdb_df, label_columns=[('sentiment', 'categorical'),
                                                    ('label2', 'categorical'),
                                                    ('numerical_label', 'numerical'),
                                                    ('numerical_label2', 'numerical')
                                                   ]
                     )
lsp.show_label_plots()

In [29]:
layout_settings = {'plot_bgcolor':'rgba(245, 245, 245, 1)',
                   'paper_bgcolor': 'rgba(255, 255, 255, 1)',
                   'hovermode': 'y',
                   'coloraxis': {'colorscale': 'peach'},
                   'coloraxis_showscale':True
                  }
# See here for a list of named color scales:
# https://plotly.com/python/builtin-colorscales/
lsp.show_label_plots(layout_settings=layout_settings)

## Extraction & Analysis of MWEs

In [2]:
from wordview.mwes import MWE
from wordview.preprocessing import NgramExtractor

In [3]:
# If Ngram extraction was not carried out before, we need to run it here as its results are required by MWE
extractor = NgramExtractor(imdb_df, "review")
extractor.extract_ngrams()
ngram_counts = extractor.get_ngram_counts()

NameError: name 'imdb_df' is not defined

In [32]:
mwe_obj = MWE(imdb_df, 'review',
              ngram_count_source=ngram_counts,
              language='EN', 
              custom_patterns="NP: {<DT>?<JJ>*<NN>}",
              only_custom_patterns=False,
              )
# You can also pass ngram_count_file_path='data/ngram_counts.json' instead.
# If no custom_pattern is passed, LVC, 2-3 word NC, ADJ-NC, and VPCs are returnd (up to 4 grams)

In [33]:
mwes = mwe_obj.extract_mwes(sort=True, top_n=10)
#  If you want, you can store the MWE file in a json or any other format
# json.dump(mwes, open('data/mwes.json', 'w'), indent=4)
mwe_obj.print_mwe_table()

list index out of range.                            
Skipping this sentence.
list index out of range.                            
Skipping this sentence.
52984it [00:38, 1380.98it/s]

╔════════════════════════╦═══════════════╗
║ LVC                    ║   Association ║
╠════════════════════════╬═══════════════╣
║ predate both Click     ║         34.9  ║
║ favorize any sex.      ║         32.66 ║
║ fight tooth decay      ║         32.17 ║
║ sue these a-holes      ║         31.24 ║
║ spur some enterprising ║         31.12 ║
║ undergo any catharsis  ║         30.66 ║
║ outsmart this egghead  ║         30.64 ║
║ apply any sterner      ║         30.33 ║
║ grace another set.I    ║         29.63 ║
║ announce another PPV   ║         29.59 ║
╚════════════════════════╩═══════════════╝

╔════════════════════════╦═══════════════╗
║ NC2                    ║   Association ║
╠════════════════════════╬═══════════════╣
║ fantasyland intermixed ║         22.08 ║
║ ducat ducats           ║         22.08 ║
║ liver pate             ║         22.08 ║
║ chenshaw melon         ║         22.08 ║
║ raisin loaf            ║         22.08 ║
║ enemiesthe Ancients                        ║      




## Bias Analysis

**Important Note**: Please note that the following examples with biases are constructed using an LLM to illustrate the functionality of the bias detection module. They reflect negative stereotypes or biases and are not to be endorsed or perpetuated. 

In [29]:
biased_df = pd.DataFrame(
    {
        "text": [
            # Gender
            "He is always so successful because men are naturally gifted.",
            "She is too emotional to lead; women cannot handle pressure.",
            # Racial
            "Asians are brilliant at mathematics.",
            "Black people are not fit for this kind of job.",
            "Indian food is too spicy, and their traditions are too complex to understand.",
            "Latinos are one of the many ethnic groups in the region.",  # neutral
            "White people can be found living in many regions around the world."
            # Religious
            "Christians are the only ones who know the true path to salvation.",
            "Muslims cannot be trusted in our community.",
            "Atheists often have a logical and evidence-based approach to understanding the world.",
        ]
    }
)

In [30]:
from wordview.bias_analysis  import BiasDetector
bias_detector = BiasDetector(biased_df, "text")
bias_detector.detect_bias()

SentenceTransformer - 66 - INFO - Load pretrained SentenceTransformer: distiluse-base-multilingual-cased-v2
SentenceTransformer - 105 - INFO - Use pytorch device: cpu
9it [00:00, 99.87it/s]
9it [00:00, 107.93it/s]
9it [00:00, 12783.18it/s]
9it [00:00, 12927.65it/s]
9it [00:00, 115.63it/s]
9it [00:00, 221.36it/s]
9it [00:00, 227.06it/s]
9it [00:00, 217.92it/s]
9it [00:00, 200.17it/s]
9it [00:00, 13467.26it/s]
9it [00:00, 12663.11it/s]
9it [00:00, 226.63it/s]
9it [00:00, 11278.38it/s]
9it [00:00, 13212.72it/s]
9it [00:00, 15147.97it/s]
9it [00:00, 211.88it/s]


{'gender': {'male': 4.0,
  'female': 1.0,
  'transgender': '-inf',
  'nonbinary': '-inf'},
 'racial': {'white': 3.0,
  'black': 0.0,
  'asian': 4.0,
  'latino': 2.0,
  'indian': 1.0,
  'middle_eastern': '-inf'},
 'religion': {'christian': '-inf',
  'muslim': 0.0,
  'jew': '-inf',
  'hindu': '-inf',
  'buddhist': '-inf',
  'atheist': 2.0}}

In [32]:
cyan_scopecolorscale = [
    [0.0, "#E0FFFF"],  # Lightest Cyan
    [0.25, "#B3E4E4"],  # Lighter Cyan
    [0.5, "#66C2C2"],   # Neutral Cyan
    [0.75, "#339999"],  # Darker Cyan
    [1.0, "#006666"],   # Darkest Cyan
]

built_in_CS = "algae"

# Example of available colorscales are:
# 'aggrnyl', 'agsunset', 'algae', 'amp', 'armyrose', 'balance',
# 'blackbody', 'bluered', 'blues', 'blugrn', 'bluyl', 'brbg',
# 'brwnyl', 'bugn', 'bupu', 'burg', 'burgyl', 'cividis', 'curl',

# You can reverse a colorscale by appending an _r to it, e.g. 

built_in_CS = "algae_r"

# See here for a full list:
# https://plotly.com/python/builtin-colorscales/

bias_detector.show_bias_plot(colorscale=cyan_scopecolorscale)

In [19]:
bias_detector.print_bias_table()

╔═════════════╦═══════════════╗
║ Gender      ║ Bias          ║
╠═════════════╬═══════════════╣
║ Male        ║ Very Positive ║
║ Female      ║ Negative      ║
║ Transgender ║ Unknown       ║
║ Nonbinary   ║ Unknown       ║
╚═════════════╩═══════════════╝

╔════════════════╦═══════════════╗
║ Racial         ║ Bias          ║
╠════════════════╬═══════════════╣
║ White          ║ Very Positive ║
║ Black          ║ Very Negative ║
║ Asian          ║ Very Positive ║
║ Latino         ║ Neutral       ║
║ Indian         ║ Negative      ║
║ Middle_eastern ║ Unknown       ║
╚════════════════╩═══════════════╝

╔════════════╦═══════════════╗
║ Religion   ║ Bias          ║
╠════════════╬═══════════════╣
║ Christian  ║ Unknown       ║
║ Muslim     ║ Very Negative ║
║ Jew        ║ Unknown       ║
║ Hindu      ║ Unknown       ║
║ Buddhist   ║ Unknown       ║
║ Atheist    ║ Neutral       ║
╚════════════╩═══════════════╝
