# RQ3 sampling method:
Here, we explore different possibilities of sampling:
1. Sampling by sentences word count
2. Sampling by sentence character count
2. Use of intervals of word count/character count vs quantiles selection

Requirements:
1. At least 33 sentences of each sentiment should be available since the total sample size used is 99.
2. Must have some support in metrics (no guessing values)
3. Prefer distant groups to improve prospects of results.

## Install plotly

In [50]:
%pip install plotly


[notice] A new release of pip available: 22.1.2 -> 22.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip
Note: you may need to restart the kernel to use updated packages.




## Plot histogram distribution for Word Count

In [71]:
import plotly.express as px
import pandas as pd

tweets = pd.read_csv('Tweets_dataset.csv')
tweets['word_count'] = tweets.text.apply(lambda t: len(t.split()))

tweets_sentiments_count = tweets.loc[ : , ['airline_sentiment', 'word_count']] \
                                .sort_values(by='airline_sentiment')

## grouped histogram
fig = px.histogram(tweets_sentiments_count, 
                   x='word_count',
                   opacity=0.75,
                   marginal="box",
                   color='airline_sentiment', histfunc="count")

fig.update_layout(xaxis = dict(tickmode = 'linear', tick0 = 0, dtick = 1),
                  legend=dict(
                        yanchor="bottom",
                        y=0.45,
                        xanchor="left",
                        x=0.75
                    ),
                  barmode='group', # ['stack', 'group', 'overlay', 'relative']
                  title='Word count distribution of sentences in the dataset per class',
                  bargap=0.2, # gap between bars of adjacent location coordinates
                  bargroupgap=0.1, # gap between bars of the same location coordinates
                  xaxis_title="sentence word count",
                  yaxis_title="frequency",
                  font_size=14)

fig.add_hline(y=33, line_dash="dot",
              annotation_text="Min sentences needed (33) - each class", 
              annotation_position="bottom right",
              row=1)
fig.write_image("word_count distribution per class.pdf")
fig.show()

## ungrouped histogram
fig = px.histogram(tweets_sentiments_count, 
                   x='word_count',
                   opacity=0.75,
                   marginal="box",
                   histfunc="count")
fig.add_hline(y=99, line_dash="dot",
              annotation_text="Min sentences needed (99) - Full sample", 
              annotation_position="bottom right",
              row=1)
fig.add_vline(x=12, line_dash="dash", row=1, line_color="#b1b1b1")
fig.add_vline(x=19, line_dash="dash", row=1, line_color="#b1b1b1")
fig.add_vline(x=23, line_dash="dash", row=1, line_color="#b1b1b1")
fig.add_annotation(x=12, y=1100, text="Q1 = 12", showarrow=False)
fig.add_annotation(x=19, y=1100, text="Q2 = 19", showarrow=False)
fig.add_annotation(x=23, y=1100, text="Q3 = 23", showarrow=False)

fig.update_layout(xaxis = dict(tickmode = 'linear', tick0 = 0, dtick = 1),
                  barmode='group', # ['stack', 'group', 'overlay', 'relative']
                  title='Word count distribution of sentences in the dataset',
                  bargap=0.2, # gap between bars of adjacent location coordinates
                  bargroupgap=0.1, # gap between bars of the same location coordinates
                  xaxis_title="sentence word count",
                  yaxis_title="frequency",
                  font_size=14)
fig.write_image("word_count distribution.pdf")
fig.show()

## Plot histogram for Char Count

In [52]:
import plotly.express as px
import pandas as pd

tweets = pd.read_csv('Tweets_dataset.csv')
tweets['char_count'] = tweets.text.apply(lambda t: len(t))

tweets_sentiments_count = tweets.loc[ : , ['airline_sentiment', 'char_count']] \
                                .sort_values(by='airline_sentiment')
max_char_count = int(tweets_sentiments_count.char_count.max())

## grouped histogram
fig = px.histogram(tweets_sentiments_count, 
                   x='char_count',
                   opacity=0.75,
                   nbins=max_char_count,
                   marginal="box",
                   color='airline_sentiment', histfunc="count")
fig.update_layout(xaxis = dict(tickmode = 'linear', tick0 = 0, dtick = 1),
                  barmode='group', # ['stack', 'group', 'overlay', 'relative']
                  title='Dataset sentence char count distribution per class',
                  bargap=0.2, # gap between bars of adjacent location coordinates
                  bargroupgap=0.1, # gap between bars of the same location coordinates
                  xaxis_title="sentence char count", yaxis_title="frequency")
fig.add_hline(y=33, line_dash="dot",
              annotation_text="Min sentences needed (33) - Each class", 
              annotation_position="bottom right",
              row=1)
fig.show()

## ungrouped histogram
fig = px.histogram(tweets_sentiments_count, 
                   x='char_count',
                   opacity=0.75,
                   nbins=max_char_count,
                   marginal="box",
                   histfunc="count")
fig.add_hline(y=99, line_dash="dot",
              annotation_text="Min sentences needed (99) - Full sample", 
              annotation_position="bottom right",
              row=1)
fig.update_layout(xaxis = dict(tickmode = 'linear', tick0 = 0, dtick = 1),
                  barmode='group', # ['stack', 'group', 'overlay', 'relative']
                  title='Dataset sentence char count distribution',
                  bargap=0.1, # gap between bars of adjacent location coordinates
                  bargroupgap=0, # gap between bars of the same location coordinates
                  xaxis_title="sentence char count", yaxis_title="frequency")

fig.show()

## <mark>Discovery</mark>: Selection by character count is not viable, since small character count sentences are in very small quantities

# Comparing the Word ranges approach (5-10, 10-15, 15-29) to Quantiles approach (12, 19 23)
## - Word ranges approach (5-10, 10-15, 15-29):

In [53]:
import plotly.express as px
import pandas as pd
import numpy as np
import plotly.graph_objects as go

tweets = pd.read_csv('Tweets_dataset.csv')
tweets['word_count'] = tweets.text.apply(lambda t: len(t.split()))
tweets['char_count'] = tweets.text.apply(lambda t: len(t))

tweets_sentiments_count = tweets.loc[ : , ['text','airline_sentiment', 'word_count', 'char_count']] \
                                .sort_values(by='airline_sentiment')
max_word_count = int(tweets_sentiments_count.word_count.max())

tweets_sentiments_wcount_1 = tweets_sentiments_count.query('word_count >= 5 and word_count <= 10')
tweets_sentiments_wcount_2 = tweets_sentiments_count.query('word_count >= 10 and word_count <= 15')
tweets_sentiments_wcount_3 = tweets_sentiments_count.query('word_count >= 15 and word_count <= 20')

print(f'5-10 len {len(tweets_sentiments_wcount_1)}, 10-15 len {len(tweets_sentiments_wcount_2)} 15-20 len {len(tweets_sentiments_wcount_3)}')

fig = go.Figure()

fig.add_trace(go.Histogram(
    x = tweets_sentiments_wcount_1.word_count,
    name='5-10 word_count', # name used in legend and hover labels
    nbinsx=max_word_count,
    marker_color='#EB89B5',
    opacity=0.75
))

fig.add_trace(go.Histogram(
    x = tweets_sentiments_wcount_2.word_count,
    name='10-15 word_count', # name used in legend and hover labels
    nbinsx=max_word_count,
    marker_color='#330C73',
    opacity=0.75
))

fig.add_trace(go.Histogram(
    x = tweets_sentiments_wcount_3.word_count,
    name='15-20 word_count', # name used in legend and hover labels
    nbinsx=max_word_count,
    marker_color='#77dd77',
    opacity=0.75,
))

fig.update_layout(barmode='group', # ['stack', 'group', 'overlay', 'relative']
                  title='Word count by word count group',
                  bargap=0, # gap between bars of adjacent location coordinates
                  bargroupgap=0, # gap between bars of the same location coordinates
                  xaxis = dict(tickmode = 'linear', tick0 = 0, dtick = 1),
                  xaxis_title="sentence word count", yaxis_title="frequency")


fig.show()

5-10 len 2179, 10-15 len 2899 15-20 len 3776


## - Quantiles approach (12, 19 23)

In [54]:
import plotly.express as px
import pandas as pd
import numpy as np
import plotly.graph_objects as go

# count, division = np.histogram(series)

tweets = pd.read_csv('Tweets_dataset.csv')
tweets['word_count'] = tweets.text.apply(lambda t: len(t.split()))
tweets['char_count'] = tweets.text.apply(lambda t: len(t))

tweets_sentiments_count = tweets.loc[ : , ['text','airline_sentiment', 'word_count', 'char_count']] \
                                .sort_values(by='airline_sentiment')
max_word_count = int(tweets_sentiments_count.word_count.max())

tweets_sentiments_wcount_1 = tweets_sentiments_count.query('word_count == 12')
tweets_sentiments_wcount_2 = tweets_sentiments_count.query('word_count == 19')
tweets_sentiments_wcount_3 = tweets_sentiments_count.query('word_count == 23')

print(f'Quantitites: 12 len {len(tweets_sentiments_wcount_1)}, 19 len {len(tweets_sentiments_wcount_2)} 23 len {len(tweets_sentiments_wcount_3)}')

fig = go.Figure()

fig.add_trace(go.Histogram(
    x = tweets_sentiments_wcount_1.word_count,
    name='12 word_count', # name used in legend and hover labels
    nbinsx=max_word_count,
    marker_color='#EB89B5',
    opacity=0.75
))

fig.add_trace(go.Histogram(
    x = tweets_sentiments_wcount_2.word_count,
    name='19 word_count', # name used in legend and hover labels
    nbinsx=max_word_count,
    marker_color='#330C73',
    opacity=0.75
))

fig.add_trace(go.Histogram(
    x = tweets_sentiments_wcount_3.word_count,
    name='23 word_count', # name used in legend and hover labels
    nbinsx=max_word_count,
    marker_color='#77dd77',
    opacity=0.75,
))

fig.update_layout(barmode='group', # ['stack', 'group', 'overlay', 'relative']
                  title='Word count selecting by quantiles',
                  bargap=0, # gap between bars of adjacent location coordinates
                  bargroupgap=0, # gap between bars of the same location coordinates
                  xaxis = dict(tickmode = 'linear', tick0 = 0, dtick = 1),
                  xaxis_title="sentence word count", yaxis_title="frequency")


fig.show()

Quantitites: 12 len 477, 19 len 668 23 len 1025


**Prior knowledge**: Word ranges approach has been alread tested and leaded to very small differences, so we're seeking for more distance groups

## <mark>Hypothesis</mark>: Word ranges approach has too similar groups. Quantiles approach can bring about stronger results.

# Statistical Tests

In [55]:
import scipy.stats as stats

def read_metrics(path: str):
    df = pd.read_json(path)
    df = df[['provider', 'noise_algorithm', 'noise_level', 'fmeasure']]

    providers = df['provider'].unique()
    noise_algorithms = df['noise_algorithm'].unique()

    return df, providers, noise_algorithms

def create_empty_row(noise_algorithms):
    schema = {'providers': []}
    for n in noise_algorithms:
        schema[n] = []
    return schema

def peform_test(x_values, y_values, alternative='greater'):
    # perform test. You can use 'two-sided', 'greater' or 'less' for one-sided test
    return stats.mannwhitneyu(x=x_values, y=y_values, alternative = 'greater').pvalue

def run_mannwhitneyu(df, providers, noise_algorithms):
    data = pd.DataFrame()
    results = {}

    table = pd.DataFrame(data=create_empty_row(noise_algorithms))

    for p1 in providers:
        for p2 in providers:
            new_row = {'providers':f'{p1.capitalize()} x {p2.capitalize()}'}
            for n in noise_algorithms:
                provider1_medians = df.query(f'provider == "{p1}" and noise_algorithm == "{n}"').fmeasure
                provider2_medians = df.query(f'provider == "{p2}" and noise_algorithm == "{n}"').fmeasure
                new_row[n] = peform_test(provider1_medians, provider2_medians)
            table = table.append(new_row, ignore_index=True)

    return table

## RQ1: Comparing providers and noises results

In [56]:
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats

df, providers, noise_algorithms = read_metrics("outputs/experiment1/size99_07-12-2022 09_34_29/results/metrics.json")

df = df.groupby(['provider', 'noise_level'])['fmeasure'] \
                .median() \
                .reset_index()
df = df.pivot(index='noise_level', columns='provider', values='fmeasure').reset_index()

# get summary statistics
df.agg(["count", "min", "max", "median", "mean", "skew"])

# # generate boxplot to check data spread
df.boxplot(column=['amazon', 'google', 'microsoft'], grid=False)

# perform two-sided test. You can use 'greater' or 'less' for one-sided test
amazon_google    = stats.mannwhitneyu(x=df['amazon'], y=df['google'], alternative = 'greater')
amazon_microsoft = stats.mannwhitneyu(x=df['amazon'], y=df['microsoft'], alternative = 'greater')
google_amazon    = stats.mannwhitneyu(x=df['google'], y=df['amazon'], alternative = 'greater')
google_microsoft = stats.mannwhitneyu(x=df['google'], y=df['microsoft'], alternative = 'greater')
microsoft_amazon = stats.mannwhitneyu(x=df['microsoft'], y=df['amazon'], alternative = 'greater')
microsoft_google = stats.mannwhitneyu(x=df['microsoft'], y=df['google'], alternative = 'greater')

print({
    'amazon_google':amazon_google.pvalue,
    'amazon_microsoft':amazon_microsoft.pvalue,
    'google_amazon':google_amazon.pvalue,
    'google_microsoft':google_microsoft.pvalue,
    'microsoft_amazon':microsoft_amazon.pvalue,
    'microsoft_google':microsoft_google.pvalue
})

{'amazon_google': 0.6612075210237622, 'amazon_microsoft': 0.8276478889965212, 'google_amazon': 0.3668649978481236, 'google_microsoft': 0.714624805970913, 'microsoft_amazon': 0.19233653136775436, 'microsoft_google': 0.3115881119410587}


In [57]:
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import logging

df, providers, noise_algorithms = read_metrics("outputs/experiment1/size99_07-12-2022 09_34_29/results/metrics.json")

results = run_mannwhitneyu(df, providers, noise_algorithms)

results.to_excel("./statistical_tests/mannwhitneyu_noise.xlsx", index=False)

results.head()


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated a

Unnamed: 0,providers,Keyboard,OCR,RandomCharReplace,CharSwap,WordSwap,WordSplit,Antonym,Synonym,Spelling,TfIdfWord,WordEmbeddings,ContextualWordEmbs,no_noise
0,Google x Google,0.515132,0.515132,0.515132,0.515132,0.515132,0.515178,0.515132,0.515132,0.515132,0.515132,0.515132,0.515132,1.0
1,Google x Microsoft,0.026052,0.010519,0.106147,0.136518,0.999835,0.338506,0.999147,0.973049,0.739739,0.425053,0.994335,0.763662,1.0
2,Google x Amazon,0.026906,0.008629,0.060262,0.106147,0.999343,0.28523,0.99915,0.919014,0.311588,0.236338,0.763662,0.366865,1.0
3,Microsoft x Google,0.978228,0.991413,0.907062,0.879339,0.00022,0.688739,0.001101,0.032011,0.285375,0.604332,0.00701,0.260261,0.5
4,Microsoft x Microsoft,0.516141,0.515224,0.515132,0.515132,0.515132,0.515319,0.515132,0.515132,0.515132,0.515132,0.515132,0.515132,1.0


## RQ3 different sentence sizes

In [58]:
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import logging

# ######### sentence size = 12
df, providers, noise_algorithms = read_metrics("outputs/experiment2/size100_12-21-2022 20_53_00/[12-12]/results/metrics.json")
results = run_mannwhitneyu(df, providers, noise_algorithms)
results.to_excel("statistical_tests/rq3_12_mannwhitneyu_noise.xlsx", index=False)

# ######### sentence size = 19
df, providers, noise_algorithms = read_metrics("outputs/experiment2/size100_12-21-2022 20_53_00/[19-19]/results/metrics.json")
results = run_mannwhitneyu(df, providers, noise_algorithms)
results.to_excel("statistical_tests/rq3_19_mannwhitneyu_noise.xlsx", index=False)

####### sentence size = 23
df, providers, noise_algorithms = read_metrics("outputs/experiment2/size100_12-21-2022 20_53_00/[23-23]/results/metrics.json")
results = run_mannwhitneyu(df, providers, noise_algorithms)
results.to_excel("statistical_tests/rq3_23_mannwhitneyu_noise.xlsx", index=False)

results.head()


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.


The frame.append method is deprecated a

Unnamed: 0,providers,Keyboard,OCR,RandomCharReplace,CharSwap,WordSwap,WordSplit,Antonym,Synonym,Spelling,TfIdfWord,WordEmbeddings,ContextualWordEmbs,no_noise
0,Google x Google,0.515132,0.515132,0.515132,0.515132,0.515132,0.515132,0.515132,0.515132,0.515132,0.515132,0.515132,0.515132,1.0
1,Google x Amazon,0.031961,0.008629,0.031811,0.153745,0.999933,0.106147,0.984395,0.998195,0.688412,0.285375,0.863482,0.827648,1.0
2,Google x Microsoft,0.018459,0.005665,0.021772,0.052055,0.010567,0.018818,0.000384,0.015605,0.454861,0.236338,0.172352,0.425053,0.5
3,Amazon x Google,0.973094,0.99299,0.973229,0.863482,9.1e-05,0.907062,0.018818,0.002293,0.338792,0.739739,0.153745,0.192337,0.5
4,Amazon x Amazon,0.515178,0.515132,0.515319,0.515132,0.515132,0.515132,0.515132,0.515132,0.515132,0.515132,0.515132,0.515132,1.0


In [59]:
from noise_insertion.percent_insertion import noises
import noise_insertion.utils as utils

text1 = "the white fox is a furry guy dressed like boy"
text = "the white f"

# result = noises.Keyboard([text1], 0.5)
# result = noises.OCR([text1], 0.6)
# result = noises.RandomCharReplace([text1], 0.50)
# result = noises.CharSwap([text1], 0.5)

# result = noises.Antonym([text1], 0.1)
result = noises.Synonym([text1], 0.2)
# result = noises.WordEmbeddings([text1], 0.1)
# result = noises.ContextualWordEmbs([text1], 0.1)
# result = noises.WordSwap([text1], 0.1)
# result = noises.Spelling([text1], 0.1)
# result = noises.WordSplit([text1], 0.1)
# result = noises.TfIdfWord([text1], 0.1)
print(text1)
print(result[0])
utils.return_sentence_similarity(text1, result[0])

the white fox is a furry guy dressed like boy
the clean fox is a furred guy dressed like boy
equals:19, size:46, diference:27


0.41304347826086957