## Fairness Metrics on words

In [1]:
# Importing the required libraries
import numpy as np
import pandas as pd
from wordcloud import WordCloud

from sklearn.model_selection import train_test_split
import re
import pickle


In [2]:
# loading dataset
# importing the dataset
first_data = pd.read_csv('../data/fake_and_real_data.csv')
first_data.head()
second_data = pd.read_csv('../data/WELFake_Dataset.csv')

In [3]:
# Checking for missing values

second_data.isnull().sum()

# Dropping the missing values
second_data = second_data.dropna()
# Checking for duplicates

second_data.duplicated().sum()

# Drop unnamed column
second_data.drop('Unnamed: 0', axis=1, inplace=True)

In [4]:
# concatinating title and text
second_data['text'] = second_data['title'] + ' ' + second_data['text']
# Dropping the title column
second_data = second_data.drop(['title'], axis=1)



In [5]:
first_data.drop_duplicates(inplace=True)


first_data['label_t'] = 0


first_data.head

<bound method NDFrame.head of                                                    Text label  label_t
0      Top Trump Surrogate BRUTALLY Stabs Him In The...  Fake        0
1     U.S. conservative leader optimistic of common ...  Real        0
2     Trump proposes U.S. tax overhaul, stirs concer...  Real        0
3      Court Forces Ohio To Allow Millions Of Illega...  Fake        0
4     Democrats say Trump agrees to work on immigrat...  Real        0
...                                                 ...   ...      ...
9895   Wikileaks Admits To Screwing Up IMMENSELY Wit...  Fake        0
9896  Trump consults Republican senators on Fed chie...  Real        0
9897  Trump lawyers say judge lacks jurisdiction for...  Real        0
9898   WATCH: Right-Wing Pastor Falsely Credits Trum...  Fake        0
9899   Sean Spicer HILARIOUSLY Branded As Chickensh*...  Fake        0

[9865 rows x 3 columns]>

In [6]:
# Preprocessing the data
first_data['label_t'] = (first_data['label'] != 'Fake').astype(int)
first_data = first_data.drop(['label'], axis=1)
first_data = first_data.rename(columns={'label_t': 'label'})
first_data = first_data.rename(columns={'Text': 'text'})
first_data.head()


Unnamed: 0,text,label
0,Top Trump Surrogate BRUTALLY Stabs Him In The...,0
1,U.S. conservative leader optimistic of common ...,1
2,"Trump proposes U.S. tax overhaul, stirs concer...",1
3,Court Forces Ohio To Allow Millions Of Illega...,0
4,Democrats say Trump agrees to work on immigrat...,1


In [7]:
# Dropping the unnecessary columns
first_data.head()
# converting the text to lowercase
def preprocess(text):
    text = re.sub(r'[^\w\s\']', ' ', text)
    text = re.sub(r' +', ' ', text)
    return text.strip().lower()

first_data['text'] = first_data['text'].apply(preprocess)
second_data['text'] = second_data['text'].apply(preprocess)






In [8]:
first_data.head()
second_data.head()

concat_data = pd.concat([first_data, second_data], axis=0)
concat_data.head()


Unnamed: 0,text,label
0,top trump surrogate brutally stabs him in the ...,0
1,u s conservative leader optimistic of common g...,1
2,trump proposes u s tax overhaul stirs concerns...,1
3,court forces ohio to allow millions of illegal...,0
4,democrats say trump agrees to work on immigrat...,1


In [9]:
# is trump inclouded 

concat_data['does_contain_trump'] = concat_data['text'].apply(lambda x: 'trump' in x)

In [10]:
concat_data.head()

Unnamed: 0,text,label,does_contain_trump
0,top trump surrogate brutally stabs him in the ...,0,True
1,u s conservative leader optimistic of common g...,1,True
2,trump proposes u s tax overhaul stirs concerns...,1,True
3,court forces ohio to allow millions of illegal...,0,True
4,democrats say trump agrees to work on immigrat...,1,True


In [11]:
concat_data['does_contain_trump'].value_counts()



does_contain_trump
True     41635
False    39767
Name: count, dtype: int64

In [13]:
# Simply import it from sklearn
from sklearn.model_selection import train_test_split

dataset = concat_data 

# Separating features from target feature
features = dataset.columns.tolist()
features.remove('label')
target = 'label'
X = dataset[features]
y = dataset[target]

# Define four sets and apply the function
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2, # 0.2 indicates a test set size of 20%
                                                    random_state=42)

In [15]:
## applying fairness tests
from sklearn.feature_extraction.text import TfidfVectorizer 
# Import the classifier and the metrics from sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, recall_score, f1_score

dt_clf = DecisionTreeClassifier(random_state=42)
vect = TfidfVectorizer(stop_words='english', max_df=0.7)
xv_train = vect.fit_transform(X_train['text'])
xv_test = vect.transform(X_test['text'])


# The fit function will do the trick
dt_clf.fit(xv_train, y_train)

# After the training phase, the model will be tested by predicting the values on the test set
dt_predictions = dt_clf.predict(xv_test)

dt_accuracy = accuracy_score(y_test, dt_predictions)
dt_recall = recall_score(y_test, dt_predictions)
dt_f1_score = f1_score(y_test, dt_predictions)
print(f"Decision Tree Accuracy: {dt_accuracy}")
print(f"Decision Tree Recall: {dt_recall}")
print(f"Decision Tree F1 Score: {dt_f1_score}")

Decision Tree Accuracy: 0.7105214667403722
Decision Tree Recall: 0.7023909647963474
Decision Tree F1 Score: 0.712709539774459


AttributeError: 'csr_matrix' object has no attribute 'head'

In [24]:

from aif360.datasets import BinaryLabelDataset
from aif360.datasets import StandardDataset
from aif360.metrics import ClassificationMetric


# We want to check the fairness level regarding the protected attribute "sex"
trump_features = ['does_contain_trump']


dataset = xv_test.toarray()
dataset['label'] = y_test  # and join the target feature with the others

predictions = dataset.copy(deep=True)  # we do the same task
# but this time the target feature is made by the predictions of our model
predictions['label'] = dt_predictions

# In this way, we have two datasets. One (dataset) is the original test set containing the original values of features,
# the other (predictions) contains the original values except for the target one, that is now made of model's predictions

# These will be used by AIF to compare the classifications of the model with the original values to
# understand if the model's answers create favouritism toward the privileged attribute


# This is the object made of the original dataset
aif_sex_dataset = BinaryLabelDataset(  # Base class for all structured datasets with binary labels.
    df=dataset,
    # This means that a prediction is biased toward the privileged attribute if its value is 1 (True)
    favorable_label=1,
    unfavorable_label=0,
    label_names=[target],
    protected_attribute_names=trump_features,
    # here we tell AIF that we want to check for predictions
    privileged_protected_attributes=['does_contain_trump'],
    # that somehow privilege the attribute "sex_Male"
)

# We do the same thing but with the predictions dataset
aif_sex_pred = BinaryLabelDataset(
    df=predictions,
    favorable_label=1,
    unfavorable_label=0,
    label_names=[target],
    protected_attribute_names=trump_features,
    privileged_protected_attributes=['does_contain_trump'],
)

trump_privileged_group = [{'does_contain_trump': 1}]
trump_unprivileged_group = [{'does_contain_trump': 0}]    

# We provide the ClassificationMetric object with all the information needed:
# aif_sex_dataset - The original test set
# aif_sex_pred - A dataset containing the predictions of the model
# sex_privileged_group - The privileged group
# sex_unprivileged_group - The unprivileged group
fairness_metrics = ClassificationMetric(dataset=aif_sex_dataset,
                                        classified_dataset=aif_sex_pred,
                                        unprivileged_groups=trump_unprivileged_group,
                                        privileged_groups=trump_unprivileged_group)

# Values less than 0 indicate that privileged group has higher
# proportion of predicted positive outcomes than unprivileged group.
# Value higher than 0 indicates that unprivileged group has higher proportion
# of predicted positive outcomes than privileged group.
SPD = round(fairness_metrics.statistical_parity_difference(), 3)

# Measures the deviation from the equality of opportunity, which means that the same
# proportion of each population receives the favorable outcome. This measure must be equal to 0 to be fair.
EOD = round(fairness_metrics.equal_opportunity_difference(), 3)

# Average of difference in False Positive Rate and True Positive Rate for unprivileged and privileged groups
# A value of 0 indicates equality of odds, which means that samples in both the privileged and unprivileged
# groups have the same probability of being classified positively.
AOD = round(fairness_metrics.average_odds_difference(), 3)

print(f"Statistical Parity Difference (SPD): {SPD}")
print(f"Equal Opportunity Difference (EOD): {EOD}")
print(f"Average Odds Difference: {AOD}")

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices