<a href="https://colab.research.google.com/github/mvdheram/Stereotypical-Social-bias-detection-/blob/Machine-learning-classifiers/Feature_based_machine_learning_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Naive- Bayees Model

# SVM with selected features 

* Reference : 
    1. Linguistic models for detecting bias https://aclanthology.org/P13-1162.pdf
    2. Automatically Neutralizing Subjective Bias in Text https://ojs.aaai.org/index.php/AAAI/article/view/5385 

Features :

* Bias lexicons with count
* Sentiment 
* Generic words NNS and NNPS
* Toxicity 
* Generic features 
* A, AE names

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/Trained models/mult_label_dataset/ohe_multilabel.csv', index_col = 0)

In [3]:
df

Unnamed: 0,Ethnicity,gender,profession,religion,Anti-stereotype,stereotype,unrelated,sentence
0,1,0,0,0,0,1,0,Many people live in Ethiopia. The people are v...
1,1,0,0,0,0,1,0,My professor is a hispanic man He came here il...
2,0,1,0,0,0,1,0,The schoolgirl is walking down the street. She...
3,1,0,0,0,0,1,0,We'll likely fly over Cape Verde on our flight...
4,1,0,0,0,0,1,0,I know many people who are Russian. They love ...
...,...,...,...,...,...,...,...,...
16539,0,0,0,0,0,0,1,Arab people are associated with hi.
16540,0,0,0,0,0,0,1,Car was a new chemist in our lab.
16541,0,0,0,0,0,0,1,"The gentlemen stood around the fire, carefully..."
16542,0,0,0,0,0,0,1,The common ukrainian is a supporter of a floun...


## Pre-processing

In [135]:
# Tokenization using spacy
import spacy

nlp = spacy.load("en_core_web_sm")

def tokenize(text):  

  doc = nlp(text)
  tokens = [token.text.lower() for token in doc]
  return tokens

def lemmatization(text):

  doc = nlp(text)
  lemmas = [token.lemma_.lower() for token in doc]
  return lemmas

# Remove tokens that are not alphabetic - depends on particular application 
def clean_text(text):

  lemmas = lemmatization(text)
  a_lemmas = [lemma for  lemma in lemmas
              if lemma.isalpha()]
  
  return (' '.join(a_lemmas))


# Remove stopwords - Update according to stereotypical bias 
def remove_stopwords(text):

  stopwords = spacy.lang.en.stop_words.STOP_WORDS
  
  lemmas = lemmatization(text)
  a_lemmas = [lemma for  lemma in lemmas
              if lemma.isalpha() and lemma not in stopwords]
  
  return (' '.join(a_lemmas))


# Parts of speech tagger 
def pos_tags(text):
  
  doc = nlp(text)
  pos = [(token.text, token.tag_) for token in doc]
  return (pos)


# Named entity recognition 
def ner_tags(text):
  
  doc = nlp(text)
  ner = [(ent.text, ent.label_) for ent in doc.ents]
  return (ner)

In [None]:
stereo = df.copy()
stereo['clean_text'] = stereo['sentence'].apply(clean_text)

In [None]:
stereo['pos_tags'] = stereo['clean_text'].apply(pos_tags) 

In [None]:
stereo['lemmatized_withStopwords']= stereo['clean_text'].apply(remove_stopwords)

In [None]:
stereo['lemma_pos'] = stereo['lemmatized_withStopwords'].apply(pos_tags)

In [None]:
stereo['Ner_tags'] = stereo['clean_text'].apply(ner_tags)

In [None]:
stereo

Unnamed: 0,Ethnicity,gender,profession,religion,Anti-stereotype,stereotype,unrelated,sentence,clean_text,lemmatized_withStopwords,num_chars,num_words,avg_word_length,flesch_score,subjectivity_score,neg,neu,pos,assertive_verbs_count,factive_verbs_count,hedges_count,implicative_verbs_count,report_verbs_count,bias_word_list_01_2018_count,subjectivityClues_lexicon_count,pos_tags,lemma_pos,Ner_tags
0,1,0,0,0,0,1,0,Many people live in Ethiopia. The people are v...,many people live in ethiopia the people be ver...,people live ethiopia people thin good distance...,84,15,4.666667,89.24,0.0,0.000,0.816,0.184,0,0,1,0,0,4,5,"[(many, JJ), (people, NNS), (live, VBP), (in, ...","[(people, NNS), (live, VBP), (ethiopia, NNP), ...","[(ethiopia, GPE)]"
1,1,0,0,0,0,1,0,My professor is a hispanic man He came here il...,professor be a hispanic man come here illegally,professor hispanic man come illegally,53,10,4.400000,52.87,0.0,0.000,1.000,0.000,0,0,0,0,0,2,12,"[(professor, NNP), (be, VB), (a, DT), (hispani...","[(professor, NNP), (hispanic, NNP), (man, NN),...","[(hispanic, NORP)]"
2,0,1,0,0,0,1,0,The schoolgirl is walking down the street. She...,the schoolgirl be walk down the street be a ta...,schoolgirl walk street target child predator,79,14,4.714286,89.75,0.0,0.000,1.000,0.000,0,0,1,1,0,2,3,"[(the, DT), (schoolgirl, NN), (be, VBP), (walk...","[(schoolgirl, NNP), (walk, NNP), (street, NNP)...",[]
3,1,0,0,0,0,1,0,We'll likely fly over Cape Verde on our flight...,will likely fly over cape verde on flight tomo...,likely fly cape verde flight tomorrow bet peop...,106,20,4.350000,86.71,0.0,0.105,0.759,0.135,0,0,2,0,0,4,9,"[(will, MD), (likely, RB), (fly, VB), (over, I...","[(likely, RB), (fly, VB), (cape, NN), (verde, ...","[(tomorrow, DATE)]"
4,1,0,0,0,0,1,0,I know many people who are Russian. They love ...,know many people who be russian love vodka,know people russian love vodka,58,11,4.363636,91.27,0.0,0.000,0.690,0.310,0,1,0,0,0,2,3,"[(know, VBP), (many, JJ), (people, NNS), (who,...","[(know, VBP), (people, NNS), (russian, JJ), (l...","[(russian, NORP)]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16539,0,0,0,0,0,0,1,Arab people are associated with hi.,arab people be associate with hi,arab people associate hi,35,6,5.000000,48.47,0.0,0.000,1.000,0.000,0,0,0,0,0,2,1,"[(arab, JJ), (people, NNS), (be, VB), (associa...","[(arab, JJ), (people, NNS), (associate, VBP), ...","[(arab, NORP)]"
16540,0,0,0,0,0,0,1,Car was a new chemist in our lab.,car be a new chemist in lab,car new chemist lab,33,8,3.250000,114.12,0.0,0.000,1.000,0.000,0,0,0,0,0,2,0,"[(car, NN), (be, VB), (a, DT), (new, JJ), (che...","[(car, NN), (new, JJ), (chemist, NN), (lab, NN)]",[]
16541,0,0,0,0,0,0,1,"The gentlemen stood around the fire, carefully...",the gentleman stand around the fire carefully ...,gentleman stand fire carefully pass boxing,73,11,5.727273,68.77,0.0,0.186,0.698,0.116,0,1,1,2,0,7,8,"[(the, DT), (gentleman, NNP), (stand, VBP), (a...","[(gentleman, NNP), (stand, VB), (fire, NN), (c...",[]
16542,0,0,0,0,0,0,1,The common ukrainian is a supporter of a floun...,the common ukrainian be a supporter of a floun...,common ukrainian supporter flounder run govern...,65,11,5.000000,60.31,0.0,0.000,0.826,0.174,0,0,2,0,1,4,9,"[(the, DT), (common, JJ), (ukrainian, NNP), (b...","[(common, JJ), (ukrainian, JJ), (supporter, NN...","[(ukrainian, NORP)]"


In [None]:
stereo.to_csv('stereo_features.csv')

## Feature engineering

Scoring features :


* Readability tests :
  https://pypi.org/project/textatistic/
  * Determine readability of english passage
  * Scale ranging from primary school up to college graduate level
  * A mathematical formula utilizing word, syllabel and sentence count
  * Used in fake news and opinion spam detection 

  Types :

  1. Flesch reading ease : 

    * The higher the score, the better the readability. 
    * score of 0-30 implies only college graduates can understand while 90-100 implies that a 5th grade student can understand.
    
    Two factors :

      1. Greater the average sentence length, harder the text to read
      2. Greater the average number of syllables, harder the text to read

* Avg_tf_idf
* Max_tf_idf
* Number of characters 
* Word count
* Average word length
* Vadar Sentiment analysis
* Text subjectivity (Text blob)
* Toxicity analysis (detoxify)



In [None]:
import pandas as pd

stereo = pd.read_csv('/content/drive/MyDrive/Trained models/mult_label_dataset/stereo_features_f.csv',index_col=0)

In [None]:
stereo

Unnamed: 0,Ethnicity,gender,profession,religion,Anti-stereotype,stereotype,unrelated,sentence,clean_text,pos,lemmatized_withStopwords,lemma_pos,Ner_tags
0,1,0,0,0,0,1,0,Many people live in Ethiopia. The people are v...,many people live in ethiopia the people be ver...,"[('many', 'JJ'), ('people', 'NNS'), ('live', '...",people live ethiopia people thin good distance...,"[('people', 'NNS'), ('live', 'VBP'), ('ethiopi...","[('ethiopia', 'GPE')]"
1,1,0,0,0,0,1,0,My professor is a hispanic man He came here il...,professor be a hispanic man come here illegally,"[('professor', 'NNP'), ('be', 'VB'), ('a', 'DT...",professor hispanic man come illegally,"[('professor', 'NNP'), ('hispanic', 'NNP'), ('...","[('hispanic', 'NORP')]"
2,0,1,0,0,0,1,0,The schoolgirl is walking down the street. She...,the schoolgirl be walk down the street be a ta...,"[('the', 'DT'), ('schoolgirl', 'NN'), ('be', '...",schoolgirl walk street target child predator,"[('schoolgirl', 'NNP'), ('walk', 'NNP'), ('str...",[]
3,1,0,0,0,0,1,0,We'll likely fly over Cape Verde on our flight...,will likely fly over cape verde on flight tomo...,"[('will', 'MD'), ('likely', 'RB'), ('fly', 'VB...",likely fly cape verde flight tomorrow bet peop...,"[('likely', 'RB'), ('fly', 'VB'), ('cape', 'NN...","[('tomorrow', 'DATE')]"
4,1,0,0,0,0,1,0,I know many people who are Russian. They love ...,know many people who be russian love vodka,"[('know', 'VBP'), ('many', 'JJ'), ('people', '...",know people russian love vodka,"[('know', 'VBP'), ('people', 'NNS'), ('russian...","[('russian', 'NORP')]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16539,0,0,0,0,0,0,1,Arab people are associated with hi.,arab people be associate with hi,"[('arab', 'JJ'), ('people', 'NNS'), ('be', 'VB...",arab people associate hi,"[('arab', 'JJ'), ('people', 'NNS'), ('associat...","[('arab', 'NORP')]"
16540,0,0,0,0,0,0,1,Car was a new chemist in our lab.,car be a new chemist in lab,"[('car', 'NN'), ('be', 'VB'), ('a', 'DT'), ('n...",car new chemist lab,"[('car', 'NN'), ('new', 'JJ'), ('chemist', 'NN...",[]
16541,0,0,0,0,0,0,1,"The gentlemen stood around the fire, carefully...",the gentleman stand around the fire carefully ...,"[('the', 'DT'), ('gentleman', 'NNP'), ('stand'...",gentleman stand fire carefully pass boxing,"[('gentleman', 'NNP'), ('stand', 'VB'), ('fire...",[]
16542,0,0,0,0,0,0,1,The common ukrainian is a supporter of a floun...,the common ukrainian be a supporter of a floun...,"[('the', 'DT'), ('common', 'JJ'), ('ukrainian'...",common ukrainian supporter flounder run govern...,"[('common', 'JJ'), ('ukrainian', 'JJ'), ('supp...","[('ukrainian', 'NORP')]"


In [None]:
scoring_features = stereo.copy()

In [None]:
scoring_features.drop(['pos','lemma_pos',	'Ner_tags'],axis=1, inplace= True)

In [None]:
 # Number of characters
 scoring_features['num_chars']  = scoring_features['sentence'].apply(len)

In [None]:
# Number of words
def word_count(string):
  # split the string into words
  words = string.split()

  # Return length of words list
  return len(words)

scoring_features['num_words'] = scoring_features['sentence'].apply(word_count)

In [None]:
# Average word length
def avg_word_length(x):

  # Split the string into words
  words = x.split()

  # Compute length of each word and store in a seperate list
  word_lengths = [len(word) for word in words]

  # Compute average word length 
  try:
    avg_word_length = sum(word_lengths)/len(words)
  except ZeroDivisionError:
    avg_word_length = 0

  return (avg_word_length)

scoring_features['avg_word_length'] = scoring_features['sentence'].apply(avg_word_length) 

In [None]:
scoring_features.columns

Index(['Ethnicity', 'gender', 'profession', 'religion', 'Anti-stereotype',
       'stereotype', 'unrelated', 'sentence', 'clean_text',
       'lemmatized_withStopwords', 'num_chars', 'num_words',
       'avg_word_length'],
      dtype='object')

In [None]:
pip install textstat



In [None]:
# Readability tests using textatistic library 
# Import the textatistic class
import textstat
import math

def readability_scores(text):
  # if text.endswith(".") == False:
  #   text = text+"."
  readability_score = textstat.flesch_reading_ease(text)

  # Generate scores
  return readability_score

In [None]:
try:
  scoring_features['flesch_score'] = scoring_features['sentence'].apply(readability_scores)
except ZeroDivisionError:
  scoring_features['flesch_score'] = 0

In [None]:
pip install -U textblob



In [None]:
from textblob import TextBlob

def get_subjectivity(text):
    try:
        textblob = TextBlob(unicode(text, 'utf-8'))
        subj = textblob.sentiment.subjectivity
    except:
        subj = 0.0
    return subj

In [None]:
scoring_features['subjectivity_score'] = scoring_features['sentence'].apply(get_subjectivity)

Vectorization :

* n_grams
* tf_idf 

In [215]:
from sklearn.feature_extraction.text import CountVectorizer

# Building n-gram models - capture context
# Range = (2,2) - bi-grams, (1,3) - unigram, bigram, trigram
def n_grams(range, corpus):
  # Bag of words feature - docxterm matrix 
  vectorizer = CountVectorizer(ngram_range = range)
  corpus = corpus.values.astype('U')
  bow_matrix = vectorizer.fit_transform(corpus)
  cv_df = pd.DataFrame(bow_matrix.toarray(), columns = vectorizer.get_feature_names()).add_prefix('Counts_')
  # corpus = pd.concat([corpus,cv_df],axis = 1, sort = False)
  return cv_df

In [216]:
# tf-idf  - higher the weight more the importance 
# Used for train set
from sklearn.feature_extraction.text import TfidfVectorizer

def tf_idf(corpus):
  vectorizer = TfidfVectorizer()
  vectorizer = TfidfVectorizer(max_features = 10000)
  corpus = corpus.values.astype('U')
  tfidf_matrix = vectorizer.fit_transform(corpus)
  tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns = vectorizer.get_feature_names()).add_prefix('tfIdf_')
  # corpus = pd.concat([corpus,tfidf_df],axis = 1, sort = False)
  return tfidf_df

In [217]:
# Inspect the different words being values after BOW and tfidf transformation 
def examine_row(corpus,row_n):
  examine_row = corpus.iloc[row_n]
  print(examine_row.sort_values(ascending= False).head())
  total = corpus.sum()
  print("Total sum of the counts per word \n",total.head()) # Total sum of the counts per word
  # print("Sums sorted: ",total.sort_values(ascending= False).head())

In [218]:
c = tf_idf(scoring_features['clean_text'])

Vadar sentiment analysis

In [None]:
pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l[K     |██▋                             | 10 kB 19.8 MB/s eta 0:00:01[K     |█████▏                          | 20 kB 25.1 MB/s eta 0:00:01[K     |███████▉                        | 30 kB 23.3 MB/s eta 0:00:01[K     |██████████▍                     | 40 kB 20.8 MB/s eta 0:00:01[K     |█████████████                   | 51 kB 6.4 MB/s eta 0:00:01[K     |███████████████▋                | 61 kB 7.3 MB/s eta 0:00:01[K     |██████████████████▏             | 71 kB 7.8 MB/s eta 0:00:01[K     |████████████████████▉           | 81 kB 8.7 MB/s eta 0:00:01[K     |███████████████████████▍        | 92 kB 8.5 MB/s eta 0:00:01[K     |██████████████████████████      | 102 kB 6.7 MB/s eta 0:00:01[K     |████████████████████████████▋   | 112 kB 6.7 MB/s eta 0:00:01[K     |███████████████████████████████▏| 122 kB 6.7 MB/s eta 0:00:01[K     |████████████████████████████████| 125 kB 6.7 M

In [None]:
# Sentiment analysis 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyser = SentimentIntensityAnalyzer()

def vader_sentiment(text):
  score = analyser.polarity_scores(text)
  return score

senti = scoring_features['sentence'].apply(vader_sentiment) 
scoring_features = pd.concat([scoring_features,(pd.DataFrame.from_dict(dict(senti).values()))],axis = 1, sort = False)
# scoring_features.head()

In [None]:
pip install detoxify

Collecting detoxify
  Downloading detoxify-0.2.2-py3-none-any.whl (11 kB)
Collecting transformers>=3.2.0
  Downloading transformers-4.9.2-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 7.4 MB/s 
[?25hCollecting sentencepiece>=0.1.94
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 52.9 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 37.4 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 71.6 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K

In [None]:
# Toxicity identification 
from detoxify import Detoxify

def toxicity(text):
  results = Detoxify('original').predict(text)
  return math.floor(results['toxicity']*100)

scoring_features['toxicity'] = scoring_features['sentence'].apply(toxicity) 

Count based features :

* Lexicons - Counts
  * Hedge in context  - two words around W is a hedge (Hyland, 2005  (e.g., apparently).
  * Factive verb  - w is in Hooper’s (1975) list of factives (e.g., realize).
  * Factive verb in context One/two word(s) around w is a factive (Hooper, 1975)
  * Assertive verb
  * Assertive verb in context 
  * Assertive verb 
  * Implicative verb in context
  * Report verb
  * Entailment (Not found)
  * Entailment in context (Not found)
  * Strong subjective (Used textblob subjectivity score)
  * Weak subjective ((Used textblob subjectivity score)
  * Positive word (Vadar sentiment score)
  * Positive word in context (Vadar sentiment score)
  * Negative word (Vadar sentiment score)
  * Negative word in context (Vadar sentiment score)
  * Grammatical relation - {root,subj,...}
  * Bias lexicon
* Social category target words used in dataset( Characteristic words of each bias type ; e.g. Racial, gender, ..) and scoring_features_pos_Ner
* Characteristic stereotypical words 
* POS :
  * POS(word) : POS of word w 
  * POS(word) - 1 :  POS of one word before w
  * POS(word) + 1  : POS of one word after w


Lexicons

In [None]:
import json 

f = open('/content/Subjectivity_lexicon.json')

lexicons = json.load(f)

In [None]:
for keys, value in lexicons.items():
  print(keys,'->',len(value.split('\n')))

assertive_verbs.txt -> 66
bias_lexicon.txt -> 655
bias_word_list_01_2018.txt -> 9742
factive_verbs.txt -> 27
hedges_hyland2005.txt -> 100
implicative_verbs.txt -> 32
report_verbs.txt -> 181
subjectivityClues_lexicon.txt -> 8223


In [None]:
keys = lexicons.keys()

In [None]:
def count_lexicon(text):
  count = 0
  try:
    for token in lexicon:
      if token in text:
        count +=1
      else:
        continue
  except :
    pass
  return count

Assertive verbs

In [None]:
lexicon = set(tokenize(lexicons['assertive_verbs.txt']))

In [None]:
scoring_features['assertive_verbs_count'] = scoring_features['lemmatized_withStopwords'].apply(count_lexicon)

In [None]:
len(scoring_features[scoring_features['assertive_verbs_count'] != 0])

1520

Factive verbs

In [None]:
lexicon = set(tokenize(lexicons['factive_verbs.txt']))

In [None]:
scoring_features['factive_verbs_count'] = scoring_features['lemmatized_withStopwords'].apply(count_lexicon)

In [None]:
len(scoring_features[scoring_features['factive_verbs_count'] != 0])

1768

Hedges

In [None]:
lexicon = set(tokenize(lexicons['hedges_hyland2005.txt']))

In [None]:
scoring_features['hedges_count'] = scoring_features['lemmatized_withStopwords'].apply(count_lexicon)

In [None]:
len(scoring_features[scoring_features['hedges_count'] != 0])

9331

Implicative_verbs

In [None]:
lexicon = set(tokenize(lexicons['implicative_verbs.txt']))

In [None]:
scoring_features['implicative_verbs_count'] = scoring_features['lemmatized_withStopwords'].apply(count_lexicon)

In [None]:
len(scoring_features[scoring_features['implicative_verbs_count'] != 0])

1732

Report_verbs

In [None]:
lexicon = set(tokenize(lexicons['report_verbs.txt']))

In [None]:
scoring_features['report_verbs_count'] = scoring_features['lemmatized_withStopwords'].apply(count_lexicon)

In [None]:
len(scoring_features[scoring_features['report_verbs_count'] != 0])

3884

Bias_word_list_01_2018

In [None]:
lexicon = set(tokenize(lexicons['bias_word_list_01_2018.txt']))

In [None]:
scoring_features['bias_word_list_01_2018_count'] = scoring_features['lemmatized_withStopwords'].apply(count_lexicon)

In [None]:
len(scoring_features[scoring_features['bias_word_list_01_2018_count'] != 0])

15043

SubjectivityClues_lexicon

In [None]:
lexicon = set(tokenize(lexicons['subjectivityClues_lexicon.txt']))

In [None]:
scoring_features['subjectivityClues_lexicon_count'] = scoring_features['lemmatized_withStopwords'].apply(count_lexicon)

In [None]:
len(scoring_features[scoring_features['subjectivityClues_lexicon_count'] != 0])

15652

In [None]:
scoring_features.to_csv("scoring_features.csv")

POS tags Features

In [56]:
scoring_features = pd. read_csv("/content/drive/MyDrive/Trained models/mult_label_dataset/features_with_pos.csv", index_col = 0)

In [57]:
scoring_features.columns

Index(['Ethnicity', 'gender', 'profession', 'religion', 'Anti-stereotype',
       'stereotype', 'unrelated', 'sentence', 'clean_text',
       'lemmatized_withStopwords', 'num_chars', 'num_words', 'avg_word_length',
       'flesch_score', 'subjectivity_score', 'neg', 'neu', 'pos',
       'assertive_verbs_count', 'factive_verbs_count', 'hedges_count',
       'implicative_verbs_count', 'report_verbs_count',
       'bias_word_list_01_2018_count', 'subjectivityClues_lexicon_count',
       'pos_tags', 'lemma_pos', 'Ner_tags'],
      dtype='object')

In [60]:
import ast

for word, tag in ast.literal_eval(scoring_features.pos_tags[0]):
  print(word , "->", tag)

many -> JJ
people -> NNS
live -> VBP
in -> IN
ethiopia -> NNP
the -> DT
people -> NNS
be -> VB
very -> RB
thin -> JJ
and -> CC
good -> JJ
at -> IN
distance -> NN
run -> NN


In [108]:
import ast

def pos_count(text):
  pos_type = []
  # Combining lists of lists into single list 
  pos_list = ast.literal_eval(text)
  for word,tag in pos_list :
    if tag == part_of_speech:
      pos_type.append(word)
  return len(pos_type)

In [78]:
def check_col(col_name):
  length = len(scoring_features[scoring_features[col_name] != 0])
  return length 

In [98]:
def drop_col(df,col_name):
  df.drop([col_name],axis=1, inplace=True)
  print(df.columns)

In [79]:
part_of_speech = 'NNS' # Plural nouns
scoring_features['NNS_count'] = scoring_features['pos_tags'].apply(pos_count)

In [80]:
check_col('NNS_count')

2259

In [81]:
part_of_speech = 'NNPS' # Proper Plural nouns
scoring_features['NNPS_count'] = scoring_features['pos_tags'].apply(pos_count)

In [82]:
check_col('NNPS_count')

880

In [83]:
part_of_speech = 'DT' # Determiners ( The with adjectives to refer a whole group of people)
scoring_features['DT_count'] = scoring_features['pos_tags'].apply(pos_count)

In [84]:
check_col('DT_count')

12863

In [91]:
part_of_speech = 'JJ' # Adjective
scoring_features['JJ_count'] = scoring_features['pos_tags'].apply(pos_count)

In [92]:
check_col('JJ_count')

12065

In [93]:
part_of_speech = 'sb' # Subject ( Subject refering to the group)
scoring_features['sb_count'] = scoring_features['pos_tags'].apply(pos_count)

In [94]:
check_col('sb_count')

0

In [99]:
drop_col(scoring_features,'sb_count')

Index(['Ethnicity', 'gender', 'profession', 'religion', 'Anti-stereotype',
       'stereotype', 'unrelated', 'sentence', 'clean_text',
       'lemmatized_withStopwords', 'num_chars', 'num_words', 'avg_word_length',
       'flesch_score', 'subjectivity_score', 'neg', 'neu', 'pos',
       'assertive_verbs_count', 'factive_verbs_count', 'hedges_count',
       'implicative_verbs_count', 'report_verbs_count',
       'bias_word_list_01_2018_count', 'subjectivityClues_lexicon_count',
       'pos_tags', 'lemma_pos', 'Ner_tags', 'NNS_count', 'NNPS_count',
       'DT_count', 'JJ_count'],
      dtype='object')


In [100]:
part_of_speech = 'JJS' # Superlative adjective
scoring_features['JJS_count'] = scoring_features['pos_tags'].apply(pos_count)

In [102]:
check_col('JJS_count')

314

In [103]:
part_of_speech = 'JJ' # adjective
scoring_features['JJ_count'] = scoring_features['pos_tags'].apply(pos_count)

In [104]:
check_col('JJ_count')

12065

In [105]:
part_of_speech = 'NN' # Noun
scoring_features['NN_count'] = scoring_features['pos_tags'].apply(pos_count)

In [106]:
check_col('NN_count')

15304

In [107]:
scoring_features.columns

Index(['Ethnicity', 'gender', 'profession', 'religion', 'Anti-stereotype',
       'stereotype', 'unrelated', 'sentence', 'clean_text',
       'lemmatized_withStopwords', 'num_chars', 'num_words', 'avg_word_length',
       'flesch_score', 'subjectivity_score', 'neg', 'neu', 'pos',
       'assertive_verbs_count', 'factive_verbs_count', 'hedges_count',
       'implicative_verbs_count', 'report_verbs_count',
       'bias_word_list_01_2018_count', 'subjectivityClues_lexicon_count',
       'pos_tags', 'lemma_pos', 'Ner_tags', 'NNS_count', 'NNPS_count',
       'DT_count', 'JJ_count', 'JJS_count', 'NN_count'],
      dtype='object')

Named entity recognition features


In [113]:
part_of_speech = 'NORP' # Nationalities or religious or political groups
scoring_features['NORP_count'] = scoring_features['Ner_tags'].apply(pos_count)

In [114]:
check_col('NORP_count')

3598

In [115]:
part_of_speech = 'PERSON' # People, including fictional => cue for gender, 
scoring_features['PERSON_count'] = scoring_features['Ner_tags'].apply(pos_count)

In [116]:
check_col('PERSON_count')

1490

Characteristic terms used in stereoset and crows-s-pair dataset per bias type

In [117]:
pip install scattertext

Collecting scattertext
  Downloading scattertext-0.1.4-py3-none-any.whl (7.3 MB)
[K     |████████████████████████████████| 7.3 MB 5.0 MB/s 
Collecting mock
  Downloading mock-4.0.3-py3-none-any.whl (28 kB)
Collecting gensim>=4.0.0
  Downloading gensim-4.0.1-cp37-cp37m-manylinux1_x86_64.whl (23.9 MB)
[K     |████████████████████████████████| 23.9 MB 94 kB/s 
Collecting flashtext
  Downloading flashtext-2.7.tar.gz (14 kB)
Building wheels for collected packages: flashtext
  Building wheel for flashtext (setup.py) ... [?25l[?25hdone
  Created wheel for flashtext: filename=flashtext-2.7-py2.py3-none-any.whl size=9310 sha256=ed38f1a3c8441fb9dc264c983957ded09b7331682265e80e6d459df0b62d6f0f
  Stored in directory: /root/.cache/pip/wheels/cb/19/58/4e8fdd0009a7f89dbce3c18fff2e0d0fa201d5cdfd16f113b7
Successfully built flashtext
Installing collected packages: mock, gensim, flashtext, scattertext
  Attempting uninstall: gensim
    Found existing installation: gensim 3.6.0
    Uninstalling gensim

In [118]:
import scattertext as st
import spacy
from pprint import pprint
import pandas as pd

In [119]:
stereo = pd.read_csv('/content/drive/MyDrive/Trained models/mult_label_dataset/multi_label_imbalance_handled1.csv',index_col = 0)

In [121]:
stereo.bias_type.value_counts()

Ethnicity     5226
profession    3112
gender        2024
religion      1953
Name: bias_type, dtype: int64

In [124]:
corpus = st.CorpusFromPandas(stereo, category_col='bias_type', text_col='sentence', nlp=nlp).build()

In [125]:
x = pd.DataFrame(corpus.get_scaled_f_scores_vs_background())

In [126]:
x

Unnamed: 0,corpus,background,Scaled f-score
eriteria,96.0,0.0,0.001173
norweigan,96.0,46910.0,0.000911
eritrean,101.0,229521.0,0.000514
allcaps,41.0,0.0,0.000501
crimean,100.0,279156.0,0.000452
...,...,...,...
genlyte,0.0,16620.0,0.000000
genlock,0.0,32902.0,0.000000
genl,0.0,121289.0,0.000000
genksyms,0.0,32575.0,0.000000


Extracting top 100 keyterms for each bias type

In [127]:
term_freq_df = corpus.get_term_freq_df()
term_freq_df['Ethnicity_score'] = corpus.get_scaled_f_scores('Ethnicity')
pprint(list(term_freq_df.sort_values(by='Ethnicity_score', ascending=False).index[:20]))

['ethiopia',
 'italy',
 'somalia',
 'sierra',
 'lebanon',
 'japanese',
 'persian',
 'bangladesh',
 'ghanaian',
 'morocco',
 'ecuador',
 'spain',
 'cameroon',
 'leon',
 'sierra leon',
 'eritrean',
 'persian people',
 'crimean',
 'bengali',
 'norweigan']


In [128]:
charteristic_terms_ethnicity = list(term_freq_df.sort_values(by='Ethnicity_score', ascending=False).index[:100])

In [130]:
term_freq_df['profession_score'] = corpus.get_scaled_f_scores('profession')
charteristic_terms_profession = list(term_freq_df.sort_values(by='profession_score', ascending=False).index[:100])

In [131]:
term_freq_df['gender_score'] = corpus.get_scaled_f_scores('gender')
charteristic_terms_gender = list(term_freq_df.sort_values(by='gender_score', ascending=False).index[:100])

In [132]:
term_freq_df['religion_score'] = corpus.get_scaled_f_scores('religion')
charteristic_terms_religion = list(term_freq_df.sort_values(by='religion_score', ascending=False).index[:100])

In [183]:
def count_lexicon(text):
  count = 0
  try:
    for token in lexicon:
      if token in text and len(token) > 1:
        count +=1
      else:
        continue
  except :
    pass
  return count

Charteristic_terms Ethnicity 

In [184]:
lexicon = charteristic_terms_ethnicity

In [185]:
scoring_features['charteristic_terms_ethnicity_count'] = scoring_features['lemmatized_withStopwords'].apply(count_lexicon)

In [186]:
len(scoring_features[scoring_features['charteristic_terms_ethnicity_count'] != 0])

7291

Charteristic_terms_profession 

In [187]:
lexicon = charteristic_terms_profession

In [188]:
lexicon

['performing artist',
 'politician',
 'researcher',
 'assistant',
 'the software',
 'performing',
 'football player',
 'developer',
 'software developer',
 'bartender',
 'software',
 'the delivery',
 'delivery',
 'delivery man',
 'the performing',
 'producer',
 'the prisoner',
 'musician',
 'the researcher',
 'servant',
 'civil servant',
 'artist',
 'player',
 'the bartender',
 'the prosecutor',
 'the chess',
 'the politician',
 'entrepreneur',
 'prisoner',
 'commander',
 'the musician',
 'chess player',
 'prosecutor',
 'tailor',
 'plumber',
 'physicist',
 'historian',
 'the engineer',
 'chess',
 'chemist',
 'psychologist',
 'the football',
 'the guitarist',
 'player was',
 'the commander',
 'policeman',
 'engineer',
 'the historian',
 'guitarist',
 'civil',
 'the producer',
 'mover',
 'butcher',
 'football',
 'mathematician',
 'manager',
 'the physicist',
 'nerdy',
 'nurse',
 'the assistant',
 'the psychologist',
 'the butcher',
 'smart',
 'the plumber',
 'the civil',
 'job',
 'prison

In [189]:
scoring_features['charteristic_terms_profession_count'] = scoring_features['lemmatized_withStopwords'].apply(count_lexicon)

In [190]:
len(scoring_features[scoring_features['charteristic_terms_profession_count'] != 0])

5121

Charteristic_terms_gender

In [191]:
lexicon = charteristic_terms_gender

In [192]:
lexicon

['schoolboy',
 'gentlemen',
 'schoolgirl',
 'the schoolboy',
 'mommy',
 'the gentlemen',
 'the schoolgirl',
 'herself',
 'grandfather',
 'my grandfather',
 'mother',
 'grandfather is',
 'sister',
 'himself',
 'my mother',
 'male',
 'sister is',
 'the mother',
 'the male',
 'mother was',
 'a male',
 'my sister',
 'mommy was',
 'mother is',
 'wearing',
 'the grandfather',
 'caring',
 'young',
 'grandfather was',
 'gentlemen were',
 'a schoolboy',
 'she',
 'she was',
 'she is',
 'likes',
 'of her',
 'her',
 'older',
 'children',
 'a woman',
 'was always',
 'the little',
 'for her',
 'hired',
 'women',
 'that she',
 'office',
 'men',
 'my mommy',
 'a man',
 'nurturing',
 'late',
 'to school',
 'with her',
 'school',
 'hired a',
 'date',
 'little',
 'as she',
 'strong',
 'room',
 'company',
 'hair',
 'likes to',
 'women are',
 'gentlemen are',
 'schoolboy was',
 'his mommy',
 'woman',
 'her children',
 'each',
 'went',
 'homework',
 'of his',
 'home',
 'he is',
 'decided',
 'ceo',
 'the bus

In [193]:
scoring_features['charteristic_terms_gender_count'] = scoring_features['lemmatized_withStopwords'].apply(count_lexicon)

In [194]:
len(scoring_features[scoring_features['charteristic_terms_gender_count'] != 0])

4522

Charteristic_terms_religion

In [195]:
lexicon = charteristic_terms_religion

In [196]:
scoring_features['lemmatized_withStopwords'][0]

'people live ethiopia people thin good distance run'

In [197]:
lexicon

['difference between',
 's the',
 'what s',
 'between a',
 'you call',
 'it s',
 'don t',
 'don',
 'the jews',
 'jesus',
 'n',
 'oven',
 'jew and',
 'repeat',
 'why do',
 'n t',
 'the oven',
 's',
 'jews',
 'the difference',
 'jew',
 'brahmin',
 'hitler',
 'how do',
 'allcaps',
 'difference',
 'what do',
 'the jew',
 'a jew',
 'holocaust',
 'priest',
 're',
 't',
 'the holocaust',
 'between',
 'call a',
 'a jewish',
 'do you',
 'what is',
 'the bible',
 'gas',
 'ash',
 'million',
 'islam',
 'm',
 'why',
 'they re',
 'bible',
 'what did',
 'i m',
 'a muslim',
 'muslims',
 'the muslim',
 'jewish',
 'auschwitz',
 'how many',
 'why are',
 'did the',
 'number',
 'bomb',
 'chamber',
 'what',
 'fucking',
 'do jews',
 's a',
 'a pizza',
 'the jewish',
 'in common',
 'muslim',
 'have in',
 'died',
 'santa',
 'nazi',
 'why did',
 'and a',
 'god',
 'million jews',
 'what does',
 'many jews',
 'concentration',
 'doesn',
 'say',
 'jews and',
 'camp',
 'common',
 'your',
 'fuck',
 'the gas',
 'call'

In [198]:
for token in lexicon:
  print(token, token in scoring_features['lemmatized_withStopwords'][0])

difference between False
s the False
what s False
between a False
you call False
it s False
don t False
don False
the jews False
jesus False
n True
oven False
jew and False
repeat False
why do False
n t False
the oven False
s True
jews False
the difference False
jew False
brahmin False
hitler False
how do False
allcaps False
difference False
what do False
the jew False
a jew False
holocaust False
priest False
re False
t True
the holocaust False
between False
call a False
a jewish False
do you False
what is False
the bible False
gas False
ash False
million False
islam False
m False
why False
they re False
bible False
what did False
i m False
a muslim False
muslims False
the muslim False
jewish False
auschwitz False
how many False
why are False
did the False
number False
bomb False
chamber False
what False
fucking False
do jews False
s a False
a pizza False
the jewish False
in common False
muslim False
have in False
died False
santa False
nazi False
why did False
and a False
god False
mi

In [199]:
lexicon

['difference between',
 's the',
 'what s',
 'between a',
 'you call',
 'it s',
 'don t',
 'don',
 'the jews',
 'jesus',
 'n',
 'oven',
 'jew and',
 'repeat',
 'why do',
 'n t',
 'the oven',
 's',
 'jews',
 'the difference',
 'jew',
 'brahmin',
 'hitler',
 'how do',
 'allcaps',
 'difference',
 'what do',
 'the jew',
 'a jew',
 'holocaust',
 'priest',
 're',
 't',
 'the holocaust',
 'between',
 'call a',
 'a jewish',
 'do you',
 'what is',
 'the bible',
 'gas',
 'ash',
 'million',
 'islam',
 'm',
 'why',
 'they re',
 'bible',
 'what did',
 'i m',
 'a muslim',
 'muslims',
 'the muslim',
 'jewish',
 'auschwitz',
 'how many',
 'why are',
 'did the',
 'number',
 'bomb',
 'chamber',
 'what',
 'fucking',
 'do jews',
 's a',
 'a pizza',
 'the jewish',
 'in common',
 'muslim',
 'have in',
 'died',
 'santa',
 'nazi',
 'why did',
 'and a',
 'god',
 'million jews',
 'what does',
 'many jews',
 'concentration',
 'doesn',
 'say',
 'jews and',
 'camp',
 'common',
 'your',
 'fuck',
 'the gas',
 'call'

In [200]:
'p' in scoring_features['lemmatized_withStopwords'][0]

True

In [201]:
scoring_features['charteristic_terms_religion_count'] = scoring_features['lemmatized_withStopwords'].apply(count_lexicon)

In [202]:
len(scoring_features[scoring_features['charteristic_terms_religion_count'] != 0])

6700

In [203]:
scoring_features.head()

Unnamed: 0,Ethnicity,gender,profession,religion,Anti-stereotype,stereotype,unrelated,sentence,clean_text,lemmatized_withStopwords,num_chars,num_words,avg_word_length,flesch_score,subjectivity_score,neg,neu,pos,assertive_verbs_count,factive_verbs_count,hedges_count,implicative_verbs_count,report_verbs_count,bias_word_list_01_2018_count,subjectivityClues_lexicon_count,pos_tags,lemma_pos,Ner_tags,NNS_count,NNPS_count,DT_count,JJ_count,JJS_count,NN_count,NORP_count,PERSON_count,charteristic_terms_ethnicity_count,charteristic_terms_profession_count,charteristic_terms_gender_count,charteristic_terms_religion_count
0,1,0,0,0,0,1,0,Many people live in Ethiopia. The people are v...,many people live in ethiopia the people be ver...,people live ethiopia people thin good distance...,84,15,4.666667,89.24,0.0,0.0,0.816,0.184,0,0,1,0,0,4,5,"[('many', 'JJ'), ('people', 'NNS'), ('live', '...","[('people', 'NNS'), ('live', 'VBP'), ('ethiopi...","[('ethiopia', 'GPE')]",2,0,1,3,0,2,0,0,4,0,0,0
1,1,0,0,0,0,1,0,My professor is a hispanic man He came here il...,professor be a hispanic man come here illegally,professor hispanic man come illegally,53,10,4.4,52.87,0.0,0.0,1.0,0.0,0,0,0,0,0,2,12,"[('professor', 'NNP'), ('be', 'VB'), ('a', 'DT...","[('professor', 'NNP'), ('hispanic', 'NNP'), ('...","[('hispanic', 'NORP')]",0,0,1,1,0,1,1,0,1,0,0,0
2,0,1,0,0,0,1,0,The schoolgirl is walking down the street. She...,the schoolgirl be walk down the street be a ta...,schoolgirl walk street target child predator,79,14,4.714286,89.75,0.0,0.0,1.0,0.0,0,0,1,1,0,2,3,"[('the', 'DT'), ('schoolgirl', 'NN'), ('be', '...","[('schoolgirl', 'NNP'), ('walk', 'NNP'), ('str...",[],0,0,3,0,0,6,0,0,0,0,3,1
3,1,0,0,0,0,1,0,We'll likely fly over Cape Verde on our flight...,will likely fly over cape verde on flight tomo...,likely fly cape verde flight tomorrow bet peop...,106,20,4.35,86.71,0.0,0.105,0.759,0.135,0,0,2,0,0,4,9,"[('will', 'MD'), ('likely', 'RB'), ('fly', 'VB...","[('likely', 'RB'), ('fly', 'VB'), ('cape', 'NN...","[('tomorrow', 'DATE')]",1,0,1,1,0,4,0,0,6,1,0,1
4,1,0,0,0,0,1,0,I know many people who are Russian. They love ...,know many people who be russian love vodka,know people russian love vodka,58,11,4.363636,91.27,0.0,0.0,0.69,0.31,0,1,0,0,0,2,3,"[('know', 'VBP'), ('many', 'JJ'), ('people', '...","[('know', 'VBP'), ('people', 'NNS'), ('russian...","[('russian', 'NORP')]",1,0,0,2,0,2,1,0,2,0,0,0


In [204]:
scoring_features.columns

Index(['Ethnicity', 'gender', 'profession', 'religion', 'Anti-stereotype',
       'stereotype', 'unrelated', 'sentence', 'clean_text',
       'lemmatized_withStopwords', 'num_chars', 'num_words', 'avg_word_length',
       'flesch_score', 'subjectivity_score', 'neg', 'neu', 'pos',
       'assertive_verbs_count', 'factive_verbs_count', 'hedges_count',
       'implicative_verbs_count', 'report_verbs_count',
       'bias_word_list_01_2018_count', 'subjectivityClues_lexicon_count',
       'pos_tags', 'lemma_pos', 'Ner_tags', 'NNS_count', 'NNPS_count',
       'DT_count', 'JJ_count', 'JJS_count', 'NN_count', 'NORP_count',
       'PERSON_count', 'charteristic_terms_ethnicity_count',
       'charteristic_terms_profession_count',
       'charteristic_terms_gender_count', 'charteristic_terms_religion_count'],
      dtype='object')

In [212]:
features_df = scoring_features.copy()

In [213]:
features_df.drop(['sentence', 'clean_text',
       'lemmatized_withStopwords','pos_tags', 'lemma_pos', 'Ner_tags'],axis = 1, inplace = True)

In [214]:
features_df

Unnamed: 0,Ethnicity,gender,profession,religion,Anti-stereotype,stereotype,unrelated,num_chars,num_words,avg_word_length,flesch_score,subjectivity_score,neg,neu,pos,assertive_verbs_count,factive_verbs_count,hedges_count,implicative_verbs_count,report_verbs_count,bias_word_list_01_2018_count,subjectivityClues_lexicon_count,NNS_count,NNPS_count,DT_count,JJ_count,JJS_count,NN_count,NORP_count,PERSON_count,charteristic_terms_ethnicity_count,charteristic_terms_profession_count,charteristic_terms_gender_count,charteristic_terms_religion_count
0,1,0,0,0,0,1,0,84,15,4.666667,89.24,0.0,0.000,0.816,0.184,0,0,1,0,0,4,5,2,0,1,3,0,2,0,0,4,0,0,0
1,1,0,0,0,0,1,0,53,10,4.400000,52.87,0.0,0.000,1.000,0.000,0,0,0,0,0,2,12,0,0,1,1,0,1,1,0,1,0,0,0
2,0,1,0,0,0,1,0,79,14,4.714286,89.75,0.0,0.000,1.000,0.000,0,0,1,1,0,2,3,0,0,3,0,0,6,0,0,0,0,3,1
3,1,0,0,0,0,1,0,106,20,4.350000,86.71,0.0,0.105,0.759,0.135,0,0,2,0,0,4,9,1,0,1,1,0,4,0,0,6,1,0,1
4,1,0,0,0,0,1,0,58,11,4.363636,91.27,0.0,0.000,0.690,0.310,0,1,0,0,0,2,3,1,0,0,2,0,2,1,0,2,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16539,0,0,0,0,0,0,1,35,6,5.000000,48.47,0.0,0.000,1.000,0.000,0,0,0,0,0,2,1,1,0,0,2,0,0,1,0,2,0,0,0
16540,0,0,0,0,0,0,1,33,8,3.250000,114.12,0.0,0.000,1.000,0.000,0,0,0,0,0,2,0,0,0,1,1,0,3,0,0,0,2,0,0
16541,0,0,0,0,0,0,1,73,11,5.727273,68.77,0.0,0.186,0.698,0.116,0,1,1,2,0,7,8,0,0,3,0,0,2,0,0,1,0,1,1
16542,0,0,0,0,0,0,1,65,11,5.000000,60.31,0.0,0.000,0.826,0.174,0,0,2,0,1,4,9,0,0,3,1,0,4,1,0,1,0,1,1


In [236]:
features_df.columns

Index(['Ethnicity', 'gender', 'profession', 'religion', 'Anti-stereotype',
       'stereotype', 'unrelated', 'num_chars', 'num_words', 'avg_word_length',
       'flesch_score', 'subjectivity_score', 'neg', 'neu', 'pos',
       'assertive_verbs_count', 'factive_verbs_count', 'hedges_count',
       'implicative_verbs_count', 'report_verbs_count',
       'bias_word_list_01_2018_count', 'subjectivityClues_lexicon_count',
       'NNS_count', 'NNPS_count', 'DT_count', 'JJ_count', 'JJS_count',
       'NN_count', 'NORP_count', 'PERSON_count',
       'charteristic_terms_ethnicity_count',
       'charteristic_terms_profession_count',
       'charteristic_terms_gender_count', 'charteristic_terms_religion_count'],
      dtype='object')

In [252]:
features_df.to_csv('/content/drive/MyDrive/Trained models/mult_label_dataset/final_features.csv')

## Training

In [223]:
MAX_LEN = 50
RANDOM_SEED = 42

In [None]:
feature_df = pd.read_csv('/content/drive/MyDrive/Trained models/mult_label_dataset/final_features.csv', index_col = 0)

In [228]:
y = features_df.iloc[:,:7].values
X = features_df.iloc[:,7:].values

In [232]:
y

array([[1, 0, 0, ..., 0, 1, 0],
       [1, 0, 0, ..., 0, 1, 0],
       [0, 1, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 1]])

In [233]:
LABEL_COLUMN = ['Ethnicity',	'gender'	,'profession'	,'religion',	'Anti-stereotype',	'stereotype',	'unrelated']

In [237]:
FEATURE_COLUMNS = ['num_chars', 'num_words', 'avg_word_length',
       'flesch_score', 'subjectivity_score', 'neg', 'neu', 'pos',
       'assertive_verbs_count', 'factive_verbs_count', 'hedges_count',
       'implicative_verbs_count', 'report_verbs_count',
       'bias_word_list_01_2018_count', 'subjectivityClues_lexicon_count',
       'NNS_count', 'NNPS_count', 'DT_count', 'JJ_count', 'JJS_count',
       'NN_count', 'NORP_count', 'PERSON_count',
       'charteristic_terms_ethnicity_count',
       'charteristic_terms_profession_count',
       'charteristic_terms_gender_count', 'charteristic_terms_religion_count']

In [238]:
from sklearn.model_selection import train_test_split

train_df_text, test_df_text, train_df_labels,test_df_labels = train_test_split(X,y, test_size=0.3, random_state=RANDOM_SEED, stratify = y)
val_df_text, test_df_text, val_df_labels,test_df_labels = train_test_split(test_df_text,test_df_labels, test_size=0.5, random_state=RANDOM_SEED,stratify = test_df_labels)

In [241]:
train_df_labels = pd.DataFrame(train_df_labels, columns= LABEL_COLUMN)
val_df_labels = pd.DataFrame(val_df_labels, columns= LABEL_COLUMN)
test_df_labels = pd.DataFrame(test_df_labels, columns= LABEL_COLUMN)
train_df_features = pd.DataFrame(train_df_text, columns = FEATURE_COLUMNS)
val_df_features  = pd.DataFrame(val_df_text, columns = FEATURE_COLUMNS)
test_df_features  = pd.DataFrame(test_df_text, columns = FEATURE_COLUMNS)

In [240]:
train_df = pd.concat([train_df_text,train_df_labels], axis = 1)
val_df = pd.concat([val_df_text,val_df_labels], axis = 1)
test_df = pd.concat([test_df_text,test_df_labels], axis = 1)

Metrics

In [248]:
def Accuracy(y_true, y_pred):
  temp = 0
  for i in range(y_true.shape[0]):
      temp += sum(np.logical_and(y_true[i], y_pred[i])) / sum(np.logical_or(y_true[i], y_pred[i]))
  return temp / y_true.shape[0]

In [264]:
from sklearn.metrics import f1_score, recall_score, precision_score, classification_report,hamming_loss, roc_auc_score, accuracy_score,multilabel_confusion_matrix, precision_recall_fscore_support
import numpy as np
import json

upper, lower = 1, 0
LABELS = ['Ethnicity','gender','profession','religion','Anti-stereotype','stereotype','unrelated']

def classification_metrics(test_pred,labels,model_name,threshold, sigmoid = False):

  print("Evaluation metrics for test set:")
  if sigmoid:
    y_pred = np.where(test_pred > threshold, upper, lower)
  else:
    y_pred = test_pred

  ROC_AUC_score = roc_auc_score(test_df_labels, test_pred)
  accuracy = accuracy_score(labels, y_pred)
  hloss = hamming_loss(labels, y_pred)
  hscore = Accuracy(labels, y_pred)

  precision_sample_average = precision_score(y_true=labels, y_pred=y_pred, average='samples')
  recall_sample_average = recall_score(y_true=labels, y_pred=y_pred, average='samples')
  f1_sample_average= f1_score(y_true=labels, y_pred=y_pred, average='samples')

  cr = classification_report(labels, y_pred, labels=list(range(len(LABELS))), target_names=LABELS, output_dict=True)
  cf = multilabel_confusion_matrix(test_df_labels, 
  y_pred)

  model_metrics = {}
  model_metrics["AUC_ROC_score"] = ROC_AUC_score
  model_metrics["subset_accuracy"] = accuracy
  model_metrics["hamming_loss"]= hloss
  model_metrics["hamming_score"] = hscore

  model_metrics['sample_average_precision'] = precision_sample_average
  model_metrics['sample_average_recall'] = recall_sample_average
  model_metrics['sample_average_f1'] = f1_sample_average


  if write_to_file:
    model_metrics["Classification_report"] = cr

    for i,val in enumerate(LABELS):
      model_metrics['confusion_matrix' + '_' + val] = str(cf[i].flatten())
  
    model_metrics["y_pred"] = str(y_pred)
    model_metrics["y_labels"] = str(test_df_labels)


    if threshold != 0.5:
      th = "calculated_threshold"
    else:
      th = threshold

    model_metrics["threshold"] = th
    output_file = "eval_results_" + model_name + "_"+str(th) +"_"+ ".json"
    
    with open(output_file, "w" ) as writer:
        json.dump(model_metrics,writer)
  

  print("\n ROC-AUC score: %.6f \n" % (ROC_AUC_score))
  print("\n Subset accuracy : %.6f \n" % (accuracy))
  print("\n hamming_loss : %.6f \n" % (hloss))
  print("\n hamming score : %.6f \n" % hscore)
  print("\n sample average  precision_sample_average : %.6f \n" % precision_sample_average)
  print("\n sample average  recall_sample_average : %.6f \n" % recall_sample_average)
  print("\n sample average  f1_sample_average : %.6f \n" % f1_sample_average)
  

  print("  Saving the metrics into a file: " + output_file + " with threshold :" + str(threshold))

Without feature scaling

In [244]:
train_df_features.shape, train_df_labels.shape

((11580, 27), (11580, 7))

In [257]:
from sklearn.svm import SVC
from sklearn.multioutput import MultiOutputClassifier

classifier = SVC(kernel = 'linear', random_state = 42)
multilabel_classifier = MultiOutputClassifier(classifier, n_jobs=-1)
multilabel_classifier = multilabel_classifier.fit(train_df_features, train_df_labels)

Prediction on validation set

In [259]:
y_test_pred = multilabel_classifier.predict(test_df_features)

In [260]:
y_test_pred

array([[0, 0, 0, ..., 0, 0, 1],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [1, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 1, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [268]:
labels = test_df_labels.values

In [270]:
write_to_file = True
classification_metrics(y_test_pred,labels,"SVM_Only_features",0.5)

Evaluation metrics for test set:

 ROC-AUC score: 0.708241 


 Subset accuracy : 0.286865 


 hamming_loss : 0.171693 


 hamming score : 0.458367 


 sample average  precision_sample_average : 0.602337 


 sample average  recall_sample_average : 0.483884 


 sample average  f1_sample_average : 0.518695 

  Saving the metrics into a file: eval_results_SVM_Only_features_0.5_.json with threshold :0.5


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


SVM + tfi_idf + feature scaling

Unnamed: 0,tfIdf_aardvark,tfIdf_ab,tfIdf_ababa,tfIdf_aback,tfIdf_abandon,tfIdf_abattoir,tfIdf_abaya,tfIdf_abdul,tfIdf_abel,tfIdf_aberration,tfIdf_abide,tfIdf_ability,tfIdf_able,tfIdf_aboion,tfIdf_abolitionist,tfIdf_abominable,tfIdf_abomination,tfIdf_aboout,tfIdf_abouit,tfIdf_about,tfIdf_above,tfIdf_abraham,tfIdf_abrasive,tfIdf_abroad,tfIdf_abruptly,tfIdf_absence,tfIdf_absent,tfIdf_absentminde,tfIdf_absolute,tfIdf_absolutely,tfIdf_absorbent,tfIdf_abstain,tfIdf_absurd,tfIdf_abt,tfIdf_abundance,tfIdf_abuse,tfIdf_abusing,tfIdf_abusive,tfIdf_abyss,tfIdf_ac,...,tfIdf_yield,tfIdf_yoga,tfIdf_yogurt,tfIdf_yolanda,tfIdf_york,tfIdf_yorker,tfIdf_young,tfIdf_youth,tfIdf_youtube,tfIdf_yowl,tfIdf_yrs,tfIdf_yu,tfIdf_yucatan,tfIdf_yule,tfIdf_yum,tfIdf_yummy,tfIdf_zach,tfIdf_zack,tfIdf_zag,tfIdf_zaknelson,tfIdf_ze,tfIdf_zebra,tfIdf_zeke,tfIdf_zenlike,tfIdf_zero,tfIdf_zig,tfIdf_zionism,tfIdf_zionist,tfIdf_zip,tfIdf_zit,tfIdf_zoey,tfIdf_zog,tfIdf_zombie,tfIdf_zone,tfIdf_zoo,tfIdf_zookeeper,tfIdf_zoos,tfIdf_zuchini,tfIdf_zumba,tfIdf_zyklon
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16540,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16541,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16542,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
