<a href="https://colab.research.google.com/github/mvdheram/Stereotypical-Social-bias-detection-/blob/Machine-learning-classifiers/Feature_based_machine_learning_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Naive- Bayees Model

# SVM with selected features 

* Reference : 
    1. Linguistic models for detecting bias https://aclanthology.org/P13-1162.pdf
    2. Automatically Neutralizing Subjective Bias in Text https://ojs.aaai.org/index.php/AAAI/article/view/5385 

Features :

* Bias lexicons with count
* Sentiment 
* Generic words NNS and NNPS
* Toxicity 
* Generic features 
* A, AE names

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd

df = pd.read_csv('/content/drive/MyDrive/Trained models/mult_label_dataset/ohe_multilabel.csv', index_col = 0)

In [None]:
df

Unnamed: 0,Ethnicity,gender,profession,religion,Anti-stereotype,stereotype,unrelated,sentence
0,1,0,0,0,0,1,0,Many people live in Ethiopia. The people are v...
1,1,0,0,0,0,1,0,My professor is a hispanic man He came here il...
2,0,1,0,0,0,1,0,The schoolgirl is walking down the street. She...
3,1,0,0,0,0,1,0,We'll likely fly over Cape Verde on our flight...
4,1,0,0,0,0,1,0,I know many people who are Russian. They love ...
...,...,...,...,...,...,...,...,...
16539,0,0,0,0,0,0,1,Arab people are associated with hi.
16540,0,0,0,0,0,0,1,Car was a new chemist in our lab.
16541,0,0,0,0,0,0,1,"The gentlemen stood around the fire, carefully..."
16542,0,0,0,0,0,0,1,The common ukrainian is a supporter of a floun...


## Pre-processing

In [None]:
# Tokenization using spacy
import spacy

nlp = spacy.load("en_core_web_sm")

def tokenize(text):  

  doc = nlp(text)
  tokens = [token.text.lower() for token in doc]
  return tokens

def lemmatization(text):

  doc = nlp(text)
  lemmas = [token.lemma_.lower() for token in doc]
  return lemmas

# Remove tokens that are not alphabetic - depends on particular application 
def clean_text(text):

  lemmas = lemmatization(text)
  a_lemmas = [lemma for  lemma in lemmas
              if lemma.isalpha()]
  
  return (' '.join(a_lemmas))


# Remove stopwords - Update according to stereotypical bias 
def remove_stopwords(text):

  stopwords = spacy.lang.en.stop_words.STOP_WORDS
  
  lemmas = lemmatization(text)
  a_lemmas = [lemma for  lemma in lemmas
              if lemma.isalpha() and lemma not in stopwords]
  
  return (' '.join(a_lemmas))


# Parts of speech tagger 
def pos_tags(text):
  
  doc = nlp(text)
  pos = [(token.text, token.tag_) for token in doc]
  return (pos)


# Named entity recognition 
def ner_tags(text):
  
  doc = nlp(text)
  ner = [(ent.text, ent.label_) for ent in doc.ents]
  return (ner)

In [None]:
stereo = df.copy()
stereo['clean_text'] = stereo['sentence'].apply(clean_text)

In [None]:
stereo['pos_tags'] = stereo['clean_text'].apply(pos_tags) 

In [None]:
stereo['lemmatized_withStopwords']= stereo['clean_text'].apply(remove_stopwords)

In [None]:
stereo['lemma_pos'] = stereo['lemmatized_withStopwords'].apply(pos_tags)

In [None]:
stereo['Ner_tags'] = stereo['clean_text'].apply(ner_tags)

In [None]:
stereo

Unnamed: 0,Ethnicity,gender,profession,religion,Anti-stereotype,stereotype,unrelated,sentence,clean_text,lemmatized_withStopwords,num_chars,num_words,avg_word_length,flesch_score,subjectivity_score,neg,neu,pos,assertive_verbs_count,factive_verbs_count,hedges_count,implicative_verbs_count,report_verbs_count,bias_word_list_01_2018_count,subjectivityClues_lexicon_count,pos_tags,lemma_pos,Ner_tags
0,1,0,0,0,0,1,0,Many people live in Ethiopia. The people are v...,many people live in ethiopia the people be ver...,people live ethiopia people thin good distance...,84,15,4.666667,89.24,0.0,0.000,0.816,0.184,0,0,1,0,0,4,5,"[(many, JJ), (people, NNS), (live, VBP), (in, ...","[(people, NNS), (live, VBP), (ethiopia, NNP), ...","[(ethiopia, GPE)]"
1,1,0,0,0,0,1,0,My professor is a hispanic man He came here il...,professor be a hispanic man come here illegally,professor hispanic man come illegally,53,10,4.400000,52.87,0.0,0.000,1.000,0.000,0,0,0,0,0,2,12,"[(professor, NNP), (be, VB), (a, DT), (hispani...","[(professor, NNP), (hispanic, NNP), (man, NN),...","[(hispanic, NORP)]"
2,0,1,0,0,0,1,0,The schoolgirl is walking down the street. She...,the schoolgirl be walk down the street be a ta...,schoolgirl walk street target child predator,79,14,4.714286,89.75,0.0,0.000,1.000,0.000,0,0,1,1,0,2,3,"[(the, DT), (schoolgirl, NN), (be, VBP), (walk...","[(schoolgirl, NNP), (walk, NNP), (street, NNP)...",[]
3,1,0,0,0,0,1,0,We'll likely fly over Cape Verde on our flight...,will likely fly over cape verde on flight tomo...,likely fly cape verde flight tomorrow bet peop...,106,20,4.350000,86.71,0.0,0.105,0.759,0.135,0,0,2,0,0,4,9,"[(will, MD), (likely, RB), (fly, VB), (over, I...","[(likely, RB), (fly, VB), (cape, NN), (verde, ...","[(tomorrow, DATE)]"
4,1,0,0,0,0,1,0,I know many people who are Russian. They love ...,know many people who be russian love vodka,know people russian love vodka,58,11,4.363636,91.27,0.0,0.000,0.690,0.310,0,1,0,0,0,2,3,"[(know, VBP), (many, JJ), (people, NNS), (who,...","[(know, VBP), (people, NNS), (russian, JJ), (l...","[(russian, NORP)]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16539,0,0,0,0,0,0,1,Arab people are associated with hi.,arab people be associate with hi,arab people associate hi,35,6,5.000000,48.47,0.0,0.000,1.000,0.000,0,0,0,0,0,2,1,"[(arab, JJ), (people, NNS), (be, VB), (associa...","[(arab, JJ), (people, NNS), (associate, VBP), ...","[(arab, NORP)]"
16540,0,0,0,0,0,0,1,Car was a new chemist in our lab.,car be a new chemist in lab,car new chemist lab,33,8,3.250000,114.12,0.0,0.000,1.000,0.000,0,0,0,0,0,2,0,"[(car, NN), (be, VB), (a, DT), (new, JJ), (che...","[(car, NN), (new, JJ), (chemist, NN), (lab, NN)]",[]
16541,0,0,0,0,0,0,1,"The gentlemen stood around the fire, carefully...",the gentleman stand around the fire carefully ...,gentleman stand fire carefully pass boxing,73,11,5.727273,68.77,0.0,0.186,0.698,0.116,0,1,1,2,0,7,8,"[(the, DT), (gentleman, NNP), (stand, VBP), (a...","[(gentleman, NNP), (stand, VB), (fire, NN), (c...",[]
16542,0,0,0,0,0,0,1,The common ukrainian is a supporter of a floun...,the common ukrainian be a supporter of a floun...,common ukrainian supporter flounder run govern...,65,11,5.000000,60.31,0.0,0.000,0.826,0.174,0,0,2,0,1,4,9,"[(the, DT), (common, JJ), (ukrainian, NNP), (b...","[(common, JJ), (ukrainian, JJ), (supporter, NN...","[(ukrainian, NORP)]"


In [None]:
stereo.to_csv('stereo_features.csv')

## Feature engineering

Scoring features :


* Readability tests :
  https://pypi.org/project/textatistic/
  * Determine readability of english passage
  * Scale ranging from primary school up to college graduate level
  * A mathematical formula utilizing word, syllabel and sentence count
  * Used in fake news and opinion spam detection 

  Types :

  1. Flesch reading ease : 

    * The higher the score, the better the readability. 
    * score of 0-30 implies only college graduates can understand while 90-100 implies that a 5th grade student can understand.
    
    Two factors :

      1. Greater the average sentence length, harder the text to read
      2. Greater the average number of syllables, harder the text to read

* Avg_tf_idf
* Max_tf_idf
* Number of characters 
* Word count
* Average word length
* Vadar Sentiment analysis
* Text subjectivity (Text blob)
* Toxicity analysis (detoxify)



In [None]:
import pandas as pd

stereo = pd.read_csv('/content/drive/MyDrive/Trained models/mult_label_dataset/stereo_features_f.csv',index_col=0)

In [None]:
stereo

Unnamed: 0,Ethnicity,gender,profession,religion,Anti-stereotype,stereotype,unrelated,sentence,clean_text,pos,lemmatized_withStopwords,lemma_pos,Ner_tags
0,1,0,0,0,0,1,0,Many people live in Ethiopia. The people are v...,many people live in ethiopia the people be ver...,"[('many', 'JJ'), ('people', 'NNS'), ('live', '...",people live ethiopia people thin good distance...,"[('people', 'NNS'), ('live', 'VBP'), ('ethiopi...","[('ethiopia', 'GPE')]"
1,1,0,0,0,0,1,0,My professor is a hispanic man He came here il...,professor be a hispanic man come here illegally,"[('professor', 'NNP'), ('be', 'VB'), ('a', 'DT...",professor hispanic man come illegally,"[('professor', 'NNP'), ('hispanic', 'NNP'), ('...","[('hispanic', 'NORP')]"
2,0,1,0,0,0,1,0,The schoolgirl is walking down the street. She...,the schoolgirl be walk down the street be a ta...,"[('the', 'DT'), ('schoolgirl', 'NN'), ('be', '...",schoolgirl walk street target child predator,"[('schoolgirl', 'NNP'), ('walk', 'NNP'), ('str...",[]
3,1,0,0,0,0,1,0,We'll likely fly over Cape Verde on our flight...,will likely fly over cape verde on flight tomo...,"[('will', 'MD'), ('likely', 'RB'), ('fly', 'VB...",likely fly cape verde flight tomorrow bet peop...,"[('likely', 'RB'), ('fly', 'VB'), ('cape', 'NN...","[('tomorrow', 'DATE')]"
4,1,0,0,0,0,1,0,I know many people who are Russian. They love ...,know many people who be russian love vodka,"[('know', 'VBP'), ('many', 'JJ'), ('people', '...",know people russian love vodka,"[('know', 'VBP'), ('people', 'NNS'), ('russian...","[('russian', 'NORP')]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16539,0,0,0,0,0,0,1,Arab people are associated with hi.,arab people be associate with hi,"[('arab', 'JJ'), ('people', 'NNS'), ('be', 'VB...",arab people associate hi,"[('arab', 'JJ'), ('people', 'NNS'), ('associat...","[('arab', 'NORP')]"
16540,0,0,0,0,0,0,1,Car was a new chemist in our lab.,car be a new chemist in lab,"[('car', 'NN'), ('be', 'VB'), ('a', 'DT'), ('n...",car new chemist lab,"[('car', 'NN'), ('new', 'JJ'), ('chemist', 'NN...",[]
16541,0,0,0,0,0,0,1,"The gentlemen stood around the fire, carefully...",the gentleman stand around the fire carefully ...,"[('the', 'DT'), ('gentleman', 'NNP'), ('stand'...",gentleman stand fire carefully pass boxing,"[('gentleman', 'NNP'), ('stand', 'VB'), ('fire...",[]
16542,0,0,0,0,0,0,1,The common ukrainian is a supporter of a floun...,the common ukrainian be a supporter of a floun...,"[('the', 'DT'), ('common', 'JJ'), ('ukrainian'...",common ukrainian supporter flounder run govern...,"[('common', 'JJ'), ('ukrainian', 'JJ'), ('supp...","[('ukrainian', 'NORP')]"


In [None]:
scoring_features = stereo.copy()

In [None]:
scoring_features.drop(['pos','lemma_pos',	'Ner_tags'],axis=1, inplace= True)

In [None]:
 # Number of characters
 scoring_features['num_chars']  = scoring_features['sentence'].apply(len)

In [None]:
# Number of words
def word_count(string):
  # split the string into words
  words = string.split()

  # Return length of words list
  return len(words)

scoring_features['num_words'] = scoring_features['sentence'].apply(word_count)

In [None]:
# Average word length
def avg_word_length(x):

  # Split the string into words
  words = x.split()

  # Compute length of each word and store in a seperate list
  word_lengths = [len(word) for word in words]

  # Compute average word length 
  try:
    avg_word_length = sum(word_lengths)/len(words)
  except ZeroDivisionError:
    avg_word_length = 0

  return (avg_word_length)

scoring_features['avg_word_length'] = scoring_features['sentence'].apply(avg_word_length) 

In [None]:
scoring_features.columns

Index(['Ethnicity', 'gender', 'profession', 'religion', 'Anti-stereotype',
       'stereotype', 'unrelated', 'sentence', 'clean_text',
       'lemmatized_withStopwords', 'num_chars', 'num_words',
       'avg_word_length'],
      dtype='object')

In [None]:
pip install textstat



In [None]:
# Readability tests using textatistic library 
# Import the textatistic class
import textstat
import math

def readability_scores(text):
  # if text.endswith(".") == False:
  #   text = text+"."
  readability_score = textstat.flesch_reading_ease(text)

  # Generate scores
  return readability_score

In [None]:
try:
  scoring_features['flesch_score'] = scoring_features['sentence'].apply(readability_scores)
except ZeroDivisionError:
  scoring_features['flesch_score'] = 0

In [None]:
pip install -U textblob



In [None]:
from textblob import TextBlob

def get_subjectivity(text):
    try:
        textblob = TextBlob(unicode(text, 'utf-8'))
        subj = textblob.sentiment.subjectivity
    except:
        subj = 0.0
    return subj

In [None]:
scoring_features['subjectivity_score'] = scoring_features['sentence'].apply(get_subjectivity)

Vectorization :

* n_grams
* tf_idf 

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Building n-gram models - capture context
# Range = (2,2) - bi-grams, (1,3) - unigram, bigram, trigram
def n_grams(range, corpus):
  # Bag of words feature - docxterm matrix 
  vectorizer = CountVectorizer(ngram_range = range)
  corpus = corpus.values.astype('U')
  bow_matrix = vectorizer.fit_transform(corpus)
  cv_df = pd.DataFrame(bow_matrix.toarray(), columns = vectorizer.get_feature_names()).add_prefix('Counts_')
  # corpus = pd.concat([corpus,cv_df],axis = 1, sort = False)
  return cv_df

In [None]:
# tf-idf  - higher the weight more the importance 
# Used for train set
from sklearn.feature_extraction.text import TfidfVectorizer

def tf_idf(corpus):
  vectorizer = TfidfVectorizer()
  vectorizer = TfidfVectorizer(max_features = 10000)
  corpus = corpus.values.astype('U')
  tfidf_matrix = vectorizer.fit_transform(corpus)
  tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns = vectorizer.get_feature_names()).add_prefix('tfIdf_')
  # corpus = pd.concat([corpus,tfidf_df],axis = 1, sort = False)
  return tfidf_df

In [None]:
# Inspect the different words being values after BOW and tfidf transformation 
def examine_row(corpus,row_n):
  examine_row = corpus.iloc[row_n]
  print(examine_row.sort_values(ascending= False).head())
  total = corpus.sum()
  print("Total sum of the counts per word \n",total.head()) # Total sum of the counts per word
  # print("Sums sorted: ",total.sort_values(ascending= False).head())

In [None]:
c = tf_idf(scoring_features['clean_text'])

Vadar sentiment analysis

In [None]:
pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l[K     |██▋                             | 10 kB 19.8 MB/s eta 0:00:01[K     |█████▏                          | 20 kB 25.1 MB/s eta 0:00:01[K     |███████▉                        | 30 kB 23.3 MB/s eta 0:00:01[K     |██████████▍                     | 40 kB 20.8 MB/s eta 0:00:01[K     |█████████████                   | 51 kB 6.4 MB/s eta 0:00:01[K     |███████████████▋                | 61 kB 7.3 MB/s eta 0:00:01[K     |██████████████████▏             | 71 kB 7.8 MB/s eta 0:00:01[K     |████████████████████▉           | 81 kB 8.7 MB/s eta 0:00:01[K     |███████████████████████▍        | 92 kB 8.5 MB/s eta 0:00:01[K     |██████████████████████████      | 102 kB 6.7 MB/s eta 0:00:01[K     |████████████████████████████▋   | 112 kB 6.7 MB/s eta 0:00:01[K     |███████████████████████████████▏| 122 kB 6.7 MB/s eta 0:00:01[K     |████████████████████████████████| 125 kB 6.7 M

In [None]:
# Sentiment analysis 
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

analyser = SentimentIntensityAnalyzer()

def vader_sentiment(text):
  score = analyser.polarity_scores(text)
  return score

senti = scoring_features['sentence'].apply(vader_sentiment) 
scoring_features = pd.concat([scoring_features,(pd.DataFrame.from_dict(dict(senti).values()))],axis = 1, sort = False)
# scoring_features.head()

In [None]:
pip install detoxify

Collecting detoxify
  Downloading detoxify-0.2.2-py3-none-any.whl (11 kB)
Collecting transformers>=3.2.0
  Downloading transformers-4.9.2-py3-none-any.whl (2.6 MB)
[K     |████████████████████████████████| 2.6 MB 7.4 MB/s 
[?25hCollecting sentencepiece>=0.1.94
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 52.9 MB/s 
Collecting huggingface-hub==0.0.12
  Downloading huggingface_hub-0.0.12-py3-none-any.whl (37 kB)
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 37.4 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 71.6 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K

In [None]:
# Toxicity identification 
from detoxify import Detoxify

def toxicity(text):
  results = Detoxify('original').predict(text)
  return math.floor(results['toxicity']*100)

scoring_features['toxicity'] = scoring_features['sentence'].apply(toxicity) 

Count based features :

* Lexicons - Counts
  * Hedge in context  - two words around W is a hedge (Hyland, 2005  (e.g., apparently).
  * Factive verb  - w is in Hooper’s (1975) list of factives (e.g., realize).
  * Factive verb in context One/two word(s) around w is a factive (Hooper, 1975)
  * Assertive verb
  * Assertive verb in context 
  * Assertive verb 
  * Implicative verb in context
  * Report verb
  * Entailment
  * Entailment in context
  * Strong subjective
  * Weak subjective
  * Positive word
  * Positive word in context
  * Negative word
  * Negative word in context
  * Grammatical relation - {root,subj,...}
  * Bias lexicon
* Social category target words used in dataset( Characteristic words of each bias type ; e.g. Racial, gender, ..) and scoring_features_pos_Ner
* Characteristic stereotypical words 
* POS :
  * POS(word) : POS of word w 
  * POS(word) - 1 :  POS of one word before w
  * POS(word) + 1  : POS of one word after w


Lexicons

In [None]:
import json 

f = open('/content/Subjectivity_lexicon.json')

lexicons = json.load(f)

In [None]:
for keys, value in lexicons.items():
  print(keys,'->',len(value.split('\n')))

assertive_verbs.txt -> 66
bias_lexicon.txt -> 655
bias_word_list_01_2018.txt -> 9742
factive_verbs.txt -> 27
hedges_hyland2005.txt -> 100
implicative_verbs.txt -> 32
report_verbs.txt -> 181
subjectivityClues_lexicon.txt -> 8223


In [None]:
keys = lexicons.keys()

In [None]:
def count_lexicon(text):
  count = 0
  try:
    for token in lexicon:
      if token in text:
        count +=1
      else:
        continue
  except :
    pass
  return count

Assertive verbs

In [None]:
lexicon = set(tokenize(lexicons['assertive_verbs.txt']))

In [None]:
scoring_features['assertive_verbs_count'] = scoring_features['lemmatized_withStopwords'].apply(count_lexicon)

In [None]:
len(scoring_features[scoring_features['assertive_verbs_count'] != 0])

1520

Factive verbs

In [None]:
lexicon = set(tokenize(lexicons['factive_verbs.txt']))

In [None]:
scoring_features['factive_verbs_count'] = scoring_features['lemmatized_withStopwords'].apply(count_lexicon)

In [None]:
len(scoring_features[scoring_features['factive_verbs_count'] != 0])

1768

Hedges

In [None]:
lexicon = set(tokenize(lexicons['hedges_hyland2005.txt']))

In [None]:
scoring_features['hedges_count'] = scoring_features['lemmatized_withStopwords'].apply(count_lexicon)

In [None]:
len(scoring_features[scoring_features['hedges_count'] != 0])

9331

Implicative_verbs

In [None]:
lexicon = set(tokenize(lexicons['implicative_verbs.txt']))

In [None]:
scoring_features['implicative_verbs_count'] = scoring_features['lemmatized_withStopwords'].apply(count_lexicon)

In [None]:
len(scoring_features[scoring_features['implicative_verbs_count'] != 0])

1732

Report_verbs

In [None]:
lexicon = set(tokenize(lexicons['report_verbs.txt']))

In [None]:
scoring_features['report_verbs_count'] = scoring_features['lemmatized_withStopwords'].apply(count_lexicon)

In [None]:
len(scoring_features[scoring_features['report_verbs_count'] != 0])

3884

Bias_word_list_01_2018

In [None]:
lexicon = set(tokenize(lexicons['bias_word_list_01_2018.txt']))

In [None]:
scoring_features['bias_word_list_01_2018_count'] = scoring_features['lemmatized_withStopwords'].apply(count_lexicon)

In [None]:
len(scoring_features[scoring_features['bias_word_list_01_2018_count'] != 0])

15043

SubjectivityClues_lexicon

In [None]:
lexicon = set(tokenize(lexicons['subjectivityClues_lexicon.txt']))

In [None]:
scoring_features['subjectivityClues_lexicon_count'] = scoring_features['lemmatized_withStopwords'].apply(count_lexicon)

In [None]:
len(scoring_features[scoring_features['subjectivityClues_lexicon_count'] != 0])

15652

In [None]:
scoring_features.to_csv("scoring_features.csv")

POS tags 

In [None]:
scoring_features = pd. read_csv("/content/drive/MyDrive/Trained models/mult_label_dataset/features_with_pos.csv", index_col = 0)

In [None]:
scoring_features.columns

Index(['Ethnicity', 'gender', 'profession', 'religion', 'Anti-stereotype',
       'stereotype', 'unrelated', 'sentence', 'clean_text',
       'lemmatized_withStopwords', 'num_chars', 'num_words', 'avg_word_length',
       'flesch_score', 'subjectivity_score', 'neg', 'neu', 'pos',
       'assertive_verbs_count', 'factive_verbs_count', 'hedges_count',
       'implicative_verbs_count', 'report_verbs_count',
       'bias_word_list_01_2018_count', 'subjectivityClues_lexicon_count',
       'pos_tags', 'lemma_pos', 'Ner_tags'],
      dtype='object')

In [None]:
scoring_features.pos_tags

0        [('many', 'JJ'), ('people', 'NNS'), ('live', '...
1        [('professor', 'NNP'), ('be', 'VB'), ('a', 'DT...
2        [('the', 'DT'), ('schoolgirl', 'NN'), ('be', '...
3        [('will', 'MD'), ('likely', 'RB'), ('fly', 'VB...
4        [('know', 'VBP'), ('many', 'JJ'), ('people', '...
                               ...                        
16539    [('arab', 'JJ'), ('people', 'NNS'), ('be', 'VB...
16540    [('car', 'NN'), ('be', 'VB'), ('a', 'DT'), ('n...
16541    [('the', 'DT'), ('gentleman', 'NNP'), ('stand'...
16542    [('the', 'DT'), ('common', 'JJ'), ('ukrainian'...
16543    [('be', 'VB'), ('a', 'DT'), ('loop', 'NN'), ('...
Name: pos_tags, Length: 16544, dtype: object

In [60]:
import ast

for word, tag in ast.literal_eval(scoring_features.pos_tags[0]):
  print(word , "->", tag)

many -> JJ
people -> NNS
live -> VBP
in -> IN
ethiopia -> NNP
the -> DT
people -> NNS
be -> VB
very -> RB
thin -> JJ
and -> CC
good -> JJ
at -> IN
distance -> NN
run -> NN


In [None]:
for index, tuple in enumerate(scoring_features.pos_tags[0]):
  print(index, tuple)

In [61]:
import ast

def pos_count(text,part_of_speech = 'NNS'):
  pos_type = []
  # Combining lists of lists into single list 
  pos_list = ast.literal_eval(text)
  for word,tag in pos_list :
    if tag == part_of_speech:
      pos_type.append(word)
  return len(pos_type)

In [62]:
scoring_features['NNS_count'] = scoring_features['pos_tags'].apply(pos_count)

## Training

In [None]:
MAX_LEN = 50
RANDOM_SEED = 47

In [None]:
y = df.iloc[:,:-1].values
X = df.iloc[:,-1].values

In [None]:
LABEL_COLUMN = ['Ethnicity',	'gender'	,'profession'	,'religion',	'Anti-stereotype',	'stereotype',	'unrelated']

In [None]:
from sklearn.model_selection import train_test_split

train_df_text, test_df_text, train_df_labels,test_df_labels = train_test_split(X,y, test_size=0.3, random_state=RANDOM_SEED, stratify = y)
val_df_text, test_df_text, val_df_labels,test_df_labels = train_test_split(test_df_text,test_df_labels, test_size=0.5, random_state=RANDOM_SEED,stratify = test_df_labels)

In [None]:
train_df_labels = pd.DataFrame(train_df_labels, columns= LABEL_COLUMN)
val_df_labels = pd.DataFrame(val_df_labels, columns= LABEL_COLUMN)
test_df_labels = pd.DataFrame(test_df_labels, columns= LABEL_COLUMN)
train_df_text = pd.DataFrame(train_df_text, columns = ['sentence'])
val_df_text = pd.DataFrame(val_df_text, columns = ['sentence'])
test_df_text = pd.DataFrame(test_df_text, columns = ['sentence'])

In [None]:
train_df = pd.concat([train_df_text,train_df_labels], axis = 1)
val_df = pd.concat([val_df_text,val_df_labels], axis = 1)
test_df = pd.concat([test_df_text,test_df_labels], axis = 1)