In [1]:
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
import spacy
import string
from nltk.corpus import stopwords
import math
from operator import itemgetter
from collections import Counter
import json
import numpy as np

import pandas as pd
import re
import os

In [2]:
df_fem = pd.read_json("../data/wikipedia_female_sample.json", lines=True)
df_fem.head()

Unnamed: 0,gender,id,name,occupation,overview,wiki-title
0,Q6581072,Q18921280,Aatka Feroz,"['Q4610556', 'Q18581305']","Aatka Ahmed (born October 4, 1994) born as Aat...",Aatka Feroz
1,Q6581072,Q19282420,Abigail Mott,[],"Abigail Lydia Mott Moore (August 6, 1795 – Sep...",Abigail Mott
2,Q6581072,Q4681610,Adela Serra-Ty,['Q82955'],Doña Adela Serra-Ty was a Filipino politician ...,Adela Serra-Ty
3,Q6581072,Q4681744,Adelaide Lucy Fenton,['Q1475726'],Adelaide Lucy Fenton (1824 or 1825 – 6 Februar...,Adelaide Lucy Fenton
4,Q6581072,Q19840348,Adet Lin,"['Q6625963', 'Q333634']","Adet Lin (; May 6, 1923 &ndash; 1971) was a Ch...",Adet Lin


In [3]:
df_male = pd.read_json("../data/wikipedia_male_sample.json", lines=True)
df_male.head()

Unnamed: 0,gender,id,name,occupation,overview,wiki-title
0,Q6581097,Q25259,2nd Dalai Lama,['Q82955'],"Gedun Gyatso, also Gendun Gyatso Palzangpo ( 1...",2nd Dalai Lama
1,Q6581097,Q4647736,A. E. de Silva,[],A. E. de Silva Snr. was a prominent businessma...,A. E. de Silva
2,Q6581097,Q18645699,A. Lee Chandler,['Q16533'],Archie Lee Chandler was an associate justice o...,A. Lee Chandler
3,Q6581097,Q4648236,A. P. Shanmugasundara Goundar,['Q82955'],A. P. Shanmugasundara Goundar is an Indian po...,A. P. Shanmugasundara Goundar
4,Q6581097,Q4648525,A. W. Vidmer,['Q2526255'],A. W. Vidmer is a film director and screenwri...,A. W. Vidmer


## NLP Analysis

In [4]:
# see grammatical categories
nlp = spacy.load('en')
doc = nlp(df_fem.iloc[0].overview)
pos_tags = set()
pos_tags.update([token.pos_ for token in doc])
pos_tags

{'ADJ',
 'ADP',
 'ADV',
 'AUX',
 'CCONJ',
 'DET',
 'NOUN',
 'NUM',
 'PART',
 'PRON',
 'PROPN',
 'PUNCT',
 'SCONJ',
 'VERB'}

In [5]:
# Load subjectivity lexicon
if os.path.exists('../data/subjectivity_dictionary.json'):
    subjectivity_dictionary = {}
    
    with open('../data/subjectivity_dictionary.json', 'r') as json_file:
        for item in eval(json_file.readline()):
            subjectivity_dictionary.update({item['word']: (item['strength'], item['subj'])})
else:
    subjectivity_dictionary = []

    with open("../data/subjectivity_lexicon.tff", "r") as file:
        for line in file:
            elements = line.split(" ")
            word = elements[2][6:]
            subjectivity_dictionary.append({'word': word, 'strength': elements[0][5:], 'subj': elements[5][14:-1]})

    with open('../data/subjectivity_dictionary.json', 'w') as outfile:
        json.dump(subjectivity_dictionary, outfile)
        
    subjectivity_dictionary = {}
    
    with open('../data/subjectivity_dictionary.json', 'r') as json_file:
        for item in eval(json_file.readline()):
            subjectivity_dictionary.update({item['word']: (item['strength'], item['subj'])})

In [6]:
def is_noun(token):
    # check the token is not a stop word
    if not token.is_stop:
        # check the token is a noun (common noun)
        return token.pos_ == 'NOUN'

In [7]:
def is_verb(token):
    # check the token is not a stop word
    if not token.is_stop:
        # check the token is a noun (common noun)
        return token.pos_ == 'VERB'

In [8]:
def is_adjective(token, dictionary):
    # check the token is not a stop word
    if not token.is_stop:
        # check the token is an adjective
        if token.pos_ == 'ADJ':
            # check the lemma of the token if in the dictionary
            return token.lemma_ in dictionary

In [9]:
def get_adjectives(overview):
    # get data to perform nlp analysis
    nlp = spacy.load('en') # english language
    doc = nlp(overview)
    # get lemma of the adjectives that are in the subjectivity lexicon
    adjs = [token.lemma_ for token in doc if is_adjective(token, subjectivity_dictionary)]
    return adjs

In [10]:
try:
    df_fem = pd.read_csv('../data/female_adjectives.csv')
    df_male = pd.read_csv('../data/male_adjectives.csv')
except: # takes a lot of time
    df_fem['adjectives'] = ''
    df_male['adjectives'] = ''

    df_fem['adjectives'] = df_fem.overview.map(lambda x: get_adjectives(x))
    df_male['adjectives'] = df_male.overview.map(lambda x: get_adjectives(x))
    
    df_fem.to_csv('../data/female_adjectives.csv', index=False)
    df_male.to_csv('../data/male_adjectives.csv', index=False)

We filter the dataset so that entries have at least 3 adjectives.

In [11]:
df_fem_filtered = df_fem[df_fem.adjectives.str.len() > 3].copy()
n_fem = len(df_fem_filtered)
n_fem

833

In [12]:
df_male_filtered = df_male[df_male.adjectives.str.len() > 3].copy()
n_male = len(df_male_filtered)
n_male

4492

In [13]:
df_male_filtered_balanced = df_male_filtered.iloc[:n_fem].copy()
len(df_male_filtered_balanced)

833

## Get most common adjectives

Computed with all the entries.

In [14]:
def most_common_words(list_words, n_most_common):
    most_common_count = Counter(list_words).most_common()[:n_most_common]
    most_common = [count[0] for count in most_common_count]
    return most_common

In [15]:
all_adj_fem = [item for sublist in df_fem['adjectives'].values for item in eval(sublist)]
all_adj_male = [item for sublist in df_male['adjectives'].values for item in eval(sublist)]

most_common_adj_fem = most_common_words(all_adj_fem, 100)
most_common_adj_male = most_common_words(all_adj_male, 100)

## Model

In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [17]:
most_common_adj = set()
most_common_adj.update(most_common_adj_male)
most_common_adj.update(most_common_adj_fem)
len(most_common_adj)

124

In [18]:
def encode_input(list_words_present, list_words_to_encode):
    encoding = np.zeros(len(list_words_to_encode))
    for i, adj in enumerate(list_words_to_encode):
        if adj in list_words_present:
            encoding[i] = 1
    return encoding

In [19]:
def encode_output(gender):
    return int(gender == 'Q6581072')

In [20]:
df_fem_filtered['input'] = ""
df_fem_filtered['input'] = df_fem_filtered.adjectives.map(lambda x: encode_input(x, most_common_adj))

df_male_filtered_balanced['input'] = ""
df_male_filtered_balanced['input'] = df_male_filtered_balanced.adjectives.map(lambda x: encode_input(x, most_common_adj))

In [21]:
df_fem_filtered['output'] = ""
df_fem_filtered['output'] = df_fem_filtered.gender.map(lambda x: encode_output(x))

df_male_filtered_balanced['output'] = ""
df_male_filtered_balanced['output'] = df_male_filtered_balanced.gender.map(lambda x: encode_output(x))

Get data to train the model:

In [22]:
X_fem = np.stack(df_fem_filtered.input)
y_fem = np.stack(df_fem_filtered.output)

X_male = np.stack(df_male_filtered_balanced.input)
y_male = np.stack(df_male_filtered_balanced.output)

In [23]:
X = np.concatenate((X_fem, X_male), axis=0)
y = np.concatenate((y_fem, y_male), axis=0)

In [24]:
X.shape

(1666, 124)

In [25]:
y.shape

(1666,)

In [26]:
x_train, x_test, y_train, y_test = train_test_split(X, y, random_state=1)

Model:

In [27]:
lr = LogisticRegression()
# train the model
lr.fit(x_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [28]:
# predict
y_pred = lr.predict(x_test)
y_pred

array([1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1,

In [29]:
# confusion matrix (true - rows, pred - cols)
cm = confusion_matrix(y_test, y_pred)
cm

array([[110, 103],
       [ 82, 122]])

In [30]:
# accuracy
lr.score(x_test, y_test)

0.5563549160671463

In [31]:
# get probabilities
lr.predict_proba(x_test)[0]

array([0.4768401, 0.5231599])

In [32]:
# get coefficients
sort_coef = sorted(abs(lr.coef_[0]), reverse=True)
sort_coef[:10]

[1.6617362860775489,
 1.3234934490414088,
 1.3104107198463313,
 1.2194090896208623,
 1.1995816684296743,
 1.0863782413205696,
 1.0843583354709745,
 1.0699550866846046,
 0.9968136375183463,
 0.9915611611094585]