In [1]:
import pandas as pd

In [8]:
# Load the user node table that already carries the name-based imputed gender.
user_node = pd.read_csv(filepath_or_buffer='User_node_imputed_gender.csv')

In [9]:
# Number of rows in the loaded table (before any de-duplication).
len(user_node.index)

1483

# Differences by gender

**Gender variable**:
- Individuals with missing gender in Wikidata had their gender imputed based on the first name.
    - Mostly_male and mostly_female are included in the respective male and female categories; andy and non-binary are included in the other category.
- Gender variable is then created the following way: if gender is missing in Wikidata, include imputed gender; otherwise, use Wikidata gender.

In [10]:
# Distinct logins per imputed (name-based) gender label, largest group first.
logins_per_imputed_gender = user_node.groupby('imputed_gender')['login'].nunique()
logins_per_imputed_gender.reset_index().sort_values(by='login', ascending=False)

Unnamed: 0,imputed_gender,login
2,male,969
5,unknown,239
1,female,154
4,mostly_male,48
0,andy,47
3,mostly_female,24


In [11]:
# Distinct logins per Wikidata "sex or gender" value, largest group first.
logins_per_wikidata_gender = user_node.groupby('sex or gender')['login'].nunique()
logins_per_wikidata_gender.reset_index().sort_values(by='login', ascending=False)

Unnamed: 0,sex or gender,login
1,male,767
0,female,116
2,non-binary,2


Create the "gender_updated" variable: use the Wikidata gender where it is available, and fall back to the imputed gender where it is missing.

In [12]:
# Rows with no Wikidata gender take the imputed gender instead.
wikidata_missing = user_node['sex or gender'].isna()
user_node.loc[wikidata_missing, 'gender_updated'] = user_node['imputed_gender']

In [13]:
# Rows that do have a Wikidata gender keep that value.
wikidata_present = user_node['sex or gender'].notna()
user_node.loc[wikidata_present, 'gender_updated'] = user_node['sex or gender']

In [14]:
# Distinct logins per combined gender label, largest group first.
logins_per_gender_updated = user_node.groupby('gender_updated')['login'].nunique()
logins_per_gender_updated.reset_index().sort_values(by='login', ascending=False)

Unnamed: 0,gender_updated,login
2,male,1051
1,female,171
6,unknown,170
0,andy,37
4,mostly_male,35
3,mostly_female,15
5,non-binary,2


In [15]:
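# Compare with the reverse priority: prefer the package-imputed gender and use the Wikidata gender only when the imputed value is 'unknown'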

In [16]:
# Where the imputed label is 'unknown', fall back to the Wikidata gender.
imputed_is_unknown = user_node['imputed_gender'].eq('unknown')
user_node.loc[imputed_is_unknown, 'gender_updated_imputed'] = user_node['sex or gender']

In [17]:
# Everywhere else, keep the imputed label as-is.
imputed_not_unknown = user_node['imputed_gender'].ne('unknown')
user_node.loc[imputed_not_unknown, 'gender_updated_imputed'] = user_node['imputed_gender']

In [18]:
# Distinct logins per label under the imputed-first scheme, largest group first.
logins_per_imputed_first = user_node.groupby('gender_updated_imputed')['login'].nunique()
logins_per_imputed_first.reset_index().sort_values(by='login', ascending=False)

Unnamed: 0,gender_updated_imputed,login
2,male,1025
1,female,167
4,mostly_male,48
0,andy,47
3,mostly_female,24


In [19]:
# Fold the "mostly_*" labels into the plain male / female categories.
user_node['gender_updated'] = user_node['gender_updated'].replace(
    {'mostly_male': 'male', 'mostly_female': 'female'}
)

In [20]:
# 'andy' and 'non-binary' are both reported under a single 'other' category.
other_labels = ['andy', 'non-binary']
user_node.loc[user_node['gender_updated'].isin(other_labels), 'gender_updated'] = 'other'

In [21]:
# Summary table: distinct logins in each final gender category, largest first.
logins_by_gender = user_node.groupby('gender_updated')['login'].nunique()
user_gender = logins_by_gender.reset_index().sort_values(by='login', ascending=False)

In [22]:
# Repeat the overall login count on every row so shares can be computed row-wise.
user_gender = user_gender.assign(total=user_gender['login'].sum())

In [23]:
# Share of each category in percent, rounded to two decimals.
share_of_total = user_gender['login'].div(user_gender['total'])
user_gender['percentage'] = share_of_total.mul(100).round(2)

In [25]:
# Shorter column label for the summary table.
user_gender = user_gender.rename({'gender_updated': 'gender'}, axis='columns')

In [26]:
# Display the per-gender summary table.
user_gender

Unnamed: 0,gender,login,total,percentage
1,male,1086,1481,73.33
0,female,186,1481,12.56
3,unknown,170,1481,11.48
2,other,39,1481,2.63


In a 2013 survey of the more than 2000 open source developers who indicated a gender, only 11.2% were women (Arjona-Reina, Robles & Dueñas, 2014).
Source: https://peerj.com/articles/cs-111/

In [29]:
# 'hue' is filled only for male / female rows; other categories are left unset.
for label in ('male', 'female'):
    user_node.loc[user_node['gender_updated'] == label, 'hue'] = label

In [30]:
# Number of rows per login; after reset_index the count sits in a column labelled 0.
rows_per_login = user_node.groupby('login').size()
users = rows_per_login.reset_index()

In [31]:
# Logins that occur on more than one row. Use "> 1" rather than "== 2" so a
# login repeated three or more times is reported as well.
users[users[0] > 1]

Unnamed: 0,login,0
231,arives,2
932,mhpob,2


In [32]:
# Keep a single row per login (the first occurrence wins).
user_node = user_node.drop_duplicates(subset=['login'], keep='first')

In [33]:
# Row count after removing duplicate logins.
len(user_node.index)

1481

In [34]:
# Persist the de-duplicated table with the updated gender columns; the index is not written.
user_node.to_csv(path_or_buf='User_node_gender_updated.csv', index=False)