# Gender Prediction Analysis @Foxintelligence

Create a gender prediction model based on newsletters that a user has suscribed to. It's a supervised learning case study. 

- nl_email : newsletter ID (unique for each newsletter)
- id : user ID
- gender: user gender

## Import Libraries

In [48]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn import model_selection
from sklearn.model_selection import train_test_split
import time

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
mpl.style.use('ggplot')

## Import Modelling Libraries

In [30]:
# Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier

# Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

## Import & Preview Data

In [31]:
data = pd.read_csv("dataset_test.csv")

In [32]:
#data.head(10)
data.tail(10)

Unnamed: 0,nl_email,id,gender
8584524,824773,838442,male
8584525,824929,162416,male
8584526,824996,1081281,male
8584527,825044,464017,male
8584528,825044,566353,male
8584529,825044,1416067,male
8584530,825077,587074,male
8584531,825406,149592,female
8584532,825406,175272,male
8584533,825406,277218,male


In [33]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8584534 entries, 0 to 8584533
Data columns (total 3 columns):
nl_email    int64
id          int64
gender      object
dtypes: int64(2), object(1)
memory usage: 196.5+ MB
None


In [34]:
data.describe(include='all')

Unnamed: 0,nl_email,id,gender
count,8584534.0,8584534.0,8584534
unique,,,2
top,,,male
freq,,,4865002
mean,472310.7,340281.2,
std,248344.4,300996.2,
min,0.0,1.0,
25%,260806.2,106049.0,
50%,503319.5,259826.0,
75%,723356.0,489695.0,


In [35]:
#create a work version of dataset
data_w = data.copy(deep = True)

In [36]:
# Look for null values
print(data_w.isnull().sum())

nl_email    0
id          0
gender      0
dtype: int64


##### -->no missing values, data appears to be clean

## Convert Formats

In [37]:
# Code categorical data : gender_code (male is '1' and female is '0'), nl_code and id_code
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
label = LabelEncoder()
data_w['nl_code'] = label.fit_transform(data_w['nl_email'])
data_w['id_code'] = label.fit_transform(data_w['id'])
data_w['gender_code'] = label.fit_transform(data_w['gender'])

In [38]:
# Define target variable and delete categorical column
Target = ['gender_code']
data_w = data_w.drop(columns = ['gender', 'nl_email', 'id'])

In [39]:
# Define X and Y variables for later split
# reminder : always convert the target with data.target or else the shape won't be adequate
X = data_w.drop(columns = ['gender_code'])
Y = data_w.gender_code

print('X shape =', X.shape)
print('Y shape =', Y.shape, '\n')
print('-' * 10)
print ('\n')
print ('X: \n', X.head(), '\n')
print('Y:\n', Y.head())

X shape = (8584534, 2)
Y shape = (8584534,) 

----------


X: 
    nl_code  id_code
0        0    23652
1    62788    23652
2    89982    23652
3   159185    23652
4   176642    23652 

Y:
 0    1
1    1
2    1
3    1
4    1
Name: gender_code, dtype: int64


## Split Training and Testing Data

In [40]:
X_train, X_test, y_train, y_test = \
train_test_split(X, Y, test_size=0.1)

## Feature Reduction

In [41]:
# Group Newsletters by users
users_by_nl = data_w['id_code'].groupby(data_w['nl_code']).count()
users_by_nl = pd.DataFrame(users_by_nl)
users_by_nl.columns = ['nb_users']

# Sort Descending
users_by_nl.sort_values(by='nb_users', ascending=False, inplace=True)

print (users_by_nl.head())
print ('\n')
print ('-'*10)
print ('\n')
print (users_by_nl.tail())
users_by_nl.describe(include='all')

         nb_users
nl_code          
728929      81792
577008      74552
593870      73961
110810      70888
159157      68377


----------


         nb_users
nl_code          
167121          1
453352          1
167123          1
453350          1
825408          1


Unnamed: 0,nb_users
count,825409.0
mean,10.40034
std,299.58956
min,1.0
25%,1.0
50%,1.0
75%,2.0
max,81792.0


##### On average, users subscribe to 10 newsletters. We can remove newsletters that have less than 10 subcribers since they are not representative of the gender split

In [42]:
# Delete newsletters that only have 1 or 2 subscribers
users_by_nl = users_by_nl[users_by_nl['nb_users'] >= 10]

users_by_nl.describe(include='all')

Unnamed: 0,nb_users
count,63764.0
mean,113.986968
std,1072.474806
min,10.0
25%,14.0
50%,23.0
75%,56.0
max,81792.0


In [43]:
# Now we apply this feature reduction to our dataset data_w
data_w = pd.merge(data_w, users_by_nl, on='nl_code')

print (data_w.head())

   nl_code  id_code  gender_code  nb_users
0    89982    23652            1      3190
1    89982    16503            0      3190
2    89982   104756            0      3190
3    89982   142738            0      3190
4    89982     9101            1      3190


In [44]:
# quick verification to make sure the merge was successful
print ("number of nl_code in new data set:", len(set(data_w.nl_code)))
print ("number of nl_code after dropping the irrelevant ones:", len(set(users_by_nl.index)))

number of nl_code in new data set: 63764
number of nl_code after dropping the irrelevant ones: 63764


## Exploratory Analysis 

In [45]:
# Let's look at correlations by gender
crosstab_nl = pd.crosstab(data_w['nl_code'],data_w[Target[0]])
crosstab_nl.head()

gender_code,0,1
nl_code,Unnamed: 1_level_1,Unnamed: 2_level_1
1,28,5
4,11,17
5,9,3
13,73,69
15,6,24


In [46]:
# Merge data
crosstab = pd.merge(crosstab_nl, users_by_nl, on='nl_code')
crosstab.columns = ['female', 'male', 'nb_users']
print(crosstab)
print('\n')
print("-"*10)
print('\n')
print(crosstab.describe(include='all'))

         female  male  nb_users
nl_code                        
1            28     5        33
4            11    17        28
5             9     3        12
13           73    69       142
15            6    24        30
18            7     8        15
43            5     6        11
49            3    19        22
50            5    13        18
73            8     4        12
77            6    23        29
86            2    16        18
93            5    18        23
105           5    17        22
108           5     7        12
111          14     8        22
121          41    89       130
125          12     7        19
155           1    14        15
186           9    19        28
198          12    12        24
204          13    14        27
231           3     7        10
237          11    20        31
240          12    23        35
252           9     4        13
268           1    11        12
294           3     7        10
300           4    67        71
302     

In [47]:
# Convert to precentage
#start = time.time()
#for col in range(2):
#    for row in range(crosstab_nl.shape[0]):
#        crosstab_nl[col][row] = (crosstab_nl[col][row] / (crosstab_nl[0][row] + crosstab_nl[1][row])) * 100.
#end = time.time()
#print(end - start)

## Modelling 

##### Knowing we are in a supervised learning context and that the variable we are trying to predict is discrete, we can use a classification algorithm. 

###### --> Logistic Regression

In [50]:
#apply Logistic Regression
# values.ravel: https://stackoverflow.com/questions/34165731/a-column-vector-y-was-passed-when-a-1d-array-was-expected
model = linear_model.LogisticRegression()
model.fit(X_train, y_train.values.ravel())



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [51]:
y_predicted = model.predict(X_test)

In [52]:
print(y_predicted)
np.unique(y_predicted)

[1 1 1 ... 1 1 1]


array([1])

In [53]:
model.predict_proba(X_test)

array([[0.46134239, 0.53865761],
       [0.41110786, 0.58889214],
       [0.4611337 , 0.5388663 ],
       ...,
       [0.45598247, 0.54401753],
       [0.48929777, 0.51070223],
       [0.46861618, 0.53138382]])

In [54]:
model.score(X_test,y_test)

0.5671055175932549

##### Logistic Regression always predicts male for gender and scores 57%. Let's find out if there is a better model.

###### --> Random Forest

In [55]:
#apply Random Forest
model = ensemble.RandomForestClassifier()
model.fit(X_train, y_train.values.ravel())



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [26]:
y_predicted = model.predict(X_test)
print(y_predicted)
print('unique values = ', np.unique(y_predicted))
model.predict_proba(X_test)

[1 0 0 ... 0 1 1]


array([[0.2, 0.8],
       [0.7, 0.3],
       [1. , 0. ],
       ...,
       [0.7, 0.3],
       [0.2, 0.8],
       [0. , 1. ]])

In [56]:
model.score(X_test,y_test)

0.7157087042520625

###### --> Naive Bayes

In [57]:
#apply Naive Bayes
model = naive_bayes.GaussianNB()
model.fit(X_train, y_train.values.ravel())

GaussianNB(priors=None, var_smoothing=1e-09)

In [60]:
y_predicted = model.predict(X_test)
print(y_predicted)
print ('unique values = ', np.unique(y_predicted))
model.predict_proba(X_test)

[1 1 1 ... 1 1 1]
[1]


array([[0.43772649, 0.56227351],
       [0.45192186, 0.54807814],
       [0.40078035, 0.59921965],
       ...,
       [0.45260204, 0.54739796],
       [0.41992624, 0.58007376],
       [0.44612696, 0.55387304]])

In [59]:
model.score(X_test,y_test)

0.5671055175932549