# Gender Prediction Analysis @Foxintelligence

Create a gender prediction model based on newsletters that a user has suscribed to. It's a supervised learning case study. 

- nl_email : newsletter ID (unique for each newsletter)
- id : user ID
- gender: user gender

## Import Libraries

In [43]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import time

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
mpl.style.use('ggplot')

## Import Modelling Libraries

In [44]:
# Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier

# Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

## Import & Preview Data

In [45]:
data = pd.read_csv("dataset_test.csv")

In [46]:
#data.head(10)
data.tail(10)

Unnamed: 0,nl_email,id,gender
8584524,824773,838442,male
8584525,824929,162416,male
8584526,824996,1081281,male
8584527,825044,464017,male
8584528,825044,566353,male
8584529,825044,1416067,male
8584530,825077,587074,male
8584531,825406,149592,female
8584532,825406,175272,male
8584533,825406,277218,male


In [47]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8584534 entries, 0 to 8584533
Data columns (total 3 columns):
nl_email    int64
id          int64
gender      object
dtypes: int64(2), object(1)
memory usage: 196.5+ MB
None


In [48]:
data.describe(include='all')

Unnamed: 0,nl_email,id,gender
count,8584534.0,8584534.0,8584534
unique,,,2
top,,,male
freq,,,4865002
mean,472310.7,340281.2,
std,248344.4,300996.2,
min,0.0,1.0,
25%,260806.2,106049.0,
50%,503319.5,259826.0,
75%,723356.0,489695.0,


In [49]:
#create a work version of dataset
data_w = data.copy(deep = True)

## Clean Data 

In [50]:
# Look for null values
print(data_w.isnull().sum())

nl_email    0
id          0
gender      0
dtype: int64


##### -->no missing values, data appears to be clean

## Convert Formats

In [51]:
# Code categorical data : gender_code (male is '1' and female is '0'), nl_code and id_code
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
label = LabelEncoder()
data_w['nl_code'] = label.fit_transform(data_w['nl_email'])
data_w['id_code'] = label.fit_transform(data_w['id'])
data_w['gender_code'] = label.fit_transform(data_w['gender'])

# Define target variable and delete categorical column
Target = ['gender_code']
data_bin = 
data_w = data_w.drop(columns = ['gender', 'nl_email', 'id'])

## Feature Reduction

In [52]:
# Group Newsletters by users
users_by_nl = data_w['id_code'].groupby(data_w['nl_code']).count()
users_by_nl = pd.DataFrame(users_by_nl)
users_by_nl.columns = ['nb_users']

# Sort Descending
users_by_nl.sort_values(by='nb_users', ascending=False, inplace=True)

print (users_by_nl.head())
print ('\n')
print ('-'*10)
print ('\n')
print (users_by_nl.tail())
users_by_nl.describe(include='all')

         nb_users
nl_code          
728929      81792
577008      74552
593870      73961
110810      70888
159157      68377


----------


         nb_users
nl_code          
167121          1
453352          1
167123          1
453350          1
825408          1


Unnamed: 0,nb_users
count,825409.0
mean,10.40034
std,299.58956
min,1.0
25%,1.0
50%,1.0
75%,2.0
max,81792.0


##### On average, users subscribe to 10 newsletters. We can confidently newsletters that have less than 10 subcribers since they are not representative of the gender split

In [53]:
# Delete newsletters that only have 1 or 2 subscribers
users_by_nl = users_by_nl[users_by_nl['nb_users'] >= 10]

users_by_nl.describe(include='all')

Unnamed: 0,nb_users
count,63764.0
mean,113.986968
std,1072.474806
min,10.0
25%,14.0
50%,23.0
75%,56.0
max,81792.0


In [54]:
# Now we apply this feature reduction to our dataset data_w
data_w = pd.merge(data_w, users_by_nl, on='nl_code')

print (data_w.head())

   nl_code  id_code  gender_code  nb_users
0    89982    23652            1      3190
1    89982    16503            0      3190
2    89982   104756            0      3190
3    89982   142738            0      3190
4    89982     9101            1      3190


In [63]:
# quick verification to make sure the merge was successful
print ("number of nl_code in new data set:", len(set(data_w.nl_code)))
print ("number of nl_code after dropping the irrelevant ones:", len(set(users_by_nl.index)))

number of nl_code in new data set: 63764
number of nl_code after dropping the irrelevant ones: 63764


## Exploratory Analysis 

In [21]:
# Let's look at correlations by gender
crosstab_nl = pd.crosstab(data_w['nl_code'],data_w[Target[0]])
crosstab_nl.head()

gender_code,0,1
nl_code,Unnamed: 1_level_1,Unnamed: 2_level_1
0,28,5
1,11,17
2,9,3
3,73,69
4,6,24


In [22]:
# Merge data
crosstab = pd.merge(crosstab_nl, users_by_nl, on='nl_code')
crosstab.columns = ['female', 'male', 'nb_users']
print(crosstab)
print('\n')
print("-"*10)
print('\n')
print(crosstab.describe(include='all'))

         female  male  nb_users
nl_code                        
0            28     5        33
1            11    17        28
2             9     3        12
3            73    69       142
4             6    24        30
5             7     8        15
6             5     6        11
7             3    19        22
8             5    13        18
9             8     4        12
10            6    23        29
11            2    16        18
12            5    18        23
13            5    17        22
14            5     7        12
15           14     8        22
16           41    89       130
17           12     7        19
18            1    14        15
19            9    19        28
20           12    12        24
21           13    14        27
22            3     7        10
23           11    20        31
24           12    23        35
25            9     4        13
26            1    11        12
27            3     7        10
28            4    67        71
29      

In [None]:
# Convert to precentage
#start = time.time()
#for col in range(2):
#    for row in range(crosstab_nl.shape[0]):
#        crosstab_nl[col][row] = (crosstab_nl[col][row] / (crosstab_nl[0][row] + crosstab_nl[1][row])) * 100.
#end = time.time()
#print(end - start)

## Modelling 

##### Knowing we are in a supervised learning context and that the variable we are trying to predict is discrete, we can use a classification algorithm. 

In [115]:
t = data_w[Target].values
print (t)
t = t.reshape(1, -1)
print (t)

[[1]
 [0]
 [0]
 ...
 [1]
 [1]
 [0]]
[[1 0 0 ... 1 1 0]]


In [116]:
print (t.shape)
print (data_w['nl_code'].shape)

(1, 7268265)
(7268265,)


In [114]:
# Algorith selection and initialization (from sklearn)
MLA = [
    #Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),
    
    #Gaussian Process
    gaussian_process.GaussianProcessClassifier(),
    
    #GLM
    linear_model.LogisticRegressionCV(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),
    
    #Naive Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),
    
    #SVM
    svm.LinearSVC(),
    
    #Tree
    tree.DecisionTreeClassifier(),

    #xgboost
    XGBClassifier()
    ]

# run model 10x with 60/30 split intentionally leaving out 10%
cv_split = model_selection.ShuffleSplit(n_splits = 10, \
            test_size = .3, train_size = .6, random_state = 0)
#print (cv_split)

# create table to compare models
MLA_columns = ['MLA Name', 'MLA Parameters','MLA Train Accuracy Mean', \
               'MLA Test Accuracy Mean', 'MLA Test Accuracy 3*STD' ,'MLA Time']
MLA_compare = pd.DataFrame(columns = MLA_columns)

# create table to compare predictions 
MLA_predict = data_w[Target]

# index through models and save performance to table
row_index = 0
for alg in MLA:
    #give names and set parameters
    MLA_name = alg.__class__.__name__
    MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
    MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
    
    #score models with cross validation
    target_reshaped = data_w[Target].values
    target_reshaped = target_reshaped.reshape(1, -1)
    cv_results = model_selection.cross_validate(alg, data_w['id_code'], \
            target_reshaped, cv = cv_split)
    
    MLA_compare.loc[row_index, 'MLA Time'] = cv_results['fit_time'].mean()
    MLA_compare.loc[row_index, 'MLA Train Accuracy Mean'] = cv_results['train_score'].mean()
    MLA_compare.loc[row_index, 'MLA Test Accuracy Mean'] = cv_results['test_score'].mean()
    #if this is a non-bias random sample, then +/-3 standard deviations (std) from the mean, 
    #should statistically capture 99.7% of the subsets
    MLA_compare.loc[row_index, 'MLA Test Accuracy 3*STD'] = cv_results['test_score'].std()*3
    
    #save MLA predictions 
    alg.fit(data_w['nl_code'], data_w[Target])
    MLA_predict[MLA_name] = alg.predict(data_w['nl_code'])
    
    row_index+=1
    
# print and sort table:
MLA_compare.sort_values(by = ['MLA Test Accuracy Mean'], ascending = False, inplace = True)
MLA_compare

ValueError: Found input variables with inconsistent numbers of samples: [7268265, 1]

In [None]:
data_w['nl_code'].shape

In [None]:
data_w['gender_code'].shape