# Gender Prediction Analysis @Foxintelligence

Create a gender prediction model based on newsletters that a user has suscribed to. It's a supervised learning case study. 

- nl_email : newsletter ID (unique for each newsletter)
- id : user ID
- gender: user gender

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import time

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
mpl.style.use('ggplot')

## Import Modelling Libraries

In [2]:
# Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier

# Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

## Import & Preview Data

In [3]:
data = pd.read_csv("dataset_test.csv")

In [4]:
#data.head(10)
data.tail(10)

Unnamed: 0,nl_email,id,gender
8584524,824773,838442,male
8584525,824929,162416,male
8584526,824996,1081281,male
8584527,825044,464017,male
8584528,825044,566353,male
8584529,825044,1416067,male
8584530,825077,587074,male
8584531,825406,149592,female
8584532,825406,175272,male
8584533,825406,277218,male


In [5]:
print(data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8584534 entries, 0 to 8584533
Data columns (total 3 columns):
nl_email    int64
id          int64
gender      object
dtypes: int64(2), object(1)
memory usage: 196.5+ MB
None


In [6]:
data.describe(include='all')

Unnamed: 0,nl_email,id,gender
count,8584534.0,8584534.0,8584534
unique,,,2
top,,,male
freq,,,4865002
mean,472310.7,340281.2,
std,248344.4,300996.2,
min,0.0,1.0,
25%,260806.2,106049.0,
50%,503319.5,259826.0,
75%,723356.0,489695.0,


In [7]:
#create a work version of dataset
data_w = data.copy(deep = True)

## Clean Data 

In [8]:
# Look for null values
print(data_w.isnull().sum())

nl_email    0
id          0
gender      0
dtype: int64


##### -->no missing values, data appears to be clean

## Convert Formats

In [9]:
# Code categorical data : gender_code (male is '1' and female is '0'), nl_code and id_code
# https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
label = LabelEncoder()
data_w['nl_code'] = label.fit_transform(data_w['nl_email'])
data_w['id_code'] = label.fit_transform(data_w['id'])
data_w['gender_code'] = label.fit_transform(data_w['gender'])

# Define target variable and delete categorical column
Target = ['gender_code']
data_w.drop(columns = ['gender', 'nl_email', 'id'])

Unnamed: 0,nl_code,id_code,gender_code
0,0,23652,1
1,62788,23652,1
2,89982,23652,1
3,159185,23652,1
4,176642,23652,1
5,178646,23652,1
6,29073,23652,1
7,235088,23652,1
8,237356,23652,1
9,251901,23652,1


## Feature Reduction

In [10]:
# Group Newsletters by users
users_by_nl = data_w['id_code'].groupby(data_w['nl_code']).count()
users_by_nl = pd.DataFrame(users_by_nl)
users_by_nl.columns = ['nb_users']

# Sort Descending
users_by_nl.sort_values(by='nb_users', ascending=False, inplace=True)

print (users_by_nl.head())
print ('\n')
print ('-'*10)
print ('\n')
print (users_by_nl.tail())
users_by_nl.describe(include='all')

         nb_users
nl_code          
728929      81792
577008      74552
593870      73961
110810      70888
159157      68377


----------


         nb_users
nl_code          
167121          1
453352          1
167123          1
453350          1
825408          1


Unnamed: 0,nb_users
count,825409.0
mean,10.40034
std,299.58956
min,1.0
25%,1.0
50%,1.0
75%,2.0
max,81792.0


##### On average, users subscribe to 10 newsletters. We can confidently newsletters that have less than 10 subcribers since they are not representative of the gender split

In [11]:
data_w

Unnamed: 0,nl_email,id,gender,nl_code,id_code,gender_code
0,0,63150,male,0,23652,1
1,62788,63150,male,62788,23652,1
2,89982,63150,male,89982,23652,1
3,159185,63150,male,159185,23652,1
4,176642,63150,male,176642,23652,1
5,178646,63150,male,178646,23652,1
6,29073,63150,male,29073,23652,1
7,235088,63150,male,235088,23652,1
8,237356,63150,male,237356,23652,1
9,251901,63150,male,251901,23652,1


In [12]:
# Delete newsletters that only have 1 or 2 subscribers
users_by_nl = users_by_nl[users_by_nl['nb_users'] >= 10]

users_by_nl.describe(include='all')

Unnamed: 0,nb_users
count,63764.0
mean,113.986968
std,1072.474806
min,10.0
25%,14.0
50%,23.0
75%,56.0
max,81792.0


In [None]:
# Now we apply this feature reduction to our dataset data_w
def drop_rows(df, col_name, index):
    """function that drops rows if the value in col is not in index
    df - DataFrame to modify
    col_name - name of the column where we look for the value
    index - list of indexes to which we comapre our value"""
    j = 1
    for i in df[col_name]:
        if i not in index:
            df = df.drop(j, axis=0)
        j += 1

drop_rows(data_w, "nl_code", users_by_nl.index)

## Exploratory Analysis 

In [None]:
# Let's look at correlations by gender
crosstab_nl = pd.crosstab(data_w['nl_code'],data_w[Target[0]])
crosstab_nl.head()

In [None]:
# Merge data
crosstab = pd.merge(crosstab_nl, users_by_nl, on='nl_code')
crosstab.columns = ['female', 'male', 'nb_users']
print(crosstab)
print('\n')
print("-"*10)
print('\n')
print(crosstab.describe(include='all'))

In [None]:
# Convert to precentage
#start = time.time()
#for col in range(2):
#    for row in range(crosstab_nl.shape[0]):
#        crosstab_nl[col][row] = (crosstab_nl[col][row] / (crosstab_nl[0][row] + crosstab_nl[1][row])) * 100.
#end = time.time()
#print(end - start)

In [None]:
#correlation heatmap
#def correlation_heatmap(df):
#    """ df.corr Compute pairwise correlation of columns,
#    excluding NA/null values."""
#    _ , ax = plt.subplots(figsize=(14, 12))
#    colormap = sns.diverging_palette(220, 10, as_cmap=True)
    
#    _ = sns.heatmap(
#        df.corr(),
#        cmap = colormap,
#        square = True,
#        cbar_kws = {'shrink':.9},
#        ax = ax,
#        annot = True, 
#        linewidths = 0.1, vmax = 1.0, linecolor = 'white',
#        annot_kws = {'fontsize':12}
#    )
    
#    plt.title =('Pearson Correlation of features')
    
#correlation_heatmap(data_cleaned)

##### / ! \ How to interpret this ? Normally, the closest the correlation is to |1|, the most significant the correlation is. However it seems here that some of the data was misinterpreted due to encoding ...  

## Modelling 

##### Knowing we are in a supervised learning context and that the variable we are trying to predict is discrete, we can use a classification algorithm. 

In [None]:
# Algorith selection and initialization (from sklearn)
MLA = [
    #Ensemble Methods
    ensemble.AdaBoostClassifier(),
    ensemble.BaggingClassifier(),
    ensemble.ExtraTreesClassifier(),
    ensemble.GradientBoostingClassifier(),
    ensemble.RandomForestClassifier(),
    
    #Gaussian Process
    gaussian_process.GaussianProcessClassifier(),
    
    #GLM
    linear_model.LogisticRegressionCV(),
    linear_model.RidgeClassifierCV(),
    linear_model.SGDClassifier(),
    linear_model.Perceptron(),
    
    #Naive Bayes
    naive_bayes.BernoulliNB(),
    naive_bayes.GaussianNB(),
    
    #SVM
    svm.LinearSVC(),
    
    #Tree
    tree.DecisionTreeClassifier(),

    #xgboost
    XGBClassifier()
    ]

# run model 10x with 60/30 split intentionally leaving out 10%
cv_split = model_selection.ShuffleSplit(n_splits = 10, \
            test_size = .3, train_size = .6, random_state = 0)
#print (cv_split)

# create table to compare models
MLA_columns = ['MLA Name', 'MLA Parameters','MLA Train Accuracy Mean', \
               'MLA Test Accuracy Mean', 'MLA Test Accuracy 3*STD' ,'MLA Time']
MLA_compare = pd.DataFrame(columns = MLA_columns)

# create table to compare predictions 
MLA_predict = data_w[Target]

# index through models and save performance to table
row_index = 0
for alg in MLA:
    #give names and set parameters
    MLA_name = alg.__class__.__name__
    MLA_compare.loc[row_index, 'MLA Name'] = MLA_name
    MLA_compare.loc[row_index, 'MLA Parameters'] = str(alg.get_params())
    
    #score models with cross validation
    cv_results = model_selection.cross_validate(alg, data_w['nl_code'], \
            data_w['gender_code'], cv = cv_split)
    
    MLA_compare.loc[row_index, 'MLA Time'] = cv_results['fit_time'].mean()
    MLA_compare.loc[row_index, 'MLA Train Accuracy Mean'] = cv_results['train_score'].mean()
    MLA_compare.loc[row_index, 'MLA Test Accuracy Mean'] = cv_results['test_score'].mean()
    #if this is a non-bias random sample, then +/-3 standard deviations (std) from the mean, 
    #should statistically capture 99.7% of the subsets
    MLA_compare.loc[row_index, 'MLA Test Accuracy 3*STD'] = cv_results['test_score'].std()*3
    
    #save MLA predictions 
    alg.fit(data_w['nl_code'], data_w[Target])
    MLA_predict[MLA_name] = alg.predict(data_w['nl_code'])
    
    row_index+=1
    
# print and sort table:
MLA_compare.sort_values(by = ['MLA Test Accuracy Mean'], ascending = False, inplace = True)
MLA_compare

    

In [None]:
data_w['nl_code'].shape

In [None]:
data_w['gender_code'].shape