In [443]:
#Importing all libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import os
from itertools import combinations
from scipy.stats import chi2_contingency
import category_encoders as ce
from sklearn.preprocessing import StandardScaler


In [382]:
#Importing dataset cleaned and filtered from EDA
profiles = pd.read_csv('../data/profiles_eda.csv', index_col=False)
profiles = profiles.loc[:, ~profiles.columns.str.contains('^Unnamed')]
profiles.head()

Unnamed: 0,age,body_type,diet,drinks,drugs,education,ethnicity,height,job,offspring,orientation,religion,sex,sign,smokes,status,dogs,cats
0,22,larger,anything,a little,never,college,asian,75.0,service,no kids,straight,agnosticism,m,gemini,yes,single,likes dogs,likes cats
1,35,average,other,a lot,sometimes,other,white,70.0,service,no kids,straight,agnosticism,m,cancer,no,single,likes dogs,likes cats
2,38,thin,anything,a little,unknown,masters,unknown,68.0,unknown,unknown,straight,unknown,m,pisces,no,available,no dogs,has cats
3,23,thin,vegetarian,a little,unknown,college,white,71.0,student,unsure,straight,unknown,m,pisces,no,single,no dogs,likes cats
4,29,fit,unknown,a little,never,college,asian,66.0,creative,unknown,straight,unknown,m,aquarius,no,single,likes dogs,likes cats


### 01.- Encoding categorical variables

#### Encoding 'body_type'

In [383]:
#checking unique variables
print(profiles['body_type'].value_counts())

body_type
fit        24951
average    14652
thin        6488
unknown     5293
curvy       4933
larger      3073
other        553
Name: count, dtype: int64


Ordinal encoding seems to be the best option for this variable (degree of fitness). the value 'unknown' introduces a type of body that does not follow any order among the body types. Imputation with the mode would introduce bias to the variable, so best option is to impute by assuming 'unknown' = 'average'

In [384]:
profiles['body_type'] = profiles['body_type'].replace('unknown', 'average')
profiles['body_type'] = profiles['body_type'].replace('other', 'average')
body_type_mapping = {
    'thin': 0,
    'average': 1,
    'fit': 2,
    'curvy': 3,
    'larger': 4
}

profiles['body_type'] = profiles['body_type'].map(body_type_mapping)

In [385]:
print(profiles['body_type'].value_counts())

body_type
2    24951
1    20498
0     6488
3     4933
4     3073
Name: count, dtype: int64


#### Encoding 'diet'

In [386]:
#checking unique variables
print(profiles['diet'].value_counts())

diet
anything      27881
unknown       24392
vegetarian     4986
other          1790
vegan           702
kosher          115
halal            77
Name: count, dtype: int64


Imputation of the value 'unknown' with the mode assuming people give a lot of importance to saying their diet in very specific cases

In [387]:
profiles['diet'] = profiles['diet'].replace('unknown', 'anything')

In [388]:
#One-Hot encoding for the value 'diet'
profiles = pd.get_dummies(profiles, columns=['diet'])

#### Encoding 'drinks'

In [389]:
#checking unique variables
print(profiles['drinks'].value_counts())

drinks
a little      47737
a lot          5957
not at all     3267
unknown        2982
Name: count, dtype: int64


In [390]:
#Imputation of the value 'unknown' with the mode 'a little'
profiles['drinks'] = profiles['drinks'].replace('unknown', 'a little')

In [391]:
drinks_mapping = {
    'not at all': 0,
    'a little': 1,
    'a lot': 2    
}

profiles['drinks'] = profiles['drinks'].map(drinks_mapping)

Encoding 'drugs'

In [392]:
#checking unique values with total counts
count_drg_unk = profiles['drugs'].value_counts(0)
print(count_drg_unk)
#checking unique values with proportions
print(profiles['drugs'].value_counts(1))

drugs
never        37723
unknown      14078
sometimes     7732
often          410
Name: count, dtype: int64
drugs
never        0.629315
unknown      0.234856
sometimes    0.128989
often        0.006840
Name: proportion, dtype: float64


In [393]:
#checking unique variables
proportions_drugs = profiles.loc[profiles['drugs'] != 'unknown', 'drugs'].value_counts(normalize=True)
print(proportions_drugs)

drugs
never        0.822479
sometimes    0.168582
often        0.008939
Name: proportion, dtype: float64


Imputation will assign to 'unknown' three different values: 'never', 'sometimes', 'often' in the same proportions they repeat along the dataset. </br>
That is, for all 14078 values the following substitutions: </br>

In [394]:
print(f"'never': {int(round(count_drg_unk.iloc[1] * proportions_drugs.iloc[0],0))} values")
print(f"'sometimes': {int(round(count_drg_unk.iloc[1] * proportions_drugs.iloc[1],0))} values")
print(f"'often': {int(round(count_drg_unk.iloc[1] * proportions_drugs.iloc[2],0))} values")


'never': 11579 values
'sometimes': 2373 values
'often': 126 values


In [395]:
# Find the indices where drugs == 'unknown'
unknown_idx = profiles[profiles['drugs'] == 'unknown'].index

# Number of unknowns
n_unknown = len(unknown_idx)

# Calculate how many to assign each new value
n_never = int(round(proportions_drugs.iloc[0],0) * n_unknown)
n_sometimes = int(round(proportions_drugs.iloc[1],0) * n_unknown)
n_often = int((round(proportions_drugs.iloc[2],0) * n_unknown)+2)

# Shuffle indices for random assignment
shuffled_idx = np.random.permutation(unknown_idx)

# Assign 'never'
profiles.loc[shuffled_idx[:n_never], 'drugs'] = 'never'

# Assign 'sometimes'
profiles.loc[shuffled_idx[n_never:n_never+n_sometimes], 'drugs'] = 'sometimes'

# Assign 'often'
profiles.loc[shuffled_idx[n_never+n_sometimes:n_never+n_sometimes+n_often], 'drugs'] = 'often'

In [396]:
profiles['drugs'].value_counts()

drugs
never        51801
sometimes     7732
often          410
Name: count, dtype: int64

In [397]:
#ordinal encoding for 'drugs'
drugs_mapping = {
    'never': 0,
    'sometimes': 1,
    'often': 2    
}

profiles['drugs'] = profiles['drugs'].map(drugs_mapping)

Encoding 'education'

In [398]:
print(profiles['education'].value_counts())

education
college             31467
masters             10920
unknown              6625
two_year_college     3018
phd                  2408
high_school          1713
other                1683
law                  1428
med                   681
Name: count, dtype: int64


In [399]:
count_educ = profiles['education'].value_counts()
print(count_educ)

print(profiles['education'].value_counts(1))

education
college             31467
masters             10920
unknown              6625
two_year_college     3018
phd                  2408
high_school          1713
other                1683
law                  1428
med                   681
Name: count, dtype: int64
education
college             0.524949
masters             0.182173
unknown             0.110522
two_year_college    0.050348
phd                 0.040171
high_school         0.028577
other               0.028077
law                 0.023823
med                 0.011361
Name: proportion, dtype: float64


In [400]:
proportions_educ = (profiles.loc[profiles['education'] != 'unknown', 'education']).value_counts(1)
print(proportions_educ)

education
college             0.590176
masters             0.204809
two_year_college    0.056604
phd                 0.045163
high_school         0.032128
other               0.031565
law                 0.026783
med                 0.012772
Name: proportion, dtype: float64


In [401]:
proportions_educ.index[0]

'college'

Defining a function for imputation of 'unknown' value according to the distribution of the rest of the values

In [402]:
def prop_imputer(variable):
    proportions = (profiles.loc[profiles[variable] != 'unknown', variable]).value_counts(1)
    # Find the indices where variable == 'unknown'
    unknown_idx = profiles[profiles[variable] == 'unknown'].index
    # Shuffle indices for random assignment
    shuffled_idx = np.random.permutation(unknown_idx)
    # Number of unknowns
    n_unknown = len(unknown_idx)
    # Calculate how many to assign each new value
    n_assign = []
    for props in range(len(proportions)):
        n_assign.append(int(round(proportions.iloc[props] * n_unknown, 0)))
        start = 0
        for i in range(len(n_assign)):
            # Assign value
            profiles.loc[shuffled_idx[start:start + n_assign[i]], variable] = proportions.index[i]
            start += n_assign[i]
    return print(f"new values: {profiles[variable].value_counts()}")
    

In [403]:
prop_imputer(variable = 'education')

new values: education
college             35377
masters             12277
two_year_college     3393
phd                  2707
high_school          1926
other                1892
law                  1605
med                   766
Name: count, dtype: int64


Ordinal encoding for the variable 'education'

In [404]:
#mapping the variable 'education'
education_mapping = {
    'other': 0,
    'high_school': 1,
    'two_year_college': 2,
    'college': 3,
    'masters': 4,
    'phd': 5,
    'law': 6,
    'med': 7
}

profiles['education'] = profiles['education'].map(education_mapping)
profiles['education'].value_counts()

education
3    35377
4    12277
2     3393
5     2707
1     1926
0     1892
6     1605
7      766
Name: count, dtype: int64

Encoding 'ethnicity'

In [405]:
#defining a function for reusable code
def pre_encode_cat(variable): #this function prints the preliminar proportions and counts before encoding
    count = profiles[variable].value_counts()
    print(f"Value counts for {count}")
    print(f"Proportions for {profiles[variable].value_counts(1)}")
    proportions = (profiles.loc[profiles[variable] != 'unknown', variable]).value_counts(1)
    print(f"Proportions without the value 'unknown' {proportions}")

In [406]:
#counts and proportions for variable 'ethnicity'
pre_encode_cat('ethnicity')


Value counts for ethnicity
white       33472
asian        8205
unknown      5679
hispanic     4378
black        3071
other        1705
indian       1196
middle        811
pacific       717
native        709
Name: count, dtype: int64
Proportions for ethnicity
white       0.558397
asian       0.136880
unknown     0.094740
hispanic    0.073036
black       0.051232
other       0.028444
indian      0.019952
middle      0.013530
pacific     0.011961
native      0.011828
Name: proportion, dtype: float64
Proportions without the value 'unknown' ethnicity
white       0.616836
asian       0.151205
hispanic    0.080680
black       0.056594
other       0.031420
indian      0.022040
middle      0.014945
pacific     0.013213
native      0.013066
Name: proportion, dtype: float64


In [407]:
#imputation of the variable 'ethnicity' using the function prop_imputer
prop_imputer('ethnicity')


new values: ethnicity
white       36975
asian        9064
hispanic     4836
black        3392
other        1883
indian       1321
middle        896
pacific       792
native        783
unknown         1
Name: count, dtype: int64


In [408]:
#forcing the remaining unknown value to be equal to the mode (to be fixed later)
profiles['ethnicity'] = profiles['ethnicity'].replace('unknown', 'white')
print(profiles['ethnicity'].value_counts())

ethnicity
white       36976
asian        9064
hispanic     4836
black        3392
other        1883
indian       1321
middle        896
pacific       792
native        783
Name: count, dtype: int64


In [409]:
#One-Hot encoding for the value 'diet'
profiles = pd.get_dummies(profiles, columns=['ethnicity'])

Encoding the variable 'job'

In [410]:
pre_encode_cat('job')

Value counts for job
unknown        16221
stem            9557
business        9030
creative        6688
student         4882
health          3680
service         3556
education       3513
law             1381
government       912
not_working      523
Name: count, dtype: int64
Proportions for job
unknown        0.270607
stem           0.159435
business       0.150643
creative       0.111573
student        0.081444
health         0.061392
service        0.059323
education      0.058606
law            0.023039
government     0.015214
not_working    0.008725
Name: proportion, dtype: float64
Proportions without the value 'unknown' job
stem           0.218586
business       0.206532
creative       0.152966
student        0.111660
health         0.084168
service        0.081332
education      0.080349
law            0.031586
government     0.020859
not_working    0.011962
Name: proportion, dtype: float64


One-hot encoding for tha variable 'job' without imputation of unknown variables, considering the amount of missingness, 'unknown' will have its own category

In [411]:
profiles = pd.get_dummies(profiles, columns=['job'])

Encoding the variable 'offspring'

In [412]:
pre_encode_cat('offspring')

Value counts for offspring
unknown     35559
no kids     16131
has kids     4919
unsure       3334
Name: count, dtype: int64
Proportions for offspring
unknown     0.593214
no kids     0.269106
has kids    0.082061
unsure      0.055620
Name: proportion, dtype: float64
Proportions without the value 'unknown' offspring
no kids     0.661540
has kids    0.201731
unsure      0.136729
Name: proportion, dtype: float64


Merging 'unknown' and 'unsure' as 'unknown', then One-Hot encoding, so 'unknown' will have its own category

In [413]:
profiles['offspring'] = profiles['offspring'].replace('unsure', 'unknown')
profiles = pd.get_dummies(profiles, columns=['offspring'])

One-Hot Encoding 'orientation' since it has no 'unknown' values

In [414]:
profiles = pd.get_dummies(profiles, columns=['orientation'])

In [415]:
pre_encode_cat('religion')

Value counts for religion
unknown         20223
agnosticism      8812
other            7743
atheism          6985
christianity     5787
catholicism      4758
judaism          3098
buddhism         1948
hinduism          450
islam             139
Name: count, dtype: int64
Proportions for religion
unknown         0.337371
agnosticism     0.147006
other           0.129173
atheism         0.116527
christianity    0.096542
catholicism     0.079375
judaism         0.051682
buddhism        0.032498
hinduism        0.007507
islam           0.002319
Name: proportion, dtype: float64
Proportions without the value 'unknown' religion
agnosticism     0.221853
other           0.194940
atheism         0.175856
christianity    0.145695
catholicism     0.119789
judaism         0.077996
buddhism        0.049043
hinduism        0.011329
islam           0.003499
Name: proportion, dtype: float64


In [416]:
#Merging 'other' and 'unknown'
profiles['religion'] = profiles['religion'].replace('other', 'unknown')

In [417]:
#One-hot encoding religion
profiles = pd.get_dummies(profiles, columns=['religion'])

Encoding 'sign' with One-Hot

In [418]:
pre_encode_cat('sign')

Value counts for sign
unknown        11054
leo             4374
gemini          4310
libra           4207
cancer          4206
virgo           4141
taurus          4140
scorpio         4134
aries           3988
pisces          3946
sagittarius     3942
aquarius        3928
capricorn       3573
Name: count, dtype: int64
Proportions for sign
unknown        0.184409
leo            0.072969
gemini         0.071902
libra          0.070183
cancer         0.070167
virgo          0.069082
taurus         0.069066
scorpio        0.068966
aries          0.066530
pisces         0.065829
sagittarius    0.065762
aquarius       0.065529
capricorn      0.059607
Name: proportion, dtype: float64
Proportions without the value 'unknown' sign
leo            0.089468
gemini         0.088159
libra          0.086052
cancer         0.086032
virgo          0.084702
taurus         0.084682
scorpio        0.084559
aries          0.081573
pisces         0.080713
sagittarius    0.080632
aquarius       0.080345
capr

The frequencies are roughly equally distributed, therefore imputation of the value 'unknown' will be split proportionally for each sign, so the final variable will still be equally distributed

In [419]:
prop_imputer('sign')

new values: sign
leo            5363
gemini         5285
libra          5158
cancer         5157
virgo          5077
taurus         5076
scorpio        5069
aries          4890
pisces         4838
sagittarius    4833
aquarius       4816
capricorn      4381
Name: count, dtype: int64


One-Hot encoding the variable 'sign'

In [420]:
profiles = pd.get_dummies(profiles, columns=['sign'])

Encoding the variable 'smokes' 

In [421]:
pre_encode_cat('smokes')

Value counts for smokes
no              46126
yes              8307
not answered     5510
Name: count, dtype: int64
Proportions for smokes
no              0.769498
yes             0.138582
not answered    0.091921
Name: proportion, dtype: float64
Proportions without the value 'unknown' smokes
no              0.769498
yes             0.138582
not answered    0.091921
Name: proportion, dtype: float64


In [422]:
#replacing 'not answered' with 'unknown' to make the imputer function work
profiles['smokes'] = profiles['smokes'].replace('not answered', 'unknown')


Proportional imputation

In [423]:
prop_imputer('smokes')

new values: smokes
no     50795
yes     9148
Name: count, dtype: int64


In [424]:
profiles['smokes'].head()

0    yes
1     no
2     no
3     no
4     no
Name: smokes, dtype: object

Binary encoding for the variable 'smokes'

In [425]:
profiles['smokes'] = profiles['smokes'].map({'yes': 1, 'no': 0})


In [426]:
pre_encode_cat('status')

Value counts for status
single            55694
seeing someone     2064
available          1865
married             310
unknown              10
Name: count, dtype: int64
Proportions for status
single            0.929116
seeing someone    0.034433
available         0.031113
married           0.005172
unknown           0.000167
Name: proportion, dtype: float64
Proportions without the value 'unknown' status
single            0.929271
seeing someone    0.034438
available         0.031118
married           0.005172
Name: proportion, dtype: float64


'unknown' only represents 0,01% of the total, let's replace those values with the mode 'single'

In [427]:
profiles['status'] = profiles['status'].replace('unknown')

  profiles['status'] = profiles['status'].replace('unknown')


One-hot encoding for the variable 'status'

In [428]:
profiles = pd.get_dummies(profiles, columns=['status'])

Encoding the variable 'sex' as binary

In [431]:
profiles['sex'] = profiles['sex'].map({'m': 1, 'f': 0})

Encoding the variables 'dogs' and 'cats' with One-Hot Encoding

In [436]:
profiles = pd.get_dummies(profiles, columns=['dogs'])
profiles = pd.get_dummies(profiles, columns=['cats'])

In [438]:
print(f"Total number of features = {len(profiles.columns)}")

Total number of features = 71


### 02.- Transformation of numerical variables

In [441]:
num_var = profiles.select_dtypes(include='number')
print(num_var)

       age  body_type  drinks  drugs  education  height  sex  smokes
0       22          4       1      0          3    75.0    1       1
1       35          1       2      1          0    70.0    1       0
2       38          0       1      0          4    68.0    1       0
3       23          0       1      0          3    71.0    1       0
4       29          2       1      0          3    66.0    1       0
...    ...        ...     ...    ...        ...     ...  ...     ...
59938   59          1       1      0          3    62.0    0       0
59939   24          2       2      1          3    72.0    1       0
59940   42          1       0      0          4    71.0    1       0
59941   27          2       1      2          3    73.0    1       1
59942   39          1       1      0          4    68.0    1       1

[59943 rows x 8 columns]


In [442]:
print(profiles.head())

   age  body_type  drinks  drugs  education  height  sex  smokes  \
0   22          4       1      0          3    75.0    1       1   
1   35          1       2      1          0    70.0    1       0   
2   38          0       1      0          4    68.0    1       0   
3   23          0       1      0          3    71.0    1       0   
4   29          2       1      0          3    66.0    1       0   

   diet_anything  diet_halal  ...  status_available  status_married  \
0           True       False  ...             False           False   
1          False       False  ...             False           False   
2           True       False  ...              True           False   
3          False       False  ...             False           False   
4           True       False  ...             False           False   

   status_seeing someone  status_single  dogs_has dogs  dogs_likes dogs  \
0                  False           True          False             True   
1             

Applying standardization to 'age' and 'height'

In [444]:
scaler = StandardScaler()
profiles[['age', 'height']] = scaler.fit_transform(profiles[['age', 'height']])


In [445]:
profiles.head()

Unnamed: 0,age,body_type,drinks,drugs,education,height,sex,smokes,diet_anything,diet_halal,...,status_available,status_married,status_seeing someone,status_single,dogs_has dogs,dogs_likes dogs,dogs_no dogs,cats_has cats,cats_likes cats,cats_no cats
0,-1.093889,4,1,0,3,1.678374,1,1,True,False,...,False,False,False,True,False,True,False,False,True,False
1,0.281388,1,2,1,0,0.426738,1,0,False,False,...,False,False,False,True,False,True,False,False,True,False
2,0.598759,0,1,0,4,-0.073917,1,0,True,False,...,True,False,False,False,False,False,True,True,False,False
3,-0.988098,0,1,0,3,0.677065,1,0,False,False,...,False,False,False,True,False,False,True,False,True,False
4,-0.353355,2,1,0,3,-0.574571,1,0,True,False,...,False,False,False,True,False,True,False,False,True,False


### 03- Exporting this dataset for dimensionality reduction

In [446]:
profiles.to_csv('profiles_processed.csv')