In [25]:
from pathlib import Path

import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder

In [26]:
# loading the dataset
ok = pd.read_csv("raw_data/okcupid_explore.csv")

In [27]:
ok.columns

Index(['Unnamed: 0', 'age', 'status', 'sex', 'orientation', 'body_type',
       'diet', 'drinks', 'drugs', 'education', 'ethnicity', 'height', 'income',
       'job', 'last_online', 'location', 'offspring', 'pets', 'religion',
       'sign', 'smokes', 'speaks'],
      dtype='object')

In [28]:
ok = ok.drop(columns=["Unnamed: 0"])

In [29]:
ok.head()

Unnamed: 0,age,status,sex,orientation,body_type,diet,drinks,drugs,education,ethnicity,...,income,job,last_online,location,offspring,pets,religion,sign,smokes,speaks
0,22,single,m,straight,a little extra,strictly anything,socially,never,working on college/university,"asian, white",...,-1,transportation,2012-06-28-20-30,"south san francisco, california","doesn't have kids, but might want them",likes dogs and likes cats,agnosticism and very serious about it,gemini,sometimes,english
1,35,single,m,straight,average,mostly other,often,sometimes,working on space camp,white,...,80000,hospitality / travel,2012-06-29-21-41,"oakland, california","doesn't have kids, but might want them",likes dogs and likes cats,agnosticism but not too serious about it,cancer,no,"english (fluently), spanish (poorly), french (..."
2,38,available,m,straight,thin,anything,socially,NoDrugs,graduated from masters program,NotDisclosed,...,-1,NotDisclosed,2012-06-27-09-10,"san francisco, california",NotDisclosed,has cats,NoReligion,pisces but it doesn&rsquo;t matter,no,"english, french, c++"
3,23,single,m,straight,thin,vegetarian,socially,NoDrugs,working on college/university,white,...,20000,student,2012-06-28-14-22,"berkeley, california",doesn't want kids,likes cats,NoReligion,pisces,no,"english, german (poorly)"
4,29,single,m,straight,athletic,NoDiet,socially,never,graduated from college/university,"asian, black, other",...,-1,artistic / musical / writer,2012-06-27-21-26,"san francisco, california",NotDisclosed,likes dogs and likes cats,NoReligion,aquarius,no,english


## missing data

In [30]:
# % of missing values in each column
round(ok.isnull().sum().sort_values(ascending=False)/len(ok), 2)

height         0.0
status         0.0
sex            0.0
orientation    0.0
age            0.0
body_type      0.0
diet           0.0
drugs          0.0
drinks         0.0
education      0.0
ethnicity      0.0
income         0.0
job            0.0
last_online    0.0
location       0.0
offspring      0.0
pets           0.0
religion       0.0
sign           0.0
smokes         0.0
speaks         0.0
dtype: float64

## decide what to do for each column with missing values...
Investigate missing values in each column then choose one of the following solutions:

1. Drop the column entirely
2. Impute the column median using `SimpleImputer` from Scikit-Learn
3. Preserve the NaNs and replace them with meaningful values

Make changes effective in the dataframe using the chosen method.

### check out values in each column

In [31]:
for feature in ok.columns:
    print(f'{feature}: {ok[feature].unique()}')

age: [ 22  35  38  23  29  32  31  24  37  28  30  39  33  26  27  20  25  40
  36  21  34  43  46  41  42  45  18  55  50  59  44  48  54  51  62  52
  19  58  66  53  63  47  49  61  60  57  56  65  64  68 110  69  67 109]
status: ['single' 'available' 'seeing someone' 'married' 'unknown']
sex: ['m' 'f']
orientation: ['straight' 'bisexual' 'gay']
body_type: ['a little extra' 'average' 'thin' 'athletic' 'fit' 'RatherNotSay'
 'skinny' 'curvy' 'full figured' 'jacked' 'used up' 'overweight']
diet: ['strictly anything' 'mostly other' 'anything' 'vegetarian' 'NoDiet'
 'mostly anything' 'mostly vegetarian' 'strictly vegan'
 'strictly vegetarian' 'mostly vegan' 'strictly other' 'mostly halal'
 'other' 'vegan' 'mostly kosher' 'strictly halal' 'halal'
 'strictly kosher' 'kosher']
drinks: ['socially' 'often' 'not at all' 'rarely' 'NotDisclosed' 'very often'
 'desperately']
drugs: ['never' 'sometimes' 'NoDrugs' 'often']
education: ['working on college/university' 'working on space camp'
 'gradua

### `offspring`

In [32]:
# unique values and their count
ok.offspring.value_counts(dropna=False)

offspring
NotDisclosed                               35561
doesn't have kids                           7560
doesn't have kids, but might want them      3875
doesn't have kids, but wants them           3565
doesn't want kids                           2927
has kids                                    1883
has a kid                                   1881
doesn't have kids, and doesn't want any     1132
has kids, but doesn't want more              442
has a kid, but doesn't want more             275
has a kid, and might want more               231
wants kids                                   225
might want kids                              182
has kids, and might want more                115
has a kid, and wants more                     71
has kids, and wants more                      21
Name: count, dtype: int64

In [33]:
ok.offspring.str.contains("has")

0        False
1        False
2        False
3        False
4        False
         ...  
59941     True
59942    False
59943    False
59944    False
59945    False
Name: offspring, Length: 59946, dtype: bool

In [40]:
ok["has_kids"] = -1
ok["wants_kids"] = -1
ok.loc[ok.offspring == "NotDisclosed","has_kids"] = 0
ok.loc[ok.offspring == "NotDisclosed","wants_kids"] = 0
# missing value or not informed -> has_kids = 0, wants_kids = 0 

In [41]:
ok.loc[ok.offspring.str.contains("has"),"has_kids"] = 1
# person has kids =  -> has_kids = 1
ok.loc[ok.offspring.str.contains("wants") | ok.offspring.str.contains("might want"), "wants_kids"] = 1
# person wants kids =  -> wants_kids = 1

In [42]:
ok.loc[ok.has_kids == -1,"has_kids"] = 2
# person does not have kids -> has_kids = 2
ok.loc[ok.wants_kids == -1,"wants_kids"] = 2
# person does not want kids -> wants_kids = 2

missing value -> 0; 
has / wants = true -> 1;
does not have / want = false -> 2.

In [39]:
ok["has_kids"].value_counts()

has_kids
0    35561
2    19466
1     4919
Name: count, dtype: int64

In [43]:
ok["wants_kids"].value_counts()

wants_kids
0    35561
2    16100
1     8285
Name: count, dtype: int64

### `diet`

nina vai mexer nessas colunas:
‘education’, ‘ethnicity’, ‘location’,‘religion’, ‘sign’, ‘speaks’.

In [18]:
# unique values and their count
ok.diet.value_counts(dropna=False)

diet
NoDiet                 24395
mostly anything        16585
anything                6183
strictly anything       5113
mostly vegetarian       3444
mostly other            1007
strictly vegetarian      875
vegetarian               667
strictly other           452
mostly vegan             338
other                    331
strictly vegan           228
vegan                    136
mostly kosher             86
mostly halal              48
strictly halal            18
strictly kosher           18
halal                     11
kosher                    11
Name: count, dtype: int64

In [None]:
diet na
diet type -> anything, vegetarian, vegan, other (kosher, halal)

### `religion`

NINA

In [None]:
# unique values and their count
ok.religion.value_counts(dropna=False)

In [None]:
religion na
religion type -> anything, agnosticism, atheism, buddhism, catholicism, christianity, hinduism, islam, judaism, other
religion serious -> empty, not too much, somewhat, very, laughing

### `pets`

In [None]:
# unique values and their count
ok.pets.value_counts(dropna=False)

### `drugs`

In [None]:
# unique values and their count
ok.drugs.value_counts(dropna=False)

### `sign`

NINA

In [None]:
# unique values and their count
ok.sign.value_counts(dropna=False)

### `job`

In [None]:
# unique values and their count
ok.job.value_counts(dropna=False)

### `education`

NINA

In [None]:
# unique values and their count
ok.education.value_counts(dropna=False)

### `ethnicity`

NINA

In [None]:
# unique values and their count
ok.ethnicity.value_counts(dropna=False)

In [None]:
ok.ethnicity.unique()

### `smokes`

In [None]:
# unique values and their count
ok.smokes.value_counts(dropna=False)

### `body type`

In [None]:
# unique values and their count
ok.body_type.value_counts(dropna=False)

### `drinks`

In [None]:
# unique values and their count
ok.drinks.value_counts(dropna=False)

### `speaks`

NINA

In [None]:
# unique values and their count
ok.speaks.value_counts(dropna=False)

## writing csv

In [None]:
filepath = Path('raw_data/okcupid_categories.csv') 
filepath.parent.mkdir(parents=True, exist_ok=True)

In [None]:
ok.to_csv(filepath) 

## option number 2 -> imputing value

In [None]:
'''
from sklearn.impute import SimpleImputer

imputer = SimpleImputer(strategy="median") 
# Instantiate a SimpleImputer object with strategy of choice

imputer.fit(data[['RoofSurface']]) 
# Call the "fit" method on the object

data['RoofSurface'] = imputer.transform(data[['RoofSurface']]) 
# Call the "transform" method on the object
'''

## Scaling