In [48]:
import pandas as pd
import numpy as np

data = pd.read_csv('profiles.csv')

#EDA

#print(data.head())
#print(data.describe())
#print(data.info())
#print(data.shape)
#print(data.duplicated())
#data.isnull().mean().sort_values(ascending=False)  # for percentage
#data.isnull().sum().sort_values(ascending=False)

#fill null essays with empty string
essay_cols = [col for col in data.columns if 'essay' in col]
data[essay_cols] = data[essay_cols].fillna('')

#drop 3 records without height (was thinking about filling those values with the mean, but since there are only 3 rows, decided to drop them)
data = data.dropna(subset=['height'])

#fill categorical columns with unkown
cat_cols = data.columns
print(cat_cols)
data[cat_cols] = data[cat_cols].fillna('unknown')

#print(data.isnull().sum().sort_values(ascending=False))
#print(data.head())

#group body types - Group body types into slim, average, fit, plus or unknown
def group_body_types(bt):
    if bt in ['thin', 'skinny']:
        return 'slim'
    elif bt == 'average':
        return 'average'
    elif bt in ['athletic', 'fit', 'jacked']: 
        return 'fit'
    elif bt in ['a little extra', 'curvy', 'full figured', 'overweight']: 
        return 'plus'
    else:
        return 'unknown'

data.body_type = data.body_type.apply(group_body_types)

#Diet - for diet there are 2 pieces of info in this column, so I will divide into diet type and diet strictness
def fill_diet_type(diet):
    parts = diet.split(' ')
    if len(parts) == 1:
        return parts[0]
    else:
        return parts[1]
    
data['diet_type'] = data['diet'].apply(fill_diet_type)

def fill_diet_strictness(diet):
    parts = diet.split(' ')
    if len(parts) == 1 and parts[0] == 'unknown':
        return 'unknown'
    elif len(parts) == 1: 
        return 'neutral'
    else:
        return parts[0]

data['diet_strictness'] = data['diet'].apply(fill_diet_strictness)

strict_dict = {
    'strictly': 'strict',
    'mostly': 'flexible',
    'neutral': 'standard',
    'unknown': 'unknown'
}

data['diet_strictness'] = data['diet_strictness'].map(strict_dict)

#drinks - almost perfect, just changed some labels and grouped 2 fields
drinks_dict = {
    'unknown': 'unknown',
    'not at all': 'non-drinker',
    'rarely': 'light',
    'socially': 'moderate',
    'often': 'heavy',
    'very often': 'heavy',
    'desperately': 'very heavy',    
}

data.drinks = data.drinks.map(drinks_dict)

#drugs - good to go
#education - STILL NEEDS TO BE PROPER CLEANED CHECK THE UNIQUE VALUES AND GROUP THEM AFTER FINISHING CLEANING

#print(data.education.unique())

def split_education(edu): 
    if edu is None or pd.isna(edu) or edu == 'unknown' or edu == '':
        return pd.Series(['unknown', 'unknown'])
    parts = edu.split(' ', 2)
    if len(parts) == 1:
        return pd.Series(['graduated from', parts[0]])
    if len(parts) == 2:
        return pd.Series(['graduated from', parts[0] + ' ' + parts[1]])
    if len(parts) == 3:
        status = parts[0] + ' ' + parts[1]
        level = parts[2]
        return pd.Series([status, level])

data[['education_status', 'education_level']] = data['education'].apply(split_education)

map_edu = {
    'college/university': 'college',
    'space camp': 'unknown',
    'masters program': 'masters',
    'two-year college': 'college',
    'unknown': 'unknown',
    'high school': 'high school',
    'of space camp': 'unknown',
    'ph.d program': 'phd',
    'law school': 'law school',
    'med school': 'med school',
    'of college/university': 'college',
    'of high school': 'high school',
    'of ph.d program': 'phd',
    'of two-year college': 'college',
    'of med school': 'med school',
    'of masters program': 'masters',
    'of law school': 'law school'
}

edu_status_map = {
    'working on': 'in progress',
    'graduated from': 'finished',
    'unknown': 'unknown',
    'dropped out': 'dropped out'
}

data['education_level'] = data['education_level'].map(map_edu)
data['education_status'] = data['education_status'].map(edu_status_map)

#print(data.education_status.unique())
#print(data.education_level.unique())

#ethnicity
#print(data.ethnicity.unique())

def get_primary_race(race):
    if race == '' or race == 'unknown':
        return 'unknown'
    else:
        return race.split(',')[0].strip().lower()

data['ethnicity'] = data['ethnicity'].apply(get_primary_race)

#print(data.ethnicity.unique())

#job
#print(data.job.unique())

career_map = {
    'science / tech / engineering': 'STEM',
    'computer / hardware / software': 'STEM',
    
    'medicine / health': 'Healthcare',
    
    'education / academia': 'Education',
    
    'banking / financial / real estate': 'Business',
    'sales / marketing / biz dev': 'Business',
    'executive / management': 'Business',
    
    'artistic / musical / writer': 'Creative',
    'entertainment / media': 'Creative',
    
    'hospitality / travel': 'Service',
    'clerical / administrative': 'Service',
    'construction / craftsmanship': 'Service',
    
    'political / government': 'Government / Law',
    'law / legal services': 'Government / Law',
    'military': 'Government / Law',
    
    'transportation': 'Transportation',
    
    'student': 'Student',
    'unemployed': 'Unemployed',
    'retired': 'Retired',
    
    'rather not say': 'Other',
    'other': 'Other',
    'unknown': 'Other'
}

data['job'] = data['job'].map(career_map).fillna('Other')

#print(data.job.unique())

#last_online
#print(data.last_online.unique())

data['last_online'] = pd.to_datetime(data['last_online'], format='%Y-%m-%d-%H-%M')
most_recent_date = data['last_online'].max()
data['last_online'] = (most_recent_date - data['last_online']).dt.days

def convert_lastonline_to_cat(lo):
    if lo <= 7:
        return 'active'
    elif lo < 14:
        return 'semi-active'
    else:
        return 'not active'

data['presence'] = data['last_online'].apply(convert_lastonline_to_cat)

#location
#print(data.location.unique())

def get_main_location(location):
    return location.split(',')[1].strip()

data['location'] = data['location'].apply(get_main_location)

#offspring
#print(data.offspring.unique())

data['offspring'] = data['offspring'].str.replace("doesn&rsquo;t", "doesn't", regex=False)

has_kids_conditions = [
    data['offspring'].str.contains('has a kid|has kids', case = False),
    data['offspring'].str.contains('doesn\'t have kids', case = False)
]

has_kids_choices = ['yes', 'no']

data['has_kids'] = np.select(has_kids_conditions, has_kids_choices, default = 'unknown')

wants_kids_conditions = [
    data['offspring'].str.contains('doesn\'t want', case=False),
    data['offspring'].str.contains('might want', case=False),
    data['offspring'].str.contains('wants', case=False)
]
 
wants_kids_choices = ['no', 'maybe', 'yes']

data['wants_kids'] = np.select(wants_kids_conditions, wants_kids_choices, default='unknown')

#orientation - already good
#print(data.orientation.unique())

#pets
#print(data.pets.unique())

likes_dogs_conditions = [
    data['pets'].str.contains('likes dogs', case = False),
    data['pets'].str.contains('dislikes dogs', case = False)
]

likes_dogs_choices = ['yes', 'no']

data['likes_dogs'] = np.select(likes_dogs_conditions, likes_dogs_choices, default = 'unknown')

has_dogs_conditions = [
    data['pets'].str.contains('has dogs', case = False)
]

has_dogs_choices = ['yes']

data['has_dogs'] = np.select(has_dogs_conditions, has_dogs_choices, default = 'unknown')

likes_cats_conditions = [
    data['pets'].str.contains('likes cats', case = False),
    data['pets'].str.contains('dislikes cats', case = False)
]

likes_cats_choices = ['yes', 'no']

data['likes_cats'] = np.select(likes_cats_conditions, likes_cats_choices, default = 'unknown')

has_cats_conditions = [
    data['pets'].str.contains('has cats', case = False)
]

has_cats_choices = ['yes']

data['has_cats'] = np.select(has_cats_conditions, has_cats_choices, default = 'unknown')

# religion
#print(data.religion.unique())

religion_dedication_condition = [
    data['religion'].str.contains('very serious', case = False),
    data['religion'].str.contains('not too serious', case = False),
    data['religion'].str.contains('somewhat', case = False),
    data['religion'].str.contains('laughing', case = False)
]

religion_dedication_values = ['very dedicated', 'dedicated', 'partially dedicated', 'not dedicated']

data['religion_dedication'] = np.select(religion_dedication_condition, religion_dedication_values, default = 'unknown')

def set_religion(r):
    return r.split(' ', 2)[0]

data['religion'] = data.religion.apply(set_religion)

#sex - no changes needed
#print(data.sex.unique())

#sign 
#print(data.sign.unique())

data['sign'] = data['sign'].str.replace('doesn&rsquo;t', 'doesn\'t', regex=False)
data['sign'] = data['sign'].str.replace('it&rsquo;s', 'it\'s', regex=False)

sign_importance_conditions = [
    data['sign'].str.contains('doesn\'t matter', case = False),
    data['sign'].str.contains('fun', case = False),
    data['sign'].str.contains('it matters', case = False)
]

sign_importance_options = ['not important', 'fun', 'important']

data['sign_importance'] = np.select(sign_importance_conditions, sign_importance_options, default = 'unknown')

def set_sign(sign):
    return sign.split(' ', 2)[0]

data['sign'] = data['sign'].apply(set_sign)

#smokes
print(data.smokes.unique())

map_smoking = {
    'when drinking': 'sometimes',
    'trying to quit': 'yes'
}

data['smokes'] = data['smokes'].replace(map_smoking)

#speaks
print(data.speaks.unique())

unique_languages = []
for record in data.speaks.unique():
    languages = record.split(',')
    for lang in languages:
        l = lang.split(' ')[0]
        if l not in unique_languages:
            unique_languages.append(l)

print(unique_languages)

#TO FIX - THE RECORDS WITH SPACES ARE NOT BEING IDENTIFIED CORRECTLY
    


Index(['age', 'body_type', 'diet', 'drinks', 'drugs', 'education', 'essay0',
       'essay1', 'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7',
       'essay8', 'essay9', 'ethnicity', 'height', 'income', 'job',
       'last_online', 'location', 'offspring', 'orientation', 'pets',
       'religion', 'sex', 'sign', 'smokes', 'speaks', 'status'],
      dtype='object')
['sometimes' 'no' 'unknown' 'when drinking' 'yes' 'trying to quit']
['english' 'english (fluently), spanish (poorly), french (poorly)'
 'english, french, c++' ...
 'english (fluently), hindi (poorly), french (poorly), tamil (okay), spanish (poorly)'
 'english (fluently), french (poorly), japanese (poorly), latin (poorly)'
 'english (fluently), french, farsi']
['english', '', 'unknown', 'afrikaans', 'french', 'portuguese']
