In [30]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Load the raw profiles table; every step below cleans or recodes one column of it.
df = pd.read_csv('profiles.csv')

#print(df.describe())
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly -- wrapping it in print() emitted an extra "None" line.
df.info()
#df.head()

#EDA

#age column
#plt.hist(df['age'])
#plt.show() #based on the histogram, let's consider that everything from 80 years up is wrong and drop those rows
# Keep only rows whose age is strictly below 80.
df = df[df['age'] < 80]

#body type
# Three answers hold ~20% of the responses each and every other answer is
# below 10%, so those three keep their own label and the rest are pooled.

map_body_type = {
    **{answer: answer for answer in ('average', 'fit', 'athletic')},
    **dict.fromkeys(
        ('thin', 'curvy', 'a little extra', 'skinny', 'full figured', 'overweight', 'jacked'),
        'other',
    ),
    **dict.fromkeys(('used up', 'rather not say'), 'unknown'),
}

# Series.map yields NaN for anything absent from the lookup (missing answers
# included); fillna then labels those rows 'unknown'.
df['body_type'] = (
    df['body_type']
    .map(map_body_type)
    .fillna('unknown')
)

#diet
# Each raw answer (with or without a 'mostly'/'strictly' qualifier) is folded
# into one of three groups.
map_diet = {
    # Group 1: Anything
    **dict.fromkeys(
        ('mostly anything', 'anything', 'strictly anything'),
        'anything',
    ),
    # Group 2: Vegetarian/Vegan
    **dict.fromkeys(
        (
            'vegetarian', 'mostly vegetarian', 'strictly vegetarian',
            'vegan', 'mostly vegan', 'strictly vegan',
        ),
        'vegetarian',
    ),
    # Group 3: Other (includes religious-based + remaining 'other')
    **dict.fromkeys(
        (
            'other', 'mostly other', 'strictly other',
            'kosher', 'mostly kosher', 'strictly kosher',
            'halal', 'mostly halal', 'strictly halal',
        ),
        'other',
    ),
}

# Rows whose answer is not in the lookup (missing answers included) end up 'unknown'.
df['diet'] = (
    df['diet']
    .map(map_diet)
    .fillna('unknown')
)
# ~40% of unknowns, ~50% of anything. Might consider dropping this one

#drinks
# Six raw answers are reduced to three drinking-intensity buckets.

map_drinks = {
    'socially': 'socially',
    **dict.fromkeys(('rarely', 'not at all'), 'light'),
    **dict.fromkeys(('often', 'very often', 'desperately'), 'heavy'),
}
# Anything not in the lookup (missing answers included) becomes 'unknown'.
df['drinks'] = (
    df['drinks']
    .map(map_drinks)
    .fillna('unknown')
)

# The 'socially' category dominates with 70%.

#drugs
# Binary recode: 'never' -> 'no', any reported use -> 'yes'.
map_drugs = dict([
    ('never', 'no'),
    ('sometimes', 'yes'),
    ('often', 'yes'),
    ('unknown', 'unknown'),
])

# Answers outside the lookup (missing answers included) are labelled 'unknown'.
df['drugs'] = (
    df['drugs']
    .map(map_drugs)
    .fillna('unknown')
)

#education

# The free-text education answer is split into two derived columns:
# 'education_level' (which institution type) and 'is_student' (enrolment state).

# np.select assigns the choice of the FIRST condition that is true for a row,
# so the order of this list is significant.
education_level_conditions = [
    df['education'].str.contains('college', case = False, na=False),
    df['education'].str.contains('masters', case = False, na=False),
    # These levels each represent a low percentage (<5%), so they are grouped.
    # The dot is escaped so that "ph.d" matches a literal '.'; unescaped it is a
    # regex wildcard that matches any character.
    df['education'].str.contains(r'high school|ph\.d|law school|med school', case = False, na=False)
]

education_level_choices = ['college', 'masters', 'other']

# Rows matching none of the conditions (missing answers included, via na=False) get 'unknown'.
df['education_level'] = np.select(education_level_conditions, education_level_choices, default = 'unknown')

#print(df['education_level'].value_counts(normalize = True))

is_student_conditions = [
    df['education'].str.contains('working on', case = False, na=False),
    df['education'].str.contains('graduated from|dropped out', case = False, na=False),
    df['education'].str.contains('space camp', case = False, na=False) # all 'space camp' answers are treated as unknown
]

is_student_choices = ['yes', 'no', 'unknown']

df['is_student'] = np.select(is_student_conditions, is_student_choices, default = 'unknown')

# The raw column is no longer needed once both derived columns exist.
df = df.drop('education', axis = 1)

# Ethnicity is reduced to four labels: exact 'white', exact 'asian',
# 'unknown' for missing answers, and 'other' for everything else.
ethnicity_choices = ['white', 'asian', 'unknown']

ethnicity_conditions = [
    df['ethnicity'].eq('white'),
    df['ethnicity'].eq('asian'),
    df['ethnicity'].isnull(),
]

df['ethnicity'] = np.select(
    condlist=ethnicity_conditions,
    choicelist=ethnicity_choices,
    default='other',
)

#height
#plt.hist(df['height'])
#plt.show()
# Keep rows whose height lies in the closed interval [50, 90], treated here as
# the plausible range; rows with a missing height fail the test and are dropped too.
df = df[df['height'].between(50, 90)]

#income
# 80% of unknowns, not a great column to predict other columns, but might be fun to remove the unknowns and predict this one

#plt.hist(df_income['income'])
#plt.show()

def classify_income(i):
    """Bucket a numeric income into 'unknown', 'low', 'medium' or 'high'.

    Args:
        i: Income value. Non-positive values are treated as "not provided".

    Returns:
        'unknown' for missing (None/NaN) or non-positive values,
        'low' up to 20000, 'medium' up to 50000, otherwise 'high'.
    """
    # Every comparison against NaN is False, so without this guard a missing
    # income would fall through the whole chain and be labelled 'high'.
    if pd.isna(i) or i <= 0:
        return 'unknown'
    if i <= 20000:
        return 'low'
    if i <= 50000:
        return 'medium'
    return 'high'

# Replace each numeric income with its bucket label, one value at a time.
df['income'] = df['income'].map(classify_income)

#print(df['income'].value_counts(normalize = True))

#job
# Raw job answers are folded into a handful of sectors. Sectors that are too
# small to stand alone are pooled under 'other'.

job_map = {
    # STEM/Technical
    **dict.fromkeys(
        ('science / tech / engineering', 'computer / hardware / software'),
        'stem',
    ),
    # Creative/Artistic
    **dict.fromkeys(
        ('artistic / musical / writer', 'entertainment / media'),
        'creative',
    ),
    # Business/Professional
    **dict.fromkeys(
        (
            'sales / marketing / biz dev',
            'executive / management',
            'banking / financial / real estate',
            'law / legal services',
        ),
        'business',
    ),
    # Education/Health
    **dict.fromkeys(
        ('education / academia', 'medicine / health'),
        'education_health',
    ),
    **dict.fromkeys(
        (
            # Manual Labor / Skilled Trades - not enough percentage for its own category
            'construction / craftsmanship',
            'transportation',
            'clerical / administrative',
            # Public/Government - not enough percentage for its own category
            'political / government',
            'military',
            # Remaining small groups
            'unemployed',
            'retired',
            'hospitality / travel',
            'other',
        ),
        'other',
    ),
    'student': 'student',
    'rather not say': 'unknown',
}

# Answers outside the lookup (missing answers included) are labelled 'unknown'.
df['job'] = (
    df['job']
    .map(job_map)
    .fillna('unknown')
)

#last online
# Parse the 'YYYY-MM-DD-HH-MM' stamps, then re-express every stamp as the
# number of whole days separating it from the newest stamp in the column.

df['last_online'] = pd.to_datetime(df['last_online'], format='%Y-%m-%d-%H-%M')
most_recent_date = df['last_online'].max()
_elapsed_since_login = most_recent_date - df['last_online']
df['last_online'] = _elapsed_since_login.dt.days

#binary distribution into active or not active. Could also make a semi active category (<=1, <=3, >3)
def categorize_last_online(v):
    """Label a days-since-last-login count as 'active' (3 or fewer) or 'not active'."""
    return 'active' if v <= 3 else 'not active'

# Swap each day count for its activity label.
df['last_online'] = df['last_online'].map(categorize_last_online)

#location

def get_city(location):
    """Reduce a 'city, region' location string to 'san francisco' or 'other'.

    About half of the profiles are in San Francisco, so only that city keeps
    its own label.

    Args:
        location: Location text whose first comma-separated field is the city.

    Returns:
        'san francisco' when the city field is exactly that (ignoring
        surrounding whitespace); 'other' for every other city and for
        non-string input such as None or NaN.
    """
    # A missing location arrives as NaN (a float), which has no .split();
    # treat anything that is not text as 'other' instead of crashing.
    if not isinstance(location, str):
        return 'other'
    city = location.split(',')[0].strip()
    return city if city == 'san francisco' else 'other'

# Derive the two-valued 'city' column from the raw location text.
df['city'] = df['location'].map(get_city)

#offspring
# The column is dropped: about 60% of its values are unknown, and splitting it
# into "has kids" / "wants kids" made things worse.
# DataFrame.drop returns a new frame rather than modifying df, so the result
# has to be assigned back -- the bare call left the column in place. Filling
# the missing values first is pointless for a column that is being removed.
df = df.drop('offspring', axis = 1)

#orientation
# Two-way recode: 'straight' stays as is, 'gay' and 'bisexual' are pooled.

map_orientation = {
    'straight': 'straight',
    **dict.fromkeys(('gay', 'bisexual'), 'queer'),
}

# Answers outside the lookup (missing answers included) are labelled 'unknown'.
df['orientation'] = (
    df['orientation']
    .map(map_orientation)
    .fillna('unknown')
)

#pets
# Keep existing answers and put the 'unknown' placeholder where none was given.
df['pets'] = df['pets'].where(df['pets'].notna(), 'unknown')

def likes_dogs(pets):
    """Return 'yes' if the pets answer says the person likes or has dogs, else 'no'.

    Args:
        pets: Free-text pets answer; matching is case-insensitive.

    Returns:
        'yes' when the text contains 'likes dogs' or 'has dogs';
        'no' otherwise, including for 'dislikes dogs' and non-string input.
    """
    if not isinstance(pets, str):
        return 'no'
    answer = pets.lower().strip()
    # 'dislikes dogs' contains the substring 'likes dogs', so that phrase is
    # removed before testing; otherwise dog-dislikers would be reported as 'yes'.
    without_dislike = answer.replace('dislikes dogs', '')
    if 'likes dogs' in without_dislike or 'has dogs' in answer:
        return 'yes'
    return 'no'

def likes_cats(pets):
    """Return 'yes' if the pets answer says the person likes or has cats, else 'no'.

    Args:
        pets: Free-text pets answer; matching is case-insensitive.

    Returns:
        'yes' when the text contains 'likes cats' or 'has cats';
        'no' otherwise, including for 'dislikes cats' and non-string input.
    """
    if not isinstance(pets, str):
        return 'no'
    answer = pets.lower().strip()
    # 'dislikes cats' contains the substring 'likes cats', so that phrase is
    # removed before testing; otherwise cat-dislikers would be reported as 'yes'.
    without_dislike = answer.replace('dislikes cats', '')
    if 'likes cats' in without_dislike or 'has cats' in answer:
        return 'yes'
    return 'no'
        

# Two yes/no flags are derived from the single free-text pets answer.
df['likes_dogs'] = df['pets'].map(likes_dogs)
df['likes_cats'] = df['pets'].map(likes_cats)

#religion
# Keep existing answers and put the 'unknown' placeholder where none was given.
df['religion'] = df['religion'].where(df['religion'].notna(), 'unknown')

def get_religion(r):
    """Extract the religion name (first word) from a religion answer.

    Args:
        r: Religion text such as '<religion> and <attitude>'.

    Returns:
        'other' for buddhism, hinduism and islam; otherwise the first
        space-separated word of the text. Non-string input (None, NaN)
        yields 'unknown'.
    """
    # Explicit type check instead of a bare `except` that printed the value and
    # returned None, which silently put missing values back into the column.
    if not isinstance(r, str):
        return 'unknown'
    religion = r.split(' ')[0]
    if religion in ('buddhism', 'hinduism', 'islam'):
        return 'other'
    return religion

# How seriously the person takes their religion is read from the attitude
# phrase in the raw text. This must run before the column is reduced to the
# bare religion name.
religion_importance_options = ['no', 'yes', 'unknown']

religion_importance_conditions = [
    df['religion'].str.contains(phrase, case=False, na=False)
    for phrase in (
        'not too serious|laughing',
        'somewhat serious|very serious',
        'unknown',
    )
]

df['religion_importance'] = np.select(
    condlist=religion_importance_conditions,
    choicelist=religion_importance_options,
    default='unknown',
)
df['religion'] = df['religion'].map(get_religion)

#sex - good to go
#print(df['sex'].value_counts(normalize=True, dropna=False))

#sign
# Fill missing answers, then turn the HTML entity for a right single quote
# into a plain apostrophe so that later phrase matching works.
df['sign'] = (
    df['sign']
    .fillna('unknown')
    .str.replace('&rsquo;', '\'', regex=False)
)

def get_sign(sign):
    """Return the text before the first space of a sign answer, or 'unknown' for non-strings."""
    if not isinstance(sign, str):
        return 'unknown'
    first_word, _, _ = sign.partition(' ')
    return first_word

# The attitude towards astrology is read from the phrase that follows the
# sign name. This must run before the column is reduced to the bare sign.
sign_importance_options = ['fun', 'not important', 'important']

sign_importance_conditions = [
    df['sign'].str.contains(phrase, regex = False)
    for phrase in (
        'fun',
        'but it doesn\'t matter',
        'and it matters a lot',
    )
]

df['sign_importance'] = np.select(
    condlist=sign_importance_conditions,
    choicelist=sign_importance_options,
    default='unknown',
)
df['sign'] = df['sign'].map(get_sign)

#print(df['sign'].value_counts(normalize=True, dropna=False)) # 12 categories + unknown -> not sure if it will have predictive power. Might drop it
#print(df['sign_importance'].value_counts(normalize=True, dropna=False))  # only 1% considers important, might drop it

# DataFrame.drop returns a new frame rather than modifying df, so the result
# has to be assigned back -- the bare call was a no-op that left both columns
# in place.
df = df.drop(['sign', 'sign_importance'], axis = 1)

#smokes
# Binary recode: any form of smoking counts as 'yes'; 'no' and 'unknown' pass through.
smokes_map = {
    'no': 'no',
    'unknown': 'unknown',
    **dict.fromkeys(('sometimes', 'when drinking', 'trying to quit', 'yes'), 'yes'),
}

# Missing answers are filled with 'unknown' first so that the lookup covers them.
df['smokes'] = (
    df['smokes']
    .fillna('unknown')
    .map(smokes_map)
)

print(df['smokes'].value_counts(normalize=True, dropna=False))

#speaks - a comma-separated list of languages, each with its proficiency in brackets
# Plan: keep the proficiency level of the top 5 languages and add a column with the language count.
df['speaks'] = df['speaks'].where(df['speaks'].notna(), 'unknown')

def get_languages_count(l):
    """Count the comma-separated entries in a 'speaks' string (one more than the number of commas)."""
    return len(l.split(','))

# Number of comma-separated entries in each 'speaks' answer.
df['language_count'] = df['speaks'].map(get_languages_count)
print(df['language_count'].value_counts(normalize=True, dropna=False))

# TODO: get the 5 top languages and create columns with the level

#status
# Distribution is printed before and after the recode so the effect is visible.
df['status'] = df['status'].fillna('unknown')
print(df['status'].value_counts(normalize=True, dropna=False))

# Binary recode into available / unavailable; 'unknown' passes through.
status_map = {
    **dict.fromkeys(('single', 'available'), 'available'),
    **dict.fromkeys(('seeing someone', 'married'), 'unavailable'),
    'unknown': 'unknown',
}

df['status'] = df['status'].map(status_map)

print(df['status'].value_counts(normalize=True, dropna=False))


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59946 entries, 0 to 59945
Data columns (total 31 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          59946 non-null  int64  
 1   body_type    54650 non-null  object 
 2   diet         35551 non-null  object 
 3   drinks       56961 non-null  object 
 4   drugs        45866 non-null  object 
 5   education    53318 non-null  object 
 6   essay0       54458 non-null  object 
 7   essay1       52374 non-null  object 
 8   essay2       50308 non-null  object 
 9   essay3       48470 non-null  object 
 10  essay4       49409 non-null  object 
 11  essay5       49096 non-null  object 
 12  essay6       46175 non-null  object 
 13  essay7       47495 non-null  object 
 14  essay8       40721 non-null  object 
 15  essay9       47343 non-null  object 
 16  ethnicity    54266 non-null  object 
 17  height       59943 non-null  float64
 18  income       59946 non-null  int64  
 19  job 