In [9]:
import pandas as pd
import numpy as np

data = pd.read_csv('profiles.csv')

not_essay_cols = [col for col in data.columns if 'essay' not in col]
data = data[not_essay_cols]

print(data.head())
print(data.describe())
print(data.info())
print(data.shape)
#print(data.duplicated())
#data.isnull().mean().sort_values(ascending=False)  # for percentage
#data.isnull().sum().sort_values(ascending=False)



   age       body_type               diet    drinks      drugs  \
0   22  a little extra  strictly anything  socially      never   
1   35         average       mostly other     often  sometimes   
2   38            thin           anything  socially        NaN   
3   23            thin         vegetarian  socially        NaN   
4   29        athletic                NaN  socially      never   

                           education            ethnicity  height  income  \
0      working on college/university         asian, white    75.0      -1   
1              working on space camp                white    70.0   80000   
2     graduated from masters program                  NaN    68.0      -1   
3      working on college/university                white    71.0   20000   
4  graduated from college/university  asian, black, other    66.0      -1   

                           job  ...                         location  \
0               transportation  ...  south san francisco, california

In [12]:
import pandas as pd
import numpy as np

data = pd.read_csv('profiles.csv')

#EDA

#fill null essays with empty string
essay_cols = [col for col in data.columns if 'essay' in col]
data[essay_cols] = data[essay_cols].fillna('')

#drop 3 records without height (was thinking about filling those values with the mean, but since there are only 3 rows, decided to drop them)
data = data.dropna(subset=['height'])

#fill categorical columns with unkown
cat_cols = data.columns
print(cat_cols)
data[cat_cols] = data[cat_cols].fillna('unknown')

#print(data.isnull().sum().sort_values(ascending=False))
#print(data.head())

#group body types - Group body types into slim, average, fit, plus or unknown
def group_body_types(bt):
    if bt in ['thin', 'skinny']:
        return 'slim'
    elif bt == 'average':
        return 'average'
    elif bt in ['athletic', 'fit', 'jacked']: 
        return 'fit'
    elif bt in ['a little extra', 'curvy', 'full figured', 'overweight']: 
        return 'plus'
    else:
        return 'unknown'

data.body_type = data.body_type.apply(group_body_types)

#Diet - for diet there are 2 pieces of info in this column, so I will divide into diet type and diet strictness
def fill_diet_type(diet):
    parts = diet.split(' ')
    if len(parts) == 1:
        return parts[0]
    else:
        return parts[1]
    
data['diet_type'] = data['diet'].apply(fill_diet_type)

def fill_diet_strictness(diet):
    parts = diet.split(' ')
    if len(parts) == 1 and parts[0] == 'unknown':
        return 'unknown'
    elif len(parts) == 1: 
        return 'neutral'
    else:
        return parts[0]

data['diet_strictness'] = data['diet'].apply(fill_diet_strictness)

strict_dict = {
    'strictly': 'strict',
    'mostly': 'flexible',
    'neutral': 'standard',
    'unknown': 'unknown'
}

data['diet_strictness'] = data['diet_strictness'].map(strict_dict)

#drinks - almost perfect, just changed some labels and grouped 2 fields
drinks_dict = {
    'unknown': 'unknown',
    'not at all': 'non-drinker',
    'rarely': 'light',
    'socially': 'moderate',
    'often': 'heavy',
    'very often': 'heavy',
    'desperately': 'very heavy',    
}

data.drinks = data.drinks.map(drinks_dict)

#drugs - good to go
#education

#print(data.education.unique())

def split_education(edu): 
    if edu is None or pd.isna(edu) or edu == 'unknown' or edu == '':
        return pd.Series(['unknown', 'unknown'])
    parts = edu.split(' ', 2)
    if len(parts) == 1:
        return pd.Series(['graduated from', parts[0]])
    if len(parts) == 2:
        return pd.Series(['graduated from', parts[0] + ' ' + parts[1]])
    if len(parts) == 3:
        status = parts[0] + ' ' + parts[1]
        level = parts[2]
        return pd.Series([status, level])

data[['education_status', 'education_level']] = data['education'].apply(split_education)

map_edu = {
    'college/university': 'college',
    'space camp': 'unknown',
    'masters program': 'masters',
    'two-year college': 'college',
    'unknown': 'unknown',
    'high school': 'high school',
    'of space camp': 'unknown',
    'ph.d program': 'phd',
    'law school': 'law school',
    'med school': 'med school',
    'of college/university': 'college',
    'of high school': 'high school',
    'of ph.d program': 'phd',
    'of two-year college': 'college',
    'of med school': 'med school',
    'of masters program': 'masters',
    'of law school': 'law school'
}

edu_status_map = {
    'working on': 'in progress',
    'graduated from': 'finished',
    'unknown': 'unknown',
    'dropped out': 'dropped out'
}

data['education_level'] = data['education_level'].map(map_edu)
data['education_status'] = data['education_status'].map(edu_status_map)

#print(data.education_status.unique())
#print(data.education_level.unique())

#ethnicity
#print(data.ethnicity.unique())

def get_primary_race(race):
    if race == '' or race == 'unknown':
        return 'unknown'
    else:
        return race.split(',')[0].strip().lower()

data['ethnicity'] = data['ethnicity'].apply(get_primary_race)

#print(data.ethnicity.unique())

#job
#print(data.job.unique())

career_map = {
    'science / tech / engineering': 'STEM',
    'computer / hardware / software': 'STEM',
    
    'medicine / health': 'Healthcare',
    
    'education / academia': 'Education',
    
    'banking / financial / real estate': 'Business',
    'sales / marketing / biz dev': 'Business',
    'executive / management': 'Business',
    
    'artistic / musical / writer': 'Creative',
    'entertainment / media': 'Creative',
    
    'hospitality / travel': 'Service',
    'clerical / administrative': 'Service',
    'construction / craftsmanship': 'Service',
    
    'political / government': 'Government / Law',
    'law / legal services': 'Government / Law',
    'military': 'Government / Law',
    
    'transportation': 'Transportation',
    
    'student': 'Student',
    'unemployed': 'Unemployed',
    'retired': 'Retired',
    
    'rather not say': 'Other',
    'other': 'Other',
    'unknown': 'Other'
}

data['job'] = data['job'].map(career_map).fillna('Other')

#print(data.job.unique())

#last_online
#print(data.last_online.unique())

data['last_online'] = pd.to_datetime(data['last_online'], format='%Y-%m-%d-%H-%M')
most_recent_date = data['last_online'].max()
data['last_online'] = (most_recent_date - data['last_online']).dt.days

def convert_lastonline_to_cat(lo):
    if lo <= 7:
        return 'active'
    elif lo < 14:
        return 'semi-active'
    else:
        return 'not active'

data['presence'] = data['last_online'].apply(convert_lastonline_to_cat)

#location
#print(data.location.unique())

def get_main_location(location):
    return location.split(',')[1].strip()

data['location'] = data['location'].apply(get_main_location)

#offspring
#print(data.offspring.unique())

data['offspring'] = data['offspring'].str.replace("doesn&rsquo;t", "doesn't", regex=False)

has_kids_conditions = [
    data['offspring'].str.contains('has a kid|has kids', case = False),
    data['offspring'].str.contains('doesn\'t have kids', case = False)
]

has_kids_choices = ['yes', 'no']

data['has_kids'] = np.select(has_kids_conditions, has_kids_choices, default = 'unknown')

wants_kids_conditions = [
    data['offspring'].str.contains('doesn\'t want', case=False),
    data['offspring'].str.contains('might want', case=False),
    data['offspring'].str.contains('wants', case=False)
]
 
wants_kids_choices = ['no', 'maybe', 'yes']

data['wants_kids'] = np.select(wants_kids_conditions, wants_kids_choices, default='unknown')

#orientation - already good
#print(data.orientation.unique())

#pets
#print(data.pets.unique())

likes_dogs_conditions = [
    data['pets'].str.contains('likes dogs', case = False),
    data['pets'].str.contains('dislikes dogs', case = False)
]

likes_dogs_choices = ['yes', 'no']

data['likes_dogs'] = np.select(likes_dogs_conditions, likes_dogs_choices, default = 'unknown')

has_dogs_conditions = [
    data['pets'].str.contains('has dogs', case = False)
]

has_dogs_choices = ['yes']

data['has_dogs'] = np.select(has_dogs_conditions, has_dogs_choices, default = 'unknown')

likes_cats_conditions = [
    data['pets'].str.contains('likes cats', case = False),
    data['pets'].str.contains('dislikes cats', case = False)
]

likes_cats_choices = ['yes', 'no']

data['likes_cats'] = np.select(likes_cats_conditions, likes_cats_choices, default = 'unknown')

has_cats_conditions = [
    data['pets'].str.contains('has cats', case = False)
]

has_cats_choices = ['yes']

data['has_cats'] = np.select(has_cats_conditions, has_cats_choices, default = 'unknown')

# religion
#print(data.religion.unique())

religion_dedication_condition = [
    data['religion'].str.contains('very serious', case = False),
    data['religion'].str.contains('not too serious', case = False),
    data['religion'].str.contains('somewhat', case = False),
    data['religion'].str.contains('laughing', case = False)
]

religion_dedication_values = ['very dedicated', 'dedicated', 'partially dedicated', 'not dedicated']

data['religion_dedication'] = np.select(religion_dedication_condition, religion_dedication_values, default = 'unknown')

def set_religion(r):
    return r.split(' ', 2)[0]

data['religion'] = data.religion.apply(set_religion)

#sex - no changes needed
#print(data.sex.unique())

#sign 
#print(data.sign.unique())

data['sign'] = data['sign'].str.replace('doesn&rsquo;t', 'doesn\'t', regex=False)
data['sign'] = data['sign'].str.replace('it&rsquo;s', 'it\'s', regex=False)

sign_importance_conditions = [
    data['sign'].str.contains('doesn\'t matter', case = False),
    data['sign'].str.contains('fun', case = False),
    data['sign'].str.contains('it matters', case = False)
]

sign_importance_options = ['not important', 'fun', 'important']

data['sign_importance'] = np.select(sign_importance_conditions, sign_importance_options, default = 'unknown')

def set_sign(sign):
    return sign.split(' ', 2)[0]

data['sign'] = data['sign'].apply(set_sign)

#smokes
#print(data.smokes.unique())

map_smoking = {
    'when drinking': 'sometimes',
    'trying to quit': 'yes'
}

data['smokes'] = data['smokes'].replace(map_smoking)

#speaks
#print(data.speaks.unique())

"""
language_counts = {}
for record in data.speaks.unique():
    languages = record.split(',')
    for lang in languages:
        l = lang.strip().split(' ')[0]
        if l in language_counts:
            language_counts[l] += 1
        else:
            language_counts[l] = 1

sorted_langs = dict(sorted(language_counts.items(), key=lambda item: item[1], reverse=True))
"""

#decided for now to only store the number of languages spoken

def get_language_count(record):
    language_count = 0
    languages = record.split(',')
    for lang in languages:
        l = lang.strip().split(' ')[0]
        if l not in ['c++', 'lisp', 'ancient', 'unknown', 'sign']:
            language_count += 1
    return language_count

data['languages_count'] = data['speaks'].apply(get_language_count)
#print(data[['languages_count', 'speaks']].head())

#Next step: add 1 column for each of the top 10 and add the level for each user

#status
#print(data.status.unique())

status_map = {
    'single': 'single',
    'available': 'single',
    'seeing someone': 'committed',
    'married': 'committed',
    'unknown': 'unknown'
}

data['status'] = data['status'].map(status_map)
#print(data.status.unique())

#numerical fields
#print(data.describe())

"""
import matplotlib.pyplot as plt
data['height'].hist(bins=30)
plt.xlabel('Height (inches)')
plt.ylabel('Count')
plt.show()


data['age'].hist(bins=30)
plt.xlabel('Age')
plt.ylabel('Count')
plt.show()
"""

data = data[(data['age'] >= 18) & (data['age'] <= 90)] # reasonable interval for age
data = data[(data['height'] >= 54) & (data['height'] <= 84)] # reasonable interval for height

data = data.drop(columns=['income']) #around 80% of values are missing, will just drop the column

# still need to drop: diet, education, last_online, offspring, pets, speaks -> other columns were created based on this information

data = data.drop(columns=['diet', 'education', 'last_online', 'offspring', 'pets', 'speaks'])

#now drop columns where unknowns are more than 50%

cat_cols = data.select_dtypes(include=['object','category']).columns.tolist()
threshold = 0.5
high_unknown_cols = []
for col in cat_cols:
    p = (data[col] == 'unknown').mean()
    if p > threshold:
        high_unknown_cols.append(col)

#print(high_unknown_cols)


# drop dominant columns
def drop_dominant_columns(df, threshold=0.95):
    drop_cols = []
    for col in df.select_dtypes(include=['object', 'category']).columns:
        top_freq = df[col].value_counts(normalize=True).iloc[0]
        if top_freq > threshold:
            drop_cols.append(col)
    print('will drop ', drop_cols)
    return df.drop(columns=drop_cols), drop_cols

data, dropped = drop_dominant_columns(data, threshold=0.95)
print(f"Dropped columns due to dominance: {dropped}")

data = data.drop(columns=high_unknown_cols)

not_essay_cols = [col for col in data.columns if 'essay' not in col]
print(data[not_essay_cols].head())
print(data[not_essay_cols].info())

Index(['age', 'body_type', 'diet', 'drinks', 'drugs', 'education', 'essay0',
       'essay1', 'essay2', 'essay3', 'essay4', 'essay5', 'essay6', 'essay7',
       'essay8', 'essay9', 'ethnicity', 'height', 'income', 'job',
       'last_online', 'location', 'offspring', 'orientation', 'pets',
       'religion', 'sex', 'sign', 'smokes', 'speaks', 'status'],
      dtype='object')
will drop  ['location', 'status']
Dropped columns due to dominance: ['location', 'status']
   age body_type    drinks      drugs ethnicity  height             job  \
0   22      plus  moderate      never     asian    75.0  Transportation   
1   35   average     heavy  sometimes     white    70.0         Service   
2   38      slim  moderate    unknown   unknown    68.0           Other   
3   23      slim  moderate    unknown     white    71.0         Student   
4   29       fit  moderate      never     asian    66.0        Creative   

  orientation     religion sex      sign     smokes   diet_type  \
0    straight

In [2]:
#Helper module with funtions

def downsample(df, col):
    min_count = df[col].value_counts().min()
    balanced_df = pd.concat([
        df[df[col] == category].sample(n=min_count, random_state=1)
        for category in df[col].unique()
    ])
    return balanced_df

from sklearn.utils import resample

def upsample(df, target_col):
    # Split into majority and minority classes
    classes = df[target_col].value_counts().index
    max_count = df[target_col].value_counts().max()

    df_upsampled = []

    for cls in classes:
        df_class = df[df[target_col] == cls]
        df_class_upsampled = resample(
            df_class,
            replace=True,               # Sample with replacement
            n_samples=max_count,        # Match majority class
            random_state=42
        )
        df_upsampled.append(df_class_upsampled)

    return pd.concat(df_upsampled).sample(frac=1, random_state=42).reset_index(drop=True)  # shuffle

In [3]:
#check class imbalance

cols = data.select_dtypes(include=['object','category']).columns.tolist()
cols = [col for col in cols if 'essay' not in col]

for column in cols:
    print(data[column].value_counts(normalize = True)) #check if the classes are balanced

body_type
fit        0.416228
average    0.244609
plus       0.133604
slim       0.108333
unknown    0.097226
Name: proportion, dtype: float64
drinks
moderate       0.697383
light          0.099414
heavy          0.093902
non-drinker    0.054467
unknown        0.049590
very heavy     0.005245
Name: proportion, dtype: float64
drugs
never        0.629503
unknown      0.234838
sometimes    0.128977
often        0.006681
Name: proportion, dtype: float64
ethnicity
white               0.558735
asian               0.136828
unknown             0.094603
hispanic / latin    0.072990
black               0.051244
other               0.028328
indian              0.019976
middle eastern      0.013529
pacific islander    0.011942
native american     0.011825
Name: proportion, dtype: float64
job
Other               0.270482
STEM                0.159476
Business            0.150741
Creative            0.111607
Student             0.081392
Healthcare          0.061432
Education           0.058609
Servic

In [14]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import VarianceThreshold, RFE, SelectKBest, chi2
import seaborn as sns
from xgboost import XGBClassifier
from collections import Counter
#from imblearn.over_sampling import SMOTE

df = data[not_essay_cols].copy() #don't want to take into account essays for this

col_to_predict = 'income'

if col_to_predict == 'presence':
    map_presence = {
        'active': 'active',
        'not active': 'not active',
        'semi-active': 'not active'
    }
    df['presence'] = df['presence'].map(map_presence)

if col_to_predict == 'body_type':
    map_body = {
        'fit': 'fit',
        'average': 'not fit',
        'plus': 'not fit',
        'slim': 'not fit',
        'unknown': 'unknown'    
    }
    df['body_type'] = df['body_type'].map(map_body)


#remove rows where the target variable is unknown
df = df[df[col_to_predict] != 'unknown']

#create vars with categorical columns and numerical columns
cat_cols = df.select_dtypes(include=['object','category']).columns.tolist()
cat_cols = [col for col in cat_cols if col != col_to_predict]
num_cols = df.select_dtypes(include=['float64', 'int64']).columns.tolist()
num_cols = [col for col in num_cols if col != col_to_predict]

df_dummies = pd.get_dummies(df[cat_cols], drop_first = True)

x = pd.concat([df[num_cols], df_dummies], axis = 1)
y = df[col_to_predict]

print(df[col_to_predict].value_counts(normalize = True)) #check if the classes are balanced

le = LabelEncoder()
y = le.fit_transform(y)
print(le.classes_)

# ------------ Chi2 test ------------ #
# Chi2 requires all values to be non-negative
X_chi = x.copy()
X_chi[X_chi < 0] = 0  # Only necessary if you have negative values

selector = SelectKBest(score_func=chi2, k=10)  # Keep top 10 features
X_chi_selected = selector.fit_transform(X_chi, y)
selected_features = X_chi.columns[selector.get_support()]
print(f"Selected features from Chi2: {list(selected_features)}")

x = x[selected_features]

cat_cols = x.select_dtypes(include=['object','category']).columns.tolist()
num_cols = x.select_dtypes(include=['float64', 'int64']).columns.tolist()

# ------------ End of Chi2 test ------------ #

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=24, stratify=y)

scaler = StandardScaler()
x_train[num_cols] = scaler.fit_transform(x_train[num_cols])
x_test[num_cols] = scaler.transform(x_test[num_cols]) #never use fit transform on test data, because it will learn from train data

print(pd.Series(y_train).value_counts(normalize=True))

# ----- XGB Classifier ----- #
counter = Counter(y_train)
majority = max(counter.values())
minority = min(counter.values())
imbalance_ratio = majority / minority

model = XGBClassifier(
    eval_metric='logloss',
    scale_pos_weight=imbalance_ratio
)

param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

gs = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='f1_macro', verbose=1, n_jobs=-1)    

gs.fit(x_train, y_train)
y_pred = gs.predict(x_test)
print('Classification report for XGB Classifier:')
print(classification_report(y_test, y_pred))

# --------- Logistic Regression ------------- #

model = LogisticRegression(class_weight='balanced', solver = 'lbfgs', max_iter = 1000)
model.fit(x_train, y_train)
y_pred_model = model.predict(x_test)
model_score = classification_report(y_test, y_pred_model)
print('Classification report for LR:')
print(model_score)

## THROWS AN ERROR PREDICTING NUMERICAL COLUMNS: ---> 36 df = df[df[col_to_predict] != 'unknown']

KeyError: 'income'

In [6]:
#Stuff not used

'''
from sklearn.svm import SVC

model = SVC(class_weight='balanced')
print('1')
model.fit(x_train, y_train)
print('2')
y_pred = model.predict(x_test)
model_score = classification_report(y_test, y_pred)
print('Classification report SVC:')
print(model_score)

#DO FEATURE SELECTION
#DO HYPERPARAMETER TUNING
#DO CLASS MERGE (IF NEEDED)
'''


'''
model = LogisticRegression(class_weight='balanced', solver = 'lbfgs', max_iter = 1000)
model.fit(x_train, y_train)
y_pred_model = model.predict(x_test)
model_score = classification_report(y_test, y_pred_model)
print('Classification report for LR:')
print(model_score)
'''

'''
smote = SMOTE(random_state=24)
x_resampled, y_resampled = smote.fit_resample(x_train, y_train) #use SMOTE to make sure the classes are balanced
'''

'''
selector = VarianceThreshold(threshold=0.01)
x = selector.fit_transform(x)
'''

# -------- Apply downsampling to make sure the classes are balanced --------------
'''
# Convert y_train back to Series with the correct index
y_train_series = pd.Series(y_train, index=x_train.index, name=col_to_predict)

# Concatenate features and target
train_data = pd.concat([x_train, y_train_series], axis=1)

train_balanced = downsample(train_data, col_to_predict)
x_train = train_balanced.drop(columns = col_to_predict)
y_train = train_balanced[col_to_predict]
'''
# -------- Downsampling ends here --------------

# ----- Random forests ----- #
'''
rf = RandomForestClassifier(class_weight='balanced')
rf.fit(x_train, y_train)
y_pred_rf = rf.predict(x_test)
rf_score = classification_report(y_test, y_pred_rf)
print('Classification report for RF:')
print(rf_score)

importances = pd.Series(rf.feature_importances_, index=x_train.columns)
print(importances.sort_values(ascending=False).head(10))
'''

"\nrf = RandomForestClassifier(class_weight='balanced')\nrf.fit(x_train, y_train)\ny_pred_rf = rf.predict(x_test)\nrf_score = classification_report(y_test, y_pred_rf)\nprint('Classification report for RF:')\nprint(rf_score)\n\nimportances = pd.Series(rf.feature_importances_, index=x_train.columns)\nprint(importances.sort_values(ascending=False).head(10))\n"