<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"></ul></div>

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt

In [2]:
# Load the raw profiles table from 'profiles.csv'; the path is relative, so the
# file must be in the notebook's working directory.
df = pd.read_csv('profiles.csv')

In [None]:
# Build the binary target column 'offspring_label' from the 'offspring' answers.
# Answers are matched exactly as written here, including the HTML entity
# &rsquo; used for apostrophes. Rows whose answer appears in neither group are
# never assigned a label and are left missing.

# Answers that state the person wants (more) children -> label 1.
wants_kids_answers = (
    'doesn&rsquo;t have kids, but wants them',
    'wants kids',
    'has a kid, and wants more',
    'has kids, and wants more',
)

# Answers that state the person does not want (more) children -> label 0.
no_kids_answers = (
    'doesn&rsquo;t want kids',
    'doesn&rsquo;t have kids, and doesn&rsquo;t want any',
    'has kids, but doesn&rsquo;t want more',
    'has a kid, but doesn&rsquo;t want more',
)

for label_value, answers in ((1, wants_kids_answers), (0, no_kids_answers)):
    for answer in answers:
        df.loc[df['offspring'] == answer, 'offspring_label'] = label_value


In [None]:
# One-hot encode the lifestyle/demographic answers into indicator columns named
# '<column>_<value>'. 'age' and 'income' are included in the selection as well;
# a later cell reads them back under their plain names, so they are presumably
# numeric and pass through unencoded -- TODO confirm dtypes.
df2 = pd.get_dummies(
    data=df.loc[:, [
        'drinks', 'drugs', 'sex', 'orientation',
        'smokes', 'job', 'age', 'income',
    ]]
)


In [None]:
# Columns to strip from df, in place.
features_to_remove = [
    # profile attributes that are not used as model features
    'body_type', 'diet', 'education', 'ethnicity', 'height', 'status',
    'location', 'sign', 'pets', 'religion', 'speaks', 'last_online',
    # free-text essay answers
    'essay0', 'essay1', 'essay2', 'essay3', 'essay4',
    'essay5', 'essay6', 'essay7', 'essay8', 'essay9',
    # raw columns whose processed versions are held in df2
    'drinks', 'drugs', 'sex', 'orientation', 'smokes', 'job', 'age', 'income',
    # raw answer that 'offspring_label' was derived from
    'offspring',
]
df.drop(columns=features_to_remove, inplace=True)

In [None]:
# Attach the columns of df2 to df (DataFrame.join aligns on the row index).
# NOTE(review): this cell is not safe to re-run -- once df already contains the
# df2 columns, joining again should fail on overlapping column names. Restart
# the kernel and run all cells instead.
df = df.join(df2)

In [None]:
# Keep only labelled profiles: rows with a missing 'offspring_label' are
# removed from df in place.
df.dropna(subset=['offspring_label'], inplace=True)


In [None]:
from sklearn.preprocessing import scale

# Target vector: the binary label built from the 'offspring' answers.
labels = df['offspring_label']

# Feature matrix: the two plain columns plus every indicator column, listed
# explicitly so the column order is fixed.
data = df.loc[:, [
    'age', 'income',
    # drinking
    'drinks_desperately', 'drinks_not at all', 'drinks_often',
    'drinks_rarely', 'drinks_socially', 'drinks_very often',
    # drugs
    'drugs_never', 'drugs_often', 'drugs_sometimes',
    # sex
    'sex_f', 'sex_m',
    # orientation
    'orientation_bisexual', 'orientation_gay', 'orientation_straight',
    # smoking
    'smokes_no', 'smokes_sometimes', 'smokes_trying to quit',
    'smokes_when drinking', 'smokes_yes',
    # job
    'job_artistic / musical / writer',
    'job_banking / financial / real estate',
    'job_clerical / administrative',
    'job_computer / hardware / software',
    'job_construction / craftsmanship',
    'job_education / academia',
    'job_entertainment / media',
    'job_executive / management',
    'job_hospitality / travel',
    'job_law / legal services',
    'job_medicine / health',
    'job_military',
    'job_other',
    'job_political / government',
    'job_rather not say',
    'job_retired',
    'job_sales / marketing / biz dev',
    'job_science / tech / engineering',
    'job_student',
    'job_transportation',
    'job_unemployed',
]]

# Normalize each feature column (axis=0 scales column-wise).
# NOTE(review): the scaling is computed over every row before the train/test
# split performed in a later cell, so held-out rows influence the scaling
# statistics. Consider fitting the scaler on the training rows only.
scaled_data = scale(data, axis=0)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
(train_data, test_data,
 train_labels, test_labels) = train_test_split(
    scaled_data,
    labels,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Optional runtime benchmark, left disabled. Uncommenting the lines below fits a
# single KNN classifier with 42 neighbours on the training split and uses the
# IPython %timeit magic to time scoring and predicting on the test split.

#from sklearn.neighbors import KNeighborsClassifier

#classifier = KNeighborsClassifier(n_neighbors = 42)
#classifier.fit(train_data, train_labels)
#%timeit classifier.score(test_data, test_labels)
#%timeit classifier.predict(test_data)


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Neighbourhood sizes to evaluate: k = 1 .. 199.
k_values = range(1, 200)

# One score per k, appended in the same order as k_values.
accuracy = []
recall = []
precision = []
f1 = []
for k in k_values:
    # Fit a fresh classifier for this k and score its predictions on the
    # held-out split.
    classifier = KNeighborsClassifier(n_neighbors = k)
    classifier.fit(train_data, train_labels)
    predicted = classifier.predict(test_data)
    accuracy.append(accuracy_score(test_labels, predicted))
    recall.append(recall_score(test_labels, predicted))
    precision.append(precision_score(test_labels, predicted))
    f1.append(f1_score(test_labels, predicted))

# Pair every k with its score and rank the pairs best-first. This is done once,
# after the sweep, instead of being rebuilt and re-sorted on every iteration.
# sorted() is stable, so tied scores keep ascending-k order and the smallest k
# among the best is reported.
k_accuracy = list(zip(k_values, accuracy))
k_recall = list(zip(k_values, recall))
k_precision = list(zip(k_values, precision))
k_f1 = list(zip(k_values, f1))
best_accuracy = sorted(k_accuracy, key=lambda tup: tup[1], reverse = True)
best_recall = sorted(k_recall, key=lambda tup: tup[1], reverse = True)
best_precision = sorted(k_precision, key=lambda tup: tup[1], reverse = True)
best_f1 = sorted(k_f1, key=lambda tup: tup[1], reverse = True)

# NOTE(review): the "best" k is picked using scores measured on the test split,
# so these figures are optimistic estimates of performance on unseen data.
print('Best Accuracy (k, score):', best_accuracy[0])
print('Best Recall (k, score):', best_recall[0])
print('Best Precision (k, score):', best_precision[0])
print('Best F1 (k, score):', best_f1[0])



In [None]:
# Draw all four metric curves against k on one set of axes. The curves are
# plotted in the same order as the legend entries below.
fig, ax = plt.subplots(figsize=(10, 8))
for scores in (accuracy, recall, precision, f1):
    ax.plot(range(1, 200), scores)
ax.set_xlabel('K-values')
ax.set_ylabel('Scores')
ax.legend(['Accuracy', 'Recall', 'Precision', 'F1'], loc='lower right')
plt.show()


