# Prepare data

In [1]:
import pandas as pd

# Load the dataset that the model is fitted on further down.
adult_census = pd.read_csv("../datasets/adult-census-numeric.csv")

In [2]:
# Peek at the first rows to check the column names and the kind of values.
adult_census.head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,class
0,41,0,0,92,<=50K
1,48,0,0,40,<=50K
2,60,0,0,25,<=50K
3,37,0,0,45,<=50K
4,73,3273,0,40,<=50K


In [3]:
# The label to predict is stored in the "class" column; pull it out as a
# Series and display it.
target_name = "class"
target = adult_census[target_name]
target

0         <=50K
1         <=50K
2         <=50K
3         <=50K
4         <=50K
          ...  
39068     <=50K
39069     <=50K
39070      >50K
39071     <=50K
39072      >50K
Name: class, Length: 39073, dtype: object

In [4]:
# The features are every column except the label. `drop` returns a new frame,
# so `adult_census` itself still contains the "class" column.
data = adult_census.drop(columns=[target_name])
data.head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week
0,41,0,0,92
1,48,0,0,40
2,60,0,0,25
3,37,0,0,45
4,73,3273,0,40


In [5]:
# Names of the feature columns left after removing the label.
data.columns

Index(['age', 'capital-gain', 'capital-loss', 'hours-per-week'], dtype='object')

In [6]:
# Number of rows, i.e. samples.
data.shape[0]

39073

In [7]:
# Number of columns, i.e. features.
data.shape[1]

4

# Build model

In [8]:
from sklearn.neighbors import KNeighborsClassifier

In [9]:
# No arguments are passed, so the classifier keeps all of its default
# hyperparameters.
model = KNeighborsClassifier()

In [10]:
# Train on the features and labels prepared above. Binding the return value
# to `_` keeps the cell from echoing it as output.
_ = model.fit(data, target)

![Predictor fit diagram](../figures/api_diagram-predictor.fit.svg)

In [12]:
# Predict on the very same rows the model was fitted on, so anything computed
# from these predictions measures training performance only.
target_predicted = model.predict(data)

![Predictor predict diagram](../figures/api_diagram-predictor.predict.svg)

In [13]:
# First five predicted labels.
target_predicted[:5]

array([' >50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K'], dtype=object)

In [14]:
# First five true labels, for a side-by-side comparison with the predictions.
target[:5]

0     <=50K
1     <=50K
2     <=50K
3     <=50K
4     <=50K
Name: class, dtype: object

In [15]:
# Element-wise comparison: True wherever the prediction equals the true label.
target[:5] == target_predicted[:5]

0    False
1     True
2     True
3     True
4     True
Name: class, dtype: bool

In [16]:
# The mean of the boolean matches is the fraction of correct predictions,
# i.e. the accuracy on the data the model was fitted on.
(target == target_predicted).mean()

0.8242776341719346

In [17]:
# Load a separate file that was not passed to `fit`, to evaluate the model on
# rows it has not seen.
adult_census_test = pd.read_csv("../datasets/adult-census-numeric-test.csv")

In [18]:
# Preview the test rows; the column layout should match the training file.
adult_census_test.head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,class
0,20,0,0,35,<=50K
1,53,0,0,72,>50K
2,41,0,0,50,>50K
3,20,0,0,40,<=50K
4,25,0,0,40,<=50K


In [19]:
# Split the test frame the same way as the training frame: the label column
# goes to `target_test`, every other column to `data_test`.
target_test = adult_census_test[target_name]
data_test = adult_census_test.drop(columns=[target_name])

In [20]:
# Number of test samples (rows).
data_test.shape[0]

9769

In [21]:
# Number of test features (columns); it must equal the number used for fitting.
data_test.shape[1]

4

In [22]:
# `score` predicts on `data_test` and compares the result with `target_test`;
# for a classifier the returned value is the mean accuracy.
accuracy = model.score(data_test, target_test)
accuracy

0.8037670181185382

![Predictor score diagram](../figures/api_diagram-predictor.score.svg)

# Exercise solutions

In [23]:
from sklearn.neighbors import KNeighborsClassifier

KNeighborsClassifier?

[0;31mInit signature:[0m
[0mKNeighborsClassifier[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mn_neighbors[0m[0;34m=[0m[0;36m5[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mweights[0m[0;34m=[0m[0;34m'uniform'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0malgorithm[0m[0;34m=[0m[0;34m'auto'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mleaf_size[0m[0;34m=[0m[0;36m30[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mp[0m[0;34m=[0m[0;36m2[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmetric[0m[0;34m=[0m[0;34m'minkowski'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmetric_params[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mn_jobs[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Classifier implementing the k-nearest neighbors vote.

Read more in the :ref:`User Guide <classification>`.

Parameters
----------
n_

In [24]:
# Use 50 neighbors instead of the default of 5 shown in the signature above.
model = KNeighborsClassifier(n_neighbors=50)

In [25]:
import pandas as pd

# Reload the training file and separate the features from the "class" label.
adult_census = pd.read_csv("../datasets/adult-census-numeric.csv")
data = adult_census.drop(columns=["class"])
target = adult_census["class"]

In [26]:
# Train the 50-neighbor model. Binding the return value to `_` keeps the cell
# from echoing the estimator, matching how the earlier fit cell is written.
_ = model.fit(data, target)

In [27]:
# Predict for the first ten rows only and display the predicted labels.
first_data_values = data.iloc[:10]
first_predictions = model.predict(first_data_values)
first_predictions

array([' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K', ' <=50K',
       ' <=50K', ' >50K', ' <=50K', ' <=50K'], dtype=object)

In [28]:
# True labels for the same ten rows, to compare with `first_predictions`.
first_target_values = target.iloc[:10]
first_target_values

0     <=50K
1     <=50K
2     <=50K
3     <=50K
4     <=50K
5     <=50K
6     <=50K
7      >50K
8     <=50K
9      >50K
Name: class, dtype: object

In [29]:
# Accuracy on the same data the model was fitted on (training accuracy).
model.score(data, target)

0.8290379545978042

In [30]:
# Evaluate the 50-neighbor model on the separate test file: load it, split the
# "class" label from the features, then report the accuracy on those rows.
adult_census_test = pd.read_csv("../datasets/adult-census-numeric-test.csv")

target_test = adult_census_test["class"]
data_test = adult_census_test.drop(columns=["class"])

model.score(data_test, target_test)

0.8177909714402702