# Imports

In [1]:
from sklearn.metrics import f1_score
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Train

In [2]:
train_set = pd.read_csv('train/train_encoded.csv')

In [19]:
X, y = train_set.iloc[:,:-1],train_set.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# KNN

In [20]:
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)
preds = model.predict(X_test)
f1 = np.sqrt(f1_score(y_test, preds))
print("F1-Mean Score: %f" % (f1))

F1-Mean Score: 0.726680


In [21]:
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)
preds = model.predict(X_test)
f1 = np.sqrt(f1_score(y_test, preds))
print("F1-Mean Score: %f" % (f1))

F1-Mean Score: 0.753385


In [22]:
model = KNeighborsClassifier(n_neighbors=1)
model.fit(X_train, y_train)
preds = model.predict(X_test)
f1 = np.sqrt(f1_score(y_test, preds))
print("F1-Mean Score: %f" % (f1))

F1-Mean Score: 0.739210


In [23]:
model = KNeighborsClassifier(n_neighbors=3, weights='distance') #metric='euclidean'
model.fit(X_train, y_train)
preds = model.predict(X_test)
f1 = np.sqrt(f1_score(y_test, preds))
print("F1-Mean Score: %f" % (f1))

F1-Mean Score: 0.730225


In [24]:
model = KNeighborsClassifier(n_neighbors=3, weights='distance', metric='euclidean')
model.fit(X_train, y_train)
preds = model.predict(X_test)
f1 = np.sqrt(f1_score(y_test, preds))
print("F1-Mean Score: %f" % (f1))

F1-Mean Score: 0.730225


In [25]:
model = KNeighborsClassifier(n_neighbors=3, metric='euclidean')
model.fit(X_train, y_train)
preds = model.predict(X_test)
f1 = np.sqrt(f1_score(y_test, preds))
print("F1-Mean Score: %f" % (f1))

F1-Mean Score: 0.726680


In [26]:
model = KNeighborsClassifier(n_neighbors=3, metric='manhattan')
model.fit(X_train, y_train)
preds = model.predict(X_test)
f1 = np.sqrt(f1_score(y_test, preds))
print("F1-Mean Score: %f" % (f1))

F1-Mean Score: 0.746232


In [27]:
model = KNeighborsClassifier(n_neighbors=17, metric='manhattan')
model.fit(X_train, y_train)
preds = model.predict(X_test)
f1 = np.sqrt(f1_score(y_test, preds))
print("F1-Mean Score: %f" % (f1))

F1-Mean Score: 0.762105


In [28]:
model = KNeighborsClassifier(n_neighbors=17, weights='distance', metric='manhattan')
model.fit(X_train, y_train)
preds = model.predict(X_test)
f1 = np.sqrt(f1_score(y_test, preds))
print("F1-Mean Score: %f" % (f1))

F1-Mean Score: 0.771071


In [29]:
model = KNeighborsClassifier(n_neighbors=17, algorithm='brute')
model.fit(X_train, y_train)
preds = model.predict(X_test)
f1 = np.sqrt(f1_score(y_test, preds))
print("F1-Mean Score: %f" % (f1))

F1-Mean Score: 0.740172


# Grid Search

In [14]:
n_neighbors = []
for i in range(1, 200, 2):
    n_neighbors.append(i)

params = {
    'n_neighbors': n_neighbors,
    'weights': ['distance'],
    'metric': ['manhattan']
}

In [15]:
grid_model = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=params)
grid_model.fit(X_train, y_train)

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'metric': ['manhattan'],
                         'n_neighbors': [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21,
                                         23, 25, 27, 29, 31, 33, 35, 37, 39, 41,
                                         43, 45, 47, 49, 51, 53, 55, 57, 59, ...],
                         'weights': ['distance']})

In [16]:
grid_model.best_params_

{'metric': 'manhattan', 'n_neighbors': 131, 'weights': 'distance'}

In [17]:
preds = grid_model.predict(X_test)
f1 = np.sqrt(f1_score(y_test, preds))
print("F-Mean Score: %f" % (f1))

F-Mean Score: 0.773569


In [18]:
model = KNeighborsClassifier(n_neighbors=49, metric='manhattan', weights='distance')
model.fit(X_train, y_train)
preds = model.predict(X_test)
f1 = np.sqrt(f1_score(y_test, preds))
print("F1-Mean Score: %f" % (f1))

F1-Mean Score: 0.762900


Best Score w/ 0.1: 78,2

49, manhattan, distance

# Test

In [40]:
test_set = pd.read_csv('test/test_encoded.csv')

In [41]:
col_id = test_set['id']
test_set.drop('id', axis=1, inplace=True)

In [42]:
preds = model.predict(test_set)
preds

array([0, 0, 0, ..., 0, 0, 1])

In [43]:
test_set['target'] = preds
test_set['id'] = col_id
test_set[['id', 'target']]

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0
...,...,...
3258,10861,0
3259,10865,0
3260,10868,0
3261,10874,0


### Resultado

In [44]:
test_set[['id', 'target']].to_csv('result.csv', header=True, index=False)