In [1]:
import os  
import pandas as pd  
import numpy as np  
import matplotlib.pyplot as plt  
import seaborn as sns  

from sklearn.neighbors import KNeighborsClassifier  
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score


# basado muy fuertemente en este artículo:
#     https://lukesingham.com/whos-going-to-leave-next/

In [2]:
# Both CSVs are read with the same options and indexed by `person`.
csv_opts = dict(low_memory=False, index_col='person')
df1 = pd.read_csv('data/features_basicas.csv', **csv_opts)
labels = pd.read_csv('data/labels_training_set.csv', **csv_opts)

# df_unidos is a second name for df1 (same object, no copy is made).
df_unidos = df1
# Inner join: keep only the persons that appear in both the features and the
# labels. Despite its name, df_test is the labelled frame that gets split into
# train and hold-out sets further down.
df_test = df_unidos.join(labels, on='person', how='inner')

In [3]:
# Model inputs: every column of the labelled frame except the target.
features = list(df_test.columns)
features.remove('label')  # raises ValueError if there is no 'label' column
print(features)

['cant_conversions', 'cant_checkouts', 'cant_viewed_product', 'cant_searched_product', 'total_sesiones', 'promedio_eventos_por_sesion']


## Entrenando el KNN

In [4]:
# 3-nearest-neighbours classifier; closer neighbours get a larger vote
# (weights='distance') and all CPU cores are used (n_jobs=-1).
# NOTE(review): the features are fed to a distance-based model without any
# scaling -- consider standardising them; TODO confirm whether it helps.
knn = KNeighborsClassifier(n_neighbors=3, weights='distance', n_jobs=-1)

# Hold out 20% of the labelled users for evaluation.
# - random_state pins the shuffle so Restart & Run All reproduces the metrics.
# - stratify keeps the label ratio identical in both splits; the positive
#   class is rare (178 of 3883 hold-out rows in the recorded run), so an
#   unstratified split can leave the hold-out with very few positives.
x_train, x_test, y_train, y_test = train_test_split(
    df_test[features],
    df_test['label'],
    test_size=0.20,
    random_state=42,
    stratify=df_test['label'],
)

In [5]:
# Fit on the training split (fit returns the estimator itself) and display the
# accuracy on the hold-out split as the cell output.
# Accuracy alone says little here: in the recorded run the classification
# report shows 0.01 recall for class 1.
knn.fit(x_train, y_train).score(x_test, y_test)

0.9454030388874581

In [6]:
# Class probabilities and hard predictions for the hold-out split, followed by
# the per-class precision / recall / f1 table.
y_proba = knn.predict_proba(x_test)
y_pred = knn.predict(x_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

             precision    recall  f1-score   support

          0       0.95      0.99      0.97      3705
          1       0.03      0.01      0.01       178

avg / total       0.91      0.95      0.93      3883



In [7]:
# Confusion matrix of the hold-out predictions: rows are the true class,
# columns are the predicted class.
cm = confusion_matrix(y_test, y_pred)
print()

# Name the four cells once instead of indexing the matrix in every print.
tn, fp = cm[0, 0], cm[0, 1]
fn, tp = cm[1, 0], cm[1, 1]

# Row sums give the true class counts, column sums the predicted counts.
print('true 0s: {}'.format(tn + fp))
print('pred 0s: {}'.format(tn + fn))
print('true 1s: {}'.format(fn + tp))
print('pred 1s: {}'.format(fp + tp))


true 0s: 3705
pred 0s: 3847
true 1s: 178
pred 1s: 36


In [8]:
# y_proba_df = pd.DataFrame(y_proba)
# y_proba_df.columns = ['not_buy', 'buy']
# y_proba_df.sort_values(ascending=False, by='buy').head()

### Levanto todo lo necesario para hacer el submit

In [9]:
# Load the Kaggle test users (indexed by person) and attach their features.
df_submit = pd.read_csv('data/trocafone_kaggle_test.csv', low_memory=False, index_col='person')
df_events = df_submit.join(df_unidos, how='inner')

# The inner join silently drops every test user that has no row in df_unidos.
# The submission needs one prediction per test user, so fail here with a clear
# message instead of failing (or mis-assigning) when the labels are attached.
missing_persons = df_submit.index.difference(df_events.index)
if len(missing_persons) > 0:
    raise ValueError(
        '{} test users have no features in df_unidos'.format(len(missing_persons))
    )

In [10]:
# Score the Kaggle users with the fitted KNN.
# Select the training columns explicitly (same names, same order) so that any
# extra column present in df_events can never reach the model.
kaggle_pred = knn.predict_proba(df_events[features])

# predict_proba returns one column per class, ordered as knn.classes_; with
# labels 0/1 the second column is the probability of label 1.
kaggle_proba = kaggle_pred[:, 1]

# Index the probabilities by person so the assignment aligns on the index
# rather than relying on df_events and df_submit having identical row order
# and length. A test user missing from df_events ends up with NaN.
series = pd.Series(kaggle_proba, index=df_events.index)
df_submit['label'] = series

In [11]:
# Write the submission file; the `person` index is written as the first column.
df_submit.to_csv('submit.csv', index=True)