In [1]:
import pandas as pd

print('Loading data ...')
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
print('Done!')

df_train.head()

Loading data ...
Done!


Unnamed: 0,row_id,x,y,accuracy,time,place_id
0,0,0.7941,9.0809,54,470702,8523065625
1,1,5.9567,4.7968,13,186555,1757726713
2,2,8.3078,7.0407,74,322648,1137537235
3,3,7.3665,2.5165,65,704587,6567393236
4,4,4.0961,1.1307,31,472130,7440663949


In [2]:
import numpy as np

n_cell_x = 20
n_cell_y = 20

print('Preparing data ...')
def prepare_data(df, n_cell_x, n_cell_y):
    
    size_x = 10.0/n_cell_x
    size_y = 10.0/n_cell_y
    
    eps = 0.0001
    xs = np.where(df.x.values < eps, 0, df.x.values - eps)
    ys = np.where(df.y.values < eps, 0, df.y.values - eps)
    pos_x = (xs / size_x).astype(np.int)
    pos_y = (ys / size_y).astype(np.int)
    df['grid_cell'] = pos_y * n_cell_x + pos_x
    df['hour'] = ( df['time']%(60*24) )//60
    df['day'] = (df['time']%(60*24*7))//(60*24)
    df['month'] = (df['time']%(60*24*30*12))//(60*24*30)
    accuracy = np.where(df.accuracy.values > 1000, 1000, df.accuracy.values)
    df['accuracy'] = (accuracy/1000).astype(np.float)
    df.drop(['time'], axis = 1, inplace = True)
    return df

df_train = prepare_data(df_train, n_cell_x, n_cell_y)
df_test = prepare_data(df_test, n_cell_x, n_cell_y)
print('Done!')
df_train.head()

Preparing data ...
Done!


Unnamed: 0,row_id,x,y,accuracy,place_id,grid_cell,hour,day,month
0,0,0.7941,9.0809,0.054,8523065625,361,21,4,10
1,1,5.9567,4.7968,0.013,1757726713,191,13,3,4
2,2,8.3078,7.0407,0.074,1137537235,296,1,0,7
3,3,7.3665,2.5165,0.065,6567393236,114,7,6,4
4,4,4.0961,1.1307,0.031,7440663949,48,20,5,10


In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier

def process_one_cell(df_train, df_test, grid_id, th):
    
    """   
    Classification inside one grid cell.
    """   
    
    #Working on df_train
    df_cell_train = df_train.loc[df_train.grid_cell == grid_id]
    place_counts = df_cell_train.place_id.value_counts()
    mask = (place_counts[df_cell_train.place_id.values] >= th).values
    df_cell_train = df_cell_train.loc[mask]

    #Working on df_test
    df_cell_test = df_test.loc[df_test.grid_cell == grid_id]
    row_ids = df_cell_test.index
    
    #Preparing data
    le = LabelEncoder()
    y = le.fit_transform(df_cell_train.place_id.values)
    X = df_cell_train.drop(['place_id', 'grid_cell'], axis=1)
    X_test = df_cell_test.drop(['grid_cell'], axis = 1)
    
    #Applying the classifier
    clf = KNeighborsClassifier(n_neighbors=25, weights='distance', 
                               metric='manhattan')
    clf.fit(X, y)
    y_pred = clf.predict_proba(X_test)
    pred_labels = le.inverse_transform(np.argsort(y_pred, axis=1)[:,::-1][:,:3])    
    return pred_labels, row_ids


def process_grid(df_train, df_test, th, n_cells):
    
    preds = np.zeros((df_test.shape[0],3), dtype = int)
    for g_id in range(n_cells):
        if g_id%50 == 0:
            print('iteration: %s' %(g_id))
        
        pred_labels, row_ids = process_one_cell(df_train, df_test, g_id, th)
        
        preds[row_ids] = pred_labels
    
    print(preds[:3])
    
    print('Generating submission file ...')
    
    #Auxiliary dataframe with the 3 best predictions for each sample
    df_aux = pd.DataFrame(preds, dtype = str, columns = ['l1','l2','l3'])
    
    #Concatenating the 3 predictions for each sample
    ds_sub = df_aux.l1.str.cat([df_aux.l2, df_aux.l3], sep=' ')
    
    #Writting to csv
    ds_sub.name = 'place_id'
    ds_sub.to_csv('sub_knn.csv', index=True, header=True, index_label='row_id')   
    
    print('Done!')

#Solving classification problems inside each grid cell
th = 10 #Keeping place_ids with more than th samples.   
process_grid(df_train, df_test, th, n_cell_x*n_cell_y)

iteration: 0
iteration: 50
iteration: 100
iteration: 150
iteration: 200
iteration: 250
iteration: 300
iteration: 350
[[9841775341 7819741846 5059349793]
 [8637714354 1418554031 5801802830]
 [3728772580 6440733146 3149151507]]
Generating submission file ...
