In [10]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt
from sklearn import linear_model

In [11]:
def small_cells(df, x_step , y_step):
    min_x = np.min(df['x'])
    min_y = np.min(df['y'])
    num_x = 2.0/x_step
    num_y = 2.0/y_step
    print min_x, min_y, num_x, num_y
    xs = df['x'].values
    ys = df['y'].values
    cell_id = []
    for i in range(np.shape(xs)[0]):
        id = int(int((xs[i]-min_x)/x_step) * num_y + int((ys[i]-min_y)/y_step))
        #print id
        cell_id.append(id)
    df['cell_id'] = cell_id
    return df

In [12]:
def process_one_cell(train, test, cell_id):
    train = train.loc[train.cell_id == cell_id]
    test = test.loc[test.cell_id == cell_id]
    test_index = test.index
    
    le = preprocessing.LabelEncoder()
    Y = le.fit_transform(train.place_id.values)
    X = train.drop(['place_id','time','cell_id'], axis=1).values.astype(int)
    test = test.drop(['place_id','time','cell_id'], axis = 1).values.astype(int)
    
    #Applying the classifier
    clf = linear_model.LogisticRegression()
    clf.fit(X, Y)
    pred = clf.predict(test)
    pred_labels = le.inverse_transform(pred)    
    return pred_labels, test_index

In [13]:
def process_all(train, test, cells):
    pred = np.zeros(np.shape(test)[0])
    truth = test.place_id.values
    for i in range(cells):
        pred_labels, index = process_one_cell(train, test, i)
        pred[index] = pred_labels
        print pred
    correct = (truth == pred)
    accuracy = np.mean(correct)
    return accuracy

In [14]:
base_path = "/Users/ludai/Desktop/Github/fda/Assignment3/src/"
x_step = 0.25
y_step = 0.125
#fw = [3000, 4200, 100, 80, 0.1, 30, 100, 1] #feature weights
fw = [500, 1000, 4, 3, 1./22., 2, 10]
#fw = [1, 1, 1, 1, 1, 1, 1]
th = 5 #Keeping place_ids with more than th samples.
accuracy = np.zeros(9, dtype=np.float)
for index in range(1):
    string  =  'subset%s.csv' % index
    data_path = base_path + string
    #print data_path
    df = pd.read_csv(data_path, usecols=['row_id','x','y','time','place_id'], index_col = 0)
    print np.shape(df)
    df = small_cells(df,x_step,y_step)
    #print df['cell_id'].value_counts()
    
    initial_date = np.datetime64('2014-01-01T01:01', dtype='datetime64[m]') 
    d_times = pd.DatetimeIndex(initial_date + np.timedelta64(t, 'm') for t in df.time.values)
    #get info from time field
    df['hour'] = d_times.hour * fw[2]
    df['weekday'] = d_times.weekday * fw[3]
    df['day'] = (d_times.dayofyear * fw[4]).astype(int)
    df['month'] = d_times.month * fw[5]
    df['year'] = (d_times.year - 2013) * fw[6]
    #df = df.drop(['time'], axis=1) 
    df.x = df.x.values * fw[0]
    df.y = df.y.values * fw[1]

    place_counts = df.place_id.value_counts()
    #print np.shape(place_counts)
    #drop rows with place_id which has samples < th
    mask = (place_counts[df.place_id.values] >= th).values
    #print np.shape(mask)
    df = df.loc[mask]

    #split data into training and testing sets
    #df_train = df.sample(frac=0.7)
    #df_test = df.loc[~df.index.isin(df_train.index)]
    df_test = df[df['time'] > 600000].sample(n=200000)
    df_train = df.loc[~df.index.isin(df_test.index)]
    
    df_test.index = [i for i in range(np.shape(df_test)[0])]

    print np.shape(df), np.shape(df_train), np.shape(df_test)
    accuracy[index] = process_all(df_train, df_test, 128)
    print accuracy[index]
print accuracy
    

(1048575, 4)
3.3312 4.1807 8.0 16.0
(1033175, 10) (833175, 10) (200000, 10)
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.  0.]
[ 0.  0.  0. ...,  0.  0.