# Classification

## Logistic Regression

In [7]:
def logistic_mod(df, logProb = 1.0):
    """Fit a logistic regression on columns 'x' and 'y' and label every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the feature columns 'x' and 'y' and the label column 'z'.
        The frame is modified in place: a 'predicted' column is added
        (or overwritten).
    logProb : float, default 1.0
        Threshold compared against the ratio of the two log-probabilities
        returned by the model (second column divided by first column).
        A row is labelled 1 when ``logProb`` is strictly greater than that
        ratio, otherwise 0.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, now with 'predicted'.
    """
    from sklearn import linear_model

    ## Prepare data for model
    # `.values` replaces `as_matrix()`, which was removed in pandas 1.0.
    # Selecting two columns already yields an (nrow, 2) array, so no reshape.
    X = df[['x', 'y']].values
    Y = df['z'].values.ravel()

    ## Compute the logistic regression model
    logr = linear_model.LogisticRegression().fit(X, Y)

    ## Compute the predicted labels
    log_proba = logr.predict_log_proba(X)
    # NOTE: this is a ratio of log-probabilities, not a log-odds difference.
    # Both logs are <= 0, so (assuming z holds the labels 0 and 1) the default
    # threshold of 1.0 labels a row 1 when class 1 is the more probable class.
    ratio = log_proba[:, 1] / log_proba[:, 0]
    df['predicted'] = (logProb > ratio).astype(int)
    return df

def eval_logistic(df):
    """Plot and summarise the binary predictions stored in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'x' and 'y' (plot coordinates), 'z' (true label) and
        'predicted' (predicted label). Only rows whose 'predicted' value is
        0 or 1 are counted.

    Draws a scatter plot of the four outcome groups (blue = correct,
    red = wrong; '+' = actual positive, 'o' = actual negative), then prints
    the confusion matrix, accuracy, precision and recall. A metric whose
    denominator is zero is reported as NaN instead of raising
    ZeroDivisionError.

    Returns
    -------
    str
        Always the string 'Done'.
    """
    import matplotlib.pyplot as plt
    import pandas as pd

    def ratio(numerator, denominator):
        # Undefined metric (e.g. no positive predictions) -> NaN, not a crash.
        if denominator == 0:
            return float('nan')
        return float(numerator) / float(denominator)

    predicted_pos = df['predicted'] == 1
    predicted_neg = df['predicted'] == 0
    correct = df['z'] == df['predicted']
    wrong = df['z'] != df['predicted']

    truePos = df[predicted_pos & correct]
    falsePos = df[predicted_pos & wrong]
    trueNeg = df[predicted_neg & correct]
    falseNeg = df[predicted_neg & wrong]

    fig = plt.figure(figsize=(5, 5))
    ax = fig.gca()
    # (rows, colour, marker, marker size) for each outcome group.
    groups = [(truePos, 'DarkBlue', '+', 80),
              (falsePos, 'Red', 'o', 40),
              (trueNeg, 'DarkBlue', 'o', 40),
              (falseNeg, 'Red', '+', 80)]
    for rows, color, marker, size in groups:
        # Skip empty groups: some pandas versions refuse to plot empty frames.
        if rows.empty:
            continue
        rows.plot(kind = 'scatter', x = 'x', y = 'y', ax = ax,
                  alpha = 1.0, color = color, marker = marker, s = size)
    ax.set_xlabel('X')
    ax.set_ylabel('Y')
    ax.set_title('Classes vs X and Y')

    TP = truePos.shape[0]
    FP = falsePos.shape[0]
    TN = trueNeg.shape[0]
    FN = falseNeg.shape[0]

    # Rows are the actual class, columns the predicted class.
    confusion = pd.DataFrame({'Positive': [FP, TP],
                              'Negative': [TN, FN]},
                               index = ['TrueNeg', 'TruePos'])
    accuracy = ratio(TP + TN, TP + TN + FP + FN)
    precision = ratio(TP, TP + FP)
    recall = ratio(TP, TP + FN)

    print(confusion)
    # The label spelling below is kept as-is so the printed output is unchanged.
    print('accracy = ' + str(accuracy))
    print('precision = ' + str(precision))
    print('recall = ' + str(recall))

    return 'Done'

## K Nearest Neighbors Classification
scikit-learn prepares (indexes) your data using structures such as k-d trees. Prediction is based on a majority vote, so with binary classification it is probably best to use an odd number of neighbors (k) to avoid ties.

Choose the best-fitting distance measure. Be careful with an **imbalanced labeled dataset** when choosing your distance measure: in this case, uniform or user-defined weighting may be better.

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import euclidean_distances

In [9]:
# Ten single-feature samples, 0 through 9; the function we're training is " >4 ".
data = [[value] for value in range(10)]
labels = [0] * 5 + [1] * 5
# Hold out half of the samples; the fixed random_state keeps the split repeatable.
data_train, data_test, label_train, label_test = train_test_split(
    data,
    labels,
    test_size=0.5,
    random_state=7,
)

In [10]:
# Build the 3-neighbour classifier and train it on the training half in one step;
# fit() returns the estimator itself, so `model` is the fitted classifier.
model = KNeighborsClassifier(n_neighbors=3).fit(data_train, label_train)
model

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [11]:
# Predict a class label for every held-out sample and display the result.
predictions = model.predict(data_test)
predictions

array([1, 0, 0, 0, 0])

In [12]:
# Per-class probabilities for each held-out sample (one row per sample, one column per class).
# With uniform weights and 3 neighbours these are neighbour vote fractions, i.e. multiples of 1/3.
model.predict_proba(data_test)

array([[ 0.        ,  1.        ],
       [ 0.66666667,  0.33333333],
       [ 0.66666667,  0.33333333],
       [ 0.66666667,  0.33333333],
       [ 0.66666667,  0.33333333]])

In [13]:
# Model evaluation: for classifiers, score() reports the mean accuracy
# of the predictions on the held-out samples against their true labels.
model.score(data_test, label_test)

0.80000000000000004