In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split 
from sklearn.metrics import confusion_matrix, accuracy_score

In [2]:
dataset = pd.read_csv('../data/facebook_dataset.csv', encoding = 'latin-1')
dataset.head()           #show first 5 rows

Unnamed: 0,name_wt,statuses_count,followers_count,friends_count,favourites_count,listed_count,label
0,0.1875,20370,5470,2385,145,52,0
1,0.142857,3131,506,381,9,40,0
2,1.0,4024,264,87,323,16,0
3,0.75,40586,640,622,1118,32,0
4,0.5,2016,62,64,13,0,0


In [3]:
#Combinig attributes into single list of tuples and using those features create a 2D matrix 

features = ['name_wt','statuses_count', 'followers_count', 'friends_count','favourites_count','listed_count']
data = dataset.as_matrix(columns = features)

  after removing the cwd from sys.path.


In [4]:
data

array([[1.87500000e-01, 2.03700000e+04, 5.47000000e+03, 2.38500000e+03,
        1.45000000e+02, 5.20000000e+01],
       [1.42857143e-01, 3.13100000e+03, 5.06000000e+02, 3.81000000e+02,
        9.00000000e+00, 4.00000000e+01],
       [1.00000000e+00, 4.02400000e+03, 2.64000000e+02, 8.70000000e+01,
        3.23000000e+02, 1.60000000e+01],
       ...,
       [9.16666667e-01, 2.00000000e+00, 0.00000000e+00, 1.50000000e+01,
        0.00000000e+00, 0.00000000e+00],
       [9.33333333e-01, 2.00000000e+00, 0.00000000e+00, 1.60000000e+01,
        0.00000000e+00, 0.00000000e+00],
       [9.33333333e-01, 0.00000000e+00, 0.00000000e+00, 1.70000000e+01,
        0.00000000e+00, 0.00000000e+00]])

In [5]:
print("Total instances : ", data.shape[0], "\nNumber of features : ", data.shape[1])

Total instances :  2818 
Number of features :  6


In [6]:
#convert label column into 1D arrray

label = np.array(dataset['label'])
label

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

## Test and Train Split

Using 80-20 split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.2, random_state=0)

In [8]:
print("Number of training instances: ", X_train.shape[0])

Number of training instances:  2254


In [9]:
print("Number of testing instances: ", X_test.shape[0])

Number of testing instances:  564


## Training the Model

In [10]:
# Generate the model
neighbors = 5
knn_model = KNeighborsClassifier(n_neighbors = neighbors)

# Train the model using the training sets
data = X_train
label = y_train

knn_model.fit(data, label)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

## Testing the Model

Now our model is ready. We will test our data against given labels.

In [11]:
#test set
X_test

array([[5.83333333e-01, 1.00000000e+00, 2.00000000e+00, 2.10000000e+01,
        0.00000000e+00, 0.00000000e+00],
       [7.64705882e-01, 4.93000000e+02, 3.00000000e+02, 3.10000000e+02,
        3.00000000e+00, 6.00000000e+00],
       [7.77777778e-01, 2.88080000e+04, 2.63800000e+03, 1.74300000e+03,
        1.02100000e+03, 3.90000000e+01],
       ...,
       [8.57142857e-01, 4.20000000e+01, 2.93000000e+02, 1.52700000e+03,
        0.00000000e+00, 6.00000000e+00],
       [4.54545455e-01, 3.00000000e+00, 1.00000000e+00, 4.20000000e+01,
        0.00000000e+00, 0.00000000e+00],
       [9.09090909e-01, 4.50000000e+01, 2.00000000e+01, 3.70000000e+02,
        0.00000000e+00, 0.00000000e+00]])

In [12]:
knn_model.predict([X_test[1]])    #testing for single instance

array([0], dtype=int64)

In [13]:
'''
   Now, apply the model to the entire test set and predict the label for each test example

'''       
       
y_predict = []                       #to store prediction of each test example

for test_case in range(len(X_test)): 
    label = knn_model.predict([X_test[test_case]])
    
    #append to the predictions list
    y_predict.append(np.asscalar(label))

#predictions

In [14]:
y_predict

[1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,


## Perormance evaluation of the Model

In [15]:
#true negatives is C(0,0), false negatives is C(1,0), false positives is C(0,1) and true positives is C(1,1) 
conf_matrix = confusion_matrix(y_test, y_predict)

In [16]:
#true_negative
TN = conf_matrix[0][0]
#false_negative
FN = conf_matrix[1][0]
#false_positive
FP = conf_matrix[0][1]
#true_positive
TP = conf_matrix[1][1]

In [17]:
# Recall is the ratio of the total number of correctly classified positive examples divided by the total number of positive examples. 
# High Recall indicates the class is correctly recognized (small number of FN)
recall = (TP)/(TP + FN)

In [18]:
# Precision is the the total number of correctly classified positive examples divided by the total number of predicted positive examples. 
# High Precision indicates an example labeled as positive is indeed positive (small number of FP)
precision = (TP)/(TP + FP)

In [19]:
fmeasure = (2*recall*precision)/(recall+precision)
accuracy = (TP + TN)/(TN + FN + FP + TP)
#accuracy_score(y_test, y_predict)

In [20]:
print("------ CLASSIFICATION PERFORMANCE OF THE NAIVE BAYES MODEL ------ "\
      "\n Recall : ", (recall*100) ,"%" \
      "\n Precision : ", (precision*100) ,"%" \
      "\n Accuracy : ", (accuracy*100) ,"%" \
      "\n F-measure : ", (fmeasure*100) ,"%" )



------ CLASSIFICATION PERFORMANCE OF THE NAIVE BAYES MODEL ------ 
 Recall :  98.83268482490273 %
 Precision :  97.31800766283524 %
 Accuracy :  98.22695035460993 %
 F-measure :  98.06949806949807 %
