In [2]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split 
from sklearn.metrics import confusion_matrix, accuracy_score

In [3]:
dataset = pd.read_csv('data/facebook_dataset.csv', encoding = 'latin-1')
dataset.head()           #show first 5 rows

Unnamed: 0,name_wt,statuses_count,followers_count,friends_count,favourites_count,listed_count,label
0,0.2,2177,208,332,265,1,0
1,0.4,2660,330,485,3972,5,0
2,0.375,1254,166,177,1185,0,0
3,0.176471,202968,2248,981,60304,101,0
4,0.125,82,21,79,5,0,0


In [4]:
#Combinig attributes into single list of tuples and using those features create a 2D matrix 

features = ['name_wt','statuses_count', 'followers_count', 'friends_count','favourites_count','listed_count']
data = dataset.as_matrix(columns = features)

  after removing the cwd from sys.path.


In [5]:
data

array([[2.000e-01, 2.177e+03, 2.080e+02, 3.320e+02, 2.650e+02, 1.000e+00],
       [4.000e-01, 2.660e+03, 3.300e+02, 4.850e+02, 3.972e+03, 5.000e+00],
       [3.750e-01, 1.254e+03, 1.660e+02, 1.770e+02, 1.185e+03, 0.000e+00],
       ...,
       [4.375e-01, 1.700e+01, 5.300e+02, 3.000e+02, 0.000e+00, 8.000e+00],
       [4.375e-01, 2.000e+00, 2.470e+02, 2.020e+02, 0.000e+00, 1.000e+00],
       [3.500e-01, 4.700e+01, 2.670e+02, 2.340e+02, 0.000e+00, 7.000e+00]])

In [6]:
print("Total instances : ", data.shape[0], "\nNumber of features : ", data.shape[1])

Total instances :  6102 
Number of features :  6


In [7]:
#convert label column into 1D arrray

label = np.array(dataset['label'])
label

array([0, 0, 0, ..., 1, 1, 1])

## Test and Train Split

Using 80-20 split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(data, label, test_size=0.2, random_state=0)

In [9]:
print("Number of training instances: ", X_train.shape[0])

Number of training instances:  4881


In [10]:
print("Number of testing instances: ", X_test.shape[0])

Number of testing instances:  1221


## Training the Model

In [11]:
# Generate the model
neighbors = 5
knn_model = KNeighborsClassifier(n_neighbors = neighbors)

# Train the model using the training sets
data = X_train
label = y_train

knn_model.fit(data, label)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

## Testing the Model

Now our model is ready. We will test our data against given labels.

In [12]:
#test set
X_test

array([[5.33333333e-01, 3.34790000e+04, 4.98000000e+02, 3.04000000e+02,
        1.34150000e+04, 3.00000000e+00],
       [1.42857143e-01, 1.83300000e+03, 5.90000000e+01, 1.33000000e+02,
        1.20000000e+02, 2.00000000e+00],
       [2.00000000e-01, 1.93700000e+03, 8.87000000e+02, 7.43000000e+02,
        5.10000000e+01, 1.00000000e+00],
       ...,
       [2.50000000e-01, 1.77000000e+02, 3.00000000e+00, 3.20000000e+01,
        8.15000000e+02, 0.00000000e+00],
       [8.33333333e-01, 3.23000000e+02, 1.14200000e+03, 4.96000000e+02,
        0.00000000e+00, 1.72000000e+02],
       [7.64705882e-01, 1.30000000e+01, 1.58000000e+02, 5.74000000e+02,
        0.00000000e+00, 2.00000000e+00]])

In [13]:
knn_model.predict([X_test[1]])    #testing for single instance

array([0])

In [14]:
'''
   Now, apply the model to the entire test set and predict the label for each test example

'''       
       
y_predict = []                       #to store prediction of each test example

for test_case in range(len(X_test)): 
    label = knn_model.predict([X_test[test_case]])
    
    #append to the predictions list
    y_predict.append(np.asscalar(label))

#predictions

In [15]:
y_predict

[0,
 0,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 1,
 0,


## Perormance evaluation of the Model

In [16]:
#true negatives is C(0,0), false negatives is C(1,0), false positives is C(0,1) and true positives is C(1,1) 
conf_matrix = confusion_matrix(y_test, y_predict)

In [17]:
#true_negative
TN = conf_matrix[0][0]
#false_negative
FN = conf_matrix[1][0]
#false_positive
FP = conf_matrix[0][1]
#true_positive
TP = conf_matrix[1][1]

In [18]:
# Recall is the ratio of the total number of correctly classified positive examples divided by the total number of positive examples. 
# High Recall indicates the class is correctly recognized (small number of FN)
recall = (TP)/(TP + FN)

In [19]:
# Precision is the the total number of correctly classified positive examples divided by the total number of predicted positive examples. 
# High Precision indicates an example labeled as positive is indeed positive (small number of FP)
precision = (TP)/(TP + FP)

In [20]:
fmeasure = (2*recall*precision)/(recall+precision)
accuracy = (TP + TN)/(TN + FN + FP + TP)
#accuracy_score(y_test, y_predict)

In [21]:
print("------ CLASSIFICATION PERFORMANCE OF THE NAIVE BAYES MODEL ------ "\
      "\n Recall : ", (recall*100) ,"%" \
      "\n Precision : ", (precision*100) ,"%" \
      "\n Accuracy : ", (accuracy*100) ,"%" \
      "\n F-measure : ", (fmeasure*100) ,"%" )



------ CLASSIFICATION PERFORMANCE OF THE NAIVE BAYES MODEL ------ 
 Recall :  94.58413926499033 %
 Precision :  95.88235294117648 %
 Accuracy :  95.98689598689599 %
 F-measure :  95.22882181110029 %
