In [16]:
import pandas as pd
from sklearn.neighbors import NearestCentroid
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn.metrics import confusion_matrix, accuracy_score

In [17]:
# Read the Twitter account CSV (decoded as latin-1) and preview the first rows.
dataset = pd.read_csv(
    'data/twitter_dataset.csv',
    encoding='latin-1',
)
dataset.head()

Unnamed: 0,name_wt,statuses_count,followers_count,friends_count,favourites_count,listed_count,label
0,0.857143,1211,24,63,6,6,0
1,0.25,313,101,173,48,0,0
2,0.411765,247,51,382,50,4,0
3,0.611111,41,3,36,5,0,0
4,0.6,12341,770,1417,0,8,1


In [18]:
# Every column except the target column 'label' is used as a model input.
features = [column for column in dataset.columns if column != 'label']
features

['name_wt',
 'statuses_count',
 'followers_count',
 'friends_count',
 'favourites_count',
 'listed_count']

In [19]:
# Separate the model inputs (X) from the target column (y).
X = dataset.loc[:, features]  # feature matrix
y = dataset['label']          # target variable

In [38]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [39]:
# Fit a nearest-centroid classifier on the training split.
# NOTE(review): the features are on very different scales (name_wt is a
# fraction, the count columns run into the thousands) — consider
# standardising them before using a distance-based classifier.
clf = NearestCentroid(shrink_threshold=0.2).fit(X_train, y_train)

# Predicted labels for the held-out test rows.
y_predict = clf.predict(X_test)

## Performance evaluation of the Model


In [40]:
# Rows are the true labels and columns the predicted labels, so for 0/1 labels:
#   C[0, 0] = true negatives,  C[0, 1] = false positives,
#   C[1, 0] = false negatives, C[1, 1] = true positives.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_predict)

In [41]:
# Unpack the 2x2 confusion matrix (row = true label, column = predicted label).
# Row 0: true negatives, false positives.
TN, FP = conf_matrix[0, 0], conf_matrix[0, 1]
# Row 1: false negatives, true positives.
FN, TP = conf_matrix[1, 0], conf_matrix[1, 1]

In [42]:
# Recall: the fraction of actual positive examples that were classified as positive.
# A high recall means few positives were missed (small number of FN).
recall = TP / (TP + FN)

In [43]:
# Precision: the fraction of examples predicted positive that really are positive.
# A high precision means few negatives were wrongly flagged (small number of FP).
precision = TP / (TP + FP)

In [44]:
# F-measure: harmonic mean of recall and precision.
fmeasure = 2 * recall * precision / (recall + precision)
# Accuracy: correctly classified examples over all test examples.
# This should match sklearn's accuracy_score(y_test, y_predict).
accuracy = (TN + TP) / (TP + FP + FN + TN)

In [45]:
# Report each metric as a percentage. print() joins the arguments with single
# spaces, so the text pieces are split exactly where the numbers go.
print(
    "------ CLASSIFICATION PERFORMANCE OF NEAREST-CENTROID MODEL ------ \n Recall : ",
    recall * 100,
    "%\n Precision : ",
    precision * 100,
    "%\n Accuracy : ",
    accuracy * 100,
    "%\n F-measure : ",
    fmeasure * 100,
    "%",
)


------ CLASSIFICATION PERFORMANCE OF K-NEAREST-NEIGHBORS MODEL ------ 
 Recall :  93.49282296650718 %
 Precision :  61.21553884711779 %
 Accuracy :  67.03454894433781 %
 F-measure :  73.9871260886028 %
