In [1]:
# TODO: hyperparameter optimization, testing on different files

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import svm, tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score

In [3]:
def read_csv_file(filename):
    """Load a CSV file into a pandas DataFrame.

    Parameters
    ----------
    filename : str, path-like or file-like
        Passed unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed table, using ``pd.read_csv`` defaults.
    """
    return pd.read_csv(filename)

In [4]:
def convert_label_to_numeric(label):
    """Encode string status labels as integer class ids.

    Mapping: ``"ONLY_ONE"`` -> 1, ``"NEUTRAL"`` -> 0, any other value -> 2.

    Parameters
    ----------
    label : iterable
        Labels in order (list, numpy array, pandas Series, ...).

    Returns
    -------
    numpy.ndarray
        Integer array with one code per input label, in input order.
    """
    # Iterate over the values instead of indexing with label[i]: on a pandas
    # Series, label[i] is a label-based lookup, which fails (or silently
    # reorders) as soon as the index is not the default 0..n-1 range.
    codes = [
        1 if value == "ONLY_ONE" else 0 if value == "NEUTRAL" else 2
        for value in label
    ]
    return np.array(codes, dtype='int')

In [5]:
def apply_PCA(train, test):
    """Standardize the features, then project them onto principal components.

    The scaler and the PCA are both fitted on ``train`` only and then applied
    to ``train`` and ``test``. The number of retained components and the
    shapes before/after projection are printed.

    Parameters
    ----------
    train, test : array-like of features (rows are samples).

    Returns
    -------
    tuple
        ``(train, test)`` after scaling and PCA projection.
    """
    # PCA is sensitive to feature scale, so standardize first. The scaling
    # statistics come from the training split only.
    scaler = StandardScaler().fit(train)
    train_scaled = scaler.transform(train)
    test_scaled = scaler.transform(test)

    # A float n_components keeps the fewest components that retain 95% of
    # the variance; the fit again uses the training split only.
    pca = PCA(n_components=.95)
    pca.fit(train_scaled)
    print ("Number of selected components: ", pca.n_components_)

    print("Before applying PCA train set size: ", train_scaled.shape)
    print("Before applying PCA test set size: ", test_scaled.shape)
    train_reduced = pca.transform(train_scaled)
    test_reduced = pca.transform(test_scaled)
    print("After applying PCA train set size: ", train_reduced.shape)
    print("After applying PCA test set size: ", test_reduced.shape)
    return train_reduced, test_reduced

In [6]:
def measure_performance(true_label, predicted_label):
    """Compute macro-averaged recall, precision and F1 for a set of predictions.

    Parameters
    ----------
    true_label : array-like of ground-truth class ids.
    predicted_label : array-like of predicted class ids, same length.

    Returns
    -------
    tuple
        ``(recall, precision, f1)``, each macro-averaged over the classes.
    """
    # The previous classification_report() call was dropped: its result was
    # never used, and it only repeated the metric computation (and the
    # associated undefined-metric warnings).
    recall = recall_score(true_label, predicted_label, average="macro")
    precision = precision_score(true_label, predicted_label, average="macro")
    f1 = f1_score(true_label, predicted_label, average="macro")
    return recall, precision, f1

In [7]:
# Load the raw dataset (path is relative to the notebook's working directory).
data = pd.read_csv('..//updated_data//RAW_DATASETS//COLOCATED_MOZILLA.csv')
print("Initial data shape: ", data.shape)

# Target variable: COLOCATED_STATUS encoded as integer class ids.
true_label = convert_label_to_numeric(data['COLOCATED_STATUS'])

# Drop the file path and the status columns (including the target itself);
# whatever remains is what gets fed to the models.
non_feature_columns = ['FILE_PATH', 'ICP_STATUS', 'COLOCATED_STATUS', 'SAME_DIFF_STATUS']
data = data.drop(columns=non_feature_columns)

Initial data shape:  (1613, 16)


In [8]:
# 10-fold cross-validation, shuffled with a fixed seed so the folds are reproducible.
kf = KFold(n_splits=10, shuffle = True, random_state = 7)

# Per-fold scores for each classifier (names are used by the reporting cell).
cv_recall_DT, cv_precision_DT, cv_f1_DT = [], [], []
cv_recall_KNN, cv_precision_KNN, cv_f1_KNN = [], [], []
cv_recall_SVM, cv_precision_SVM, cv_f1_SVM = [], [], []
cv_recall_NB, cv_precision_NB, cv_f1_NB = [], [], []
cv_recall_RF, cv_precision_RF, cv_f1_RF = [], [], []

# One row per classifier, in evaluation order:
# (factory building a fresh estimator, recall list, precision list, f1 list).
# A factory is used so that every fold trains a brand-new model.
evaluations = [
    # random_state pins the tree's tie-breaking so reruns give the same scores.
    (lambda: tree.DecisionTreeClassifier(random_state=0),
     cv_recall_DT, cv_precision_DT, cv_f1_DT),
    (lambda: KNeighborsClassifier(n_neighbors=3),
     cv_recall_KNN, cv_precision_KNN, cv_f1_KNN),
    (lambda: svm.SVC(gamma='scale'),
     cv_recall_SVM, cv_precision_SVM, cv_f1_SVM),
    (lambda: GaussianNB(),
     cv_recall_NB, cv_precision_NB, cv_f1_NB),
    (lambda: RandomForestClassifier(n_estimators=100, max_depth=2, random_state=0),
     cv_recall_RF, cv_precision_RF, cv_f1_RF),
]

for train_index, test_index in kf.split(data):
    # KFold yields positional row numbers, so rows must be selected with
    # .iloc; .loc would look them up as index labels and only works by
    # accident while the frame still has its default 0..n-1 index.
    train, test = data.iloc[train_index], data.iloc[test_index]
    train_label, test_label = true_label[train_index], true_label[test_index]

    # Scaling and PCA are fitted on the training fold only (see apply_PCA).
    train, test = apply_PCA(train, test)

    for make_clf, recalls, precisions, f1s in evaluations:
        clf = make_clf()
        clf.fit(train, train_label)
        predicted_label = clf.predict(test)
        recall, precision, f1 = measure_performance(test_label, predicted_label)
        recalls.append(recall)
        precisions.append(precision)
        f1s.append(f1)


Number of selected components:  8
Before applying PCA train set size:  (1451, 12)
Before applying PCA test set size:  (162, 12)
After applying PCA train set size:  (1451, 8)
After applying PCA test set size:  (162, 8)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Number of selected components:  8
Before applying PCA train set size:  (1451, 12)
Before applying PCA test set size:  (162, 12)
After applying PCA train set size:  (1451, 8)
After applying PCA test set size:  (162, 8)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Number of selected components:  8
Before applying PCA train set size:  (1451, 12)
Before applying PCA test set size:  (162, 12)
After applying PCA train set size:  (1451, 8)
After applying PCA test set size:  (162, 8)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Number of selected components:  8
Before applying PCA train set size:  (1452, 12)
Before applying PCA test set size:  (161, 12)
After applying PCA train set size:  (1452, 8)
After applying PCA test set size:  (161, 8)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Number of selected components:  8
Before applying PCA train set size:  (1452, 12)
Before applying PCA test set size:  (161, 12)
After applying PCA train set size:  (1452, 8)
After applying PCA test set size:  (161, 8)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Number of selected components:  8
Before applying PCA train set size:  (1452, 12)
Before applying PCA test set size:  (161, 12)
After applying PCA train set size:  (1452, 8)
After applying PCA test set size:  (161, 8)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Number of selected components:  8
Before applying PCA train set size:  (1452, 12)
Before applying PCA test set size:  (161, 12)
After applying PCA train set size:  (1452, 8)
After applying PCA test set size:  (161, 8)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Number of selected components:  8
Before applying PCA train set size:  (1452, 12)
Before applying PCA test set size:  (161, 12)
After applying PCA train set size:  (1452, 8)
After applying PCA test set size:  (161, 8)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Number of selected components:  8
Before applying PCA train set size:  (1452, 12)
Before applying PCA test set size:  (161, 12)
After applying PCA train set size:  (1452, 8)
After applying PCA test set size:  (161, 8)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Number of selected components:  8
Before applying PCA train set size:  (1452, 12)
Before applying PCA test set size:  (161, 12)
After applying PCA train set size:  (1452, 8)
After applying PCA test set size:  (161, 8)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
# Report the mean of each per-fold metric, one section per classifier.
summary = (
    ("-------DT-------", cv_recall_DT, cv_precision_DT, cv_f1_DT),
    ("-------KNN-------", cv_recall_KNN, cv_precision_KNN, cv_f1_KNN),
    ("-------SVM-------", cv_recall_SVM, cv_precision_SVM, cv_f1_SVM),
    ("-------NB-------", cv_recall_NB, cv_precision_NB, cv_f1_NB),
    ("-------RF-------", cv_recall_RF, cv_precision_RF, cv_f1_RF),
)

for header, recalls, precisions, f1s in summary:
    print(header)
    print("Recall:", np.mean(recalls))
    print("Precision:", np.mean(precisions))
    print("f1 score:", np.mean(f1s))

-------DT-------
Recall: 0.8973352641070573
Precision: 0.8485874885534228
f1 score: 0.8627105337182703
-------KNN-------
Recall: 0.5389528992742052
Precision: 0.5963908864225037
f1 score: 0.5576133249710711
-------SVM-------
Recall: 0.4554005229336826
Precision: 0.7268604675225987
f1 score: 0.49298589022426204
-------NB-------
Recall: 0.4417971611795387
Precision: 0.4819069337609035
f1 score: 0.4465596739043945
-------RF-------
Recall: 0.34305555555555556
Precision: 0.37426550679062986
f1 score: 0.31829863469749864
