In [1]:
import sklearn
import numpy as np
import pandas as pd

In [2]:
#import data
train = pd.read_csv('../data/raw/Emergent_NAACL2016/emergent/url-versions-2015-06-14-clean-train.csv')
train.drop(columns=["Unnamed: 0"], inplace=True)
test = pd.read_csv('../data/raw/Emergent_NAACL2016/emergent/url-versions-2015-06-14-clean-test.csv')
test.drop(columns=["Unnamed: 0"], inplace=True)

In [3]:
#create features
align_feature = pd.read_csv('../data/processed/features/alignment_feature.csv')
cosine_feature = pd.read_csv('../data/processed/features/cosine_feature.csv')
bow_feature = pd.read_csv('../data/processed/features/headline_BoWBigram.csv')
qmark_feature = pd.read_csv('../data/processed/features/headline_Qmark.csv')
neg_alignment_feature = pd.read_csv('../data/processed/features/neg_alignment_feature.csv')
root_dist_feature = pd.read_csv('../data/processed/features/root_dist_min.csv')
svo_feature = pd.read_csv('../data/processed/features/svo_Lexical.csv')

In [4]:
target_map = {"for": 0, "observing": 1, "against": 2}

In [5]:
train = train.replace({"articleHeadlineStance": target_map})

In [6]:
test = test.replace({"articleHeadlineStance": target_map})

In [7]:
train["articleHeadlineStance"].value_counts(),test["articleHeadlineStance"].value_counts()

(0    992
 1    775
 2    304
 Name: articleHeadlineStance, dtype: int64, 0    246
 1    187
 2     91
 Name: articleHeadlineStance, dtype: int64)

In [8]:
train = pd.merge(train, align_feature, on="articleId")
train = pd.merge(train, cosine_feature, on="articleId")
train = pd.merge(train, bow_feature, on="articleId")
train = pd.merge(train, qmark_feature, on="articleId")
train = pd.merge(train, neg_alignment_feature, on="articleId")
train = pd.merge(train, root_dist_feature, on="articleId")
train = pd.merge(train, svo_feature, on="articleId")

In [9]:
test = pd.merge(test, align_feature, on="articleId")
test = pd.merge(test, cosine_feature, on="articleId")
test = pd.merge(test, bow_feature, on="articleId")
test = pd.merge(test, qmark_feature, on="articleId")
test = pd.merge(test, neg_alignment_feature, on="articleId")
test = pd.merge(test, root_dist_feature, on="articleId")
test = pd.merge(test, svo_feature, on="articleId")

In [10]:
train["articleHeadlineStance"] = train["articleHeadlineStance"].astype("int")

In [11]:
test["articleHeadlineStance"] = test["articleHeadlineStance"].astype("int")

In [12]:
# merge datasets and shuuffle with random seed
seed = 1234

train = sklearn.utils.shuffle(train, random_state=seed)
test = sklearn.utils.shuffle(test, random_state=seed)


In [13]:
# create train and test

X_train = train.to_numpy()[:,5:]
Y_train = train["articleHeadlineStance"].values.reshape((-1,))

X_test = test.to_numpy()[:,5:]
Y_test = test["articleHeadlineStance"].values.reshape((-1,))

In [14]:
Y_train = Y_train.astype(int)
X_train = X_train.astype(float)
X_test = X_test.astype(float)
y_test = Y_test.astype(int)

In [17]:
from sklearn import metrics
Xt=X_train.T
mi_metric=[]
for i in range(Xt.shape[0]):
    mi_metric.append(metrics.mutual_info_score(list(Xt[i]),list(Y_train)))
        

In [18]:
indices=[]
for i in enumerate(mi_metric):
    if i[1]>0.001:
        indices.append(i[0])
data_X_train = X_train[:, indices]
data_X_test = X_test[:, indices]  

In [15]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix

In [16]:
gnb = GaussianNB()

In [19]:
y_pred = gnb.fit(X_train, Y_train).predict(X_test)
#y_pred = gnb.fit(data_X_train, Y_train).predict(data_X_test)

In [21]:
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))
print("Accuracy on test set: "+str(gnb.score(X_test,y_test)*100))
confusion_matrix(y_test, y_pred)

Number of mislabeled points out of a total 524 points : 273
Accuracy on test set: 47.900763358778626


array([[127, 102,  17],
       [ 81,  89,  17],
       [ 21,  35,  35]], dtype=int64)

In [22]:
# need to remove the numeric features to do multi-nomial naive bayes.
mnb = MultinomialNB()
X_train = train.to_numpy()[:,7:]
Y_train = train["articleHeadlineStance"].values.reshape((-1,))
X_test = test.to_numpy()[:,7:]
Y_test = test["articleHeadlineStance"].values.reshape((-1,))
Y_train = Y_train.astype(int)
X_train = X_train.astype(float)
X_test = X_test.astype(float)
y_test = Y_test.astype(int)


y_pred = mnb.fit(X_train, Y_train).predict(X_test)
#y_pred = mnb.fit(data_X_train, Y_train).predict(data_X_test)

In [23]:
print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))
print("Accuracy on test set: "+str(100*(X_test.shape[0]-(y_test != y_pred).sum())/X_test.shape[0]))
confusion_matrix(y_test, y_pred)

Number of mislabeled points out of a total 524 points : 213
Accuracy on test set: 59.35114503816794


array([[135,  64,  47],
       [ 33, 114,  40],
       [ 11,  18,  62]], dtype=int64)

In [24]:
max_iter = 10e2
C=1
penalty='l1'

# Normal Train and Test

from sklearn.linear_model import LogisticRegression
model = LogisticRegression(penalty=penalty, C=C, max_iter = max_iter, multi_class='auto', solver="liblinear")
ypred = model.fit(X_train,Y_train).predict(X_test)

print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != ypred).sum()))
print("Accuracy on test set: "+str(model.score(X_test,y_test)))
confusion_matrix(y_test, ypred)

Number of mislabeled points out of a total 524 points : 150
Accuracy on test set: 0.7137404580152672


array([[223,  22,   1],
       [ 78, 101,   8],
       [ 25,  16,  50]], dtype=int64)

In [40]:
from sklearn.model_selection import StratifiedKFold

In [47]:
skf = StratifiedKFold(n_splits=10, )

In [48]:
skf.get_n_splits(X_train, Y_train)

10

In [49]:
train_folds = []
valid_folds = []
for i, (train_index, test_index) in enumerate(skf.split(X_train, Y_train)):
    print("TRAIN:", train_index.shape, "TEST:", test_index.shape)
    x_train_fold, x_valid_fold = X_train[train_index], X_train[test_index]
    y_train_fold, y_valid_fold = Y_train[train_index], Y_train[test_index]
    bc = np.bincount(y_train_fold)
    ii = np.nonzero(bc)[0]
    print("train set distribution")
    print(list(zip(ii,bc[ii])))
    bc = np.bincount(y_valid_fold)
    ii = np.nonzero(bc)[0]
    print("validation set distribution")
    print(list(zip(ii,bc[ii])))
    print("----------------------------")
    train_fold = np.concatenate((x_train_fold, y_train_fold.reshape(-1, 1)), axis=1)
    valid_fold = np.concatenate((x_valid_fold, y_valid_fold.reshape(-1, 1)), axis=1)
    train_folds.append(train_fold)
    valid_folds.append(valid_fold)

TRAIN: (1862,) TEST: (209,)
train set distribution
[(0, 892), (1, 697), (2, 273)]
validation set distribution
[(0, 100), (1, 78), (2, 31)]
----------------------------
TRAIN: (1862,) TEST: (209,)
train set distribution
[(0, 892), (1, 697), (2, 273)]
validation set distribution
[(0, 100), (1, 78), (2, 31)]
----------------------------
TRAIN: (1863,) TEST: (208,)
train set distribution
[(0, 893), (1, 697), (2, 273)]
validation set distribution
[(0, 99), (1, 78), (2, 31)]
----------------------------
TRAIN: (1863,) TEST: (208,)
train set distribution
[(0, 893), (1, 697), (2, 273)]
validation set distribution
[(0, 99), (1, 78), (2, 31)]
----------------------------
TRAIN: (1864,) TEST: (207,)
train set distribution
[(0, 893), (1, 697), (2, 274)]
validation set distribution
[(0, 99), (1, 78), (2, 30)]
----------------------------
TRAIN: (1865,) TEST: (206,)
train set distribution
[(0, 893), (1, 698), (2, 274)]
validation set distribution
[(0, 99), (1, 77), (2, 30)]
-------------------------

In [50]:
model = LogisticRegression(penalty=penalty, C=C, max_iter = max_iter, multi_class='auto', solver="liblinear")
running_acc=[]
for k,l in zip(train_folds,valid_folds):
    ypred = model.fit(k[:,:-1],np.ravel(k[:,-1:])).predict((l[:,:-1]))
    print("Number of mislabeled points out of a total %d points : %d" % (l[:,:-1].shape[0], (np.ravel(l[:,-1:]) != ypred).sum()))
    print("Accuracy on test set: "+str(model.score(X_test,y_test)))
    running_acc.append(model.score(X_test,y_test))

Number of mislabeled points out of a total 209 points : 60
Accuracy on test set: 0.6755725190839694
Number of mislabeled points out of a total 209 points : 60
Accuracy on test set: 0.6946564885496184
Number of mislabeled points out of a total 208 points : 62
Accuracy on test set: 0.6812977099236641
Number of mislabeled points out of a total 208 points : 56
Accuracy on test set: 0.6851145038167938
Number of mislabeled points out of a total 207 points : 55
Accuracy on test set: 0.6851145038167938
Number of mislabeled points out of a total 206 points : 60
Accuracy on test set: 0.6889312977099237
Number of mislabeled points out of a total 206 points : 64
Accuracy on test set: 0.6793893129770993
Number of mislabeled points out of a total 206 points : 57
Accuracy on test set: 0.6851145038167938
Number of mislabeled points out of a total 206 points : 64
Accuracy on test set: 0.6812977099236641
Number of mislabeled points out of a total 206 points : 63
Accuracy on test set: 0.683206106870229


In [51]:
sum(running_acc)/len(running_acc)

0.683969465648855