In [346]:
import sklearn
import numpy as np
import pandas as pd

In [368]:
# Load the raw Emergent train/test splits and drop the "Unnamed: 0" column
# (presumably a saved row index — verify against the CSV).
train = pd.read_csv(
    '../data/raw/Emergent_NAACL2016/emergent/url-versions-2015-06-14-clean-train.csv'
).drop(columns=["Unnamed: 0"])
test = pd.read_csv(
    '../data/raw/Emergent_NAACL2016/emergent/url-versions-2015-06-14-clean-test.csv'
).drop(columns=["Unnamed: 0"])

In [348]:
# Load the precomputed feature tables (one CSV per feature family).
(
    align_feature,
    cosine_feature,
    bow_feature,
    qmark_feature,
    neg_alignment_feature,
    root_dist_feature,
    svo_feature,
) = (
    pd.read_csv(feature_path)
    for feature_path in (
        '../data/processed/features/alignment_feature.csv',
        '../data/processed/features/cosine_feature.csv',
        '../data/processed/features/headline_BoWBigram.csv',
        '../data/processed/features/headline_Qmark.csv',
        '../data/processed/features/neg_alignment_feature.csv',
        '../data/processed/features/root_dist_min.csv',
        '../data/processed/features/svo.csv',
    )
)

In [369]:
# Integer encoding used for the three stance labels.
target_map = {
    "for": 0,
    "observing": 1,
    "against": 2,
}

In [370]:
# Swap the stance strings in the training split for their integer codes.
train = train.replace(
    to_replace={"articleHeadlineStance": target_map},
)

In [371]:
# Swap the stance strings in the test split for their integer codes.
test = test.replace(
    to_replace={"articleHeadlineStance": target_map},
)

In [372]:
# Class balance of the encoded stance label in each split (train, then test).
(
    train["articleHeadlineStance"].value_counts(),
    test["articleHeadlineStance"].value_counts(),
)


(0    992
 1    775
 2    304
 Name: articleHeadlineStance, dtype: int64, 0    246
 1    187
 2     91
 Name: articleHeadlineStance, dtype: int64)

In [373]:
# Attach every feature table to the training split, joining on articleId.
for feature_table in (
    align_feature,
    cosine_feature,
    bow_feature,
    qmark_feature,
    neg_alignment_feature,
    root_dist_feature,
    svo_feature,
):
    train = pd.merge(train, feature_table, on="articleId")

In [374]:
# Attach every feature table to the test split, joining on articleId.
for feature_table in (
    align_feature,
    cosine_feature,
    bow_feature,
    qmark_feature,
    neg_alignment_feature,
    root_dist_feature,
    svo_feature,
):
    test = pd.merge(test, feature_table, on="articleId")

In [375]:
# Make sure the encoded training label column has an integer dtype.
train = train.astype({"articleHeadlineStance": "int"})

In [376]:
# Make sure the encoded test label column has an integer dtype.
test = test.astype({"articleHeadlineStance": "int"})

In [377]:
# Shuffle the rows of each split with a fixed seed (the splits stay separate).
seed = 1234

train, test = (
    sklearn.utils.shuffle(frame, random_state=seed) for frame in (train, test)
)


In [378]:
# Build the feature matrices and label vectors for both splits.
# Everything from column position 5 onward is taken as a feature; the leading
# columns are presumably the text/id/label fields — verify against the schema.
X_train = train.to_numpy()[:, 5:]
Y_train = train["articleHeadlineStance"].to_numpy().reshape(-1)

X_test = test.to_numpy()[:, 5:]
Y_test = test["articleHeadlineStance"].to_numpy().reshape(-1)

In [379]:
# Cast features to float and labels to int.
# Note the lowercase name: the integer test labels are stored in `y_test`.
X_train, X_test = X_train.astype(float), X_test.astype(float)
Y_train, y_test = Y_train.astype(int), Y_test.astype(int)

In [381]:
# Sanity check: (rows, feature columns) of the training matrix.
X_train.shape

(2071, 20560)

In [253]:
from sklearn import metrics

# Score every feature column against the training labels with mutual_info_score.
# NOTE(review): mutual_info_score treats each distinct value as a discrete
# label — confirm that is intended for any continuous-valued feature columns.
Xt = X_train.T
labels = list(Y_train)
mi_metric = [
    metrics.mutual_info_score(list(feature_column), labels)
    for feature_column in Xt
]
        

In [254]:
# Keep only the feature columns whose mutual-information score exceeds 0.001,
# and restrict both design matrices to those columns.
indices = [column for column, score in enumerate(mi_metric) if score > 0.001]
data_X_train = X_train[:, indices]
data_X_test = X_test[:, indices]

In [255]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix

In [256]:
# Gaussian naive Bayes classifier, constructed with default arguments.
gnb = GaussianNB()

In [257]:
# Train on the mutual-information-selected columns, then predict the test split.
gnb.fit(data_X_train, Y_train)
y_pred = gnb.predict(data_X_test)

In [258]:
# Report the error count, accuracy and confusion matrix for the GaussianNB model.
n_test_rows = X_test.shape[0]
n_mislabeled = (y_test != y_pred).sum()
print("Number of mislabeled points out of a total %d points : %d" % (n_test_rows, n_mislabeled))
gnb_accuracy = gnb.score(data_X_test, y_test)
print("Accuracy on test set: " + str(gnb_accuracy))
confusion_matrix(y_test, y_pred)

Number of mislabeled points out of a total 524 points : 301
Accuracy on test set: 0.4255725190839695


array([[ 77, 169,   0],
       [ 41, 146,   0],
       [ 35,  56,   0]], dtype=int64)

In [259]:
# Multinomial naive Bayes on the selected features.
# The numeric features have to be removed for multinomial naive Bayes, so the
# feature slice starts at column 7 instead of 5 (two leading feature columns dropped).
# NOTE: this cell rebinds X_train / X_test, so cells run afterwards see the
# reduced matrices.
mnb = MultinomialNB()
n_dropped = 2  # 7 - 5: leading feature columns removed relative to the earlier slice
X_train = train.to_numpy()[:,7:]
Y_train = train["articleHeadlineStance"].values.reshape((-1,))
X_test = test.to_numpy()[:,7:]
Y_test = test["articleHeadlineStance"].values.reshape((-1,))
Y_train = Y_train.astype(int)
X_train = X_train.astype(float)
X_test = X_test.astype(float)
y_test = Y_test.astype(int)
# `indices` was computed against the matrix that starts at column 5, so it is
# offset by two relative to the matrices built here. Re-base it (discarding the
# entries that pointed at the two dropped columns) so the same features are
# selected; applying it unchanged picks shifted columns and can index past the end.
mnb_indices = [i - n_dropped for i in indices if i >= n_dropped]
data_X_train = X_train[:, mnb_indices]
data_X_test = X_test[:, mnb_indices]
y_pred = mnb.fit(data_X_train, Y_train).predict(data_X_test)

In [260]:
# Report the error count and confusion matrix for the MultinomialNB predictions.
n_test_rows = X_test.shape[0]
n_mislabeled = (y_test != y_pred).sum()
print("Number of mislabeled points out of a total %d points : %d" % (n_test_rows, n_mislabeled))
confusion_matrix(y_test, y_pred)

Number of mislabeled points out of a total 524 points : 278


array([[246,   0,   0],
       [187,   0,   0],
       [ 91,   0,   0]], dtype=int64)

In [382]:
# L1-regularised logistic regression: fit on X_train, evaluate on the test split.
# max_iter must be an integer; the float literal 10e2 (== 1000.0) is rejected by
# scikit-learn's parameter validation in newer releases.
max_iter = 1000
C=1
penalty='l1'

# Normal Train and Test

from sklearn.linear_model import LogisticRegression
model = LogisticRegression(penalty=penalty, C=C, max_iter = max_iter, multi_class='auto', solver="liblinear")
ypred = model.fit(X_train,Y_train).predict(X_test)

print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != ypred).sum()))
print("Accuracy on test set: "+str(model.score(X_test,y_test)))
confusion_matrix(y_test, ypred)

Number of mislabeled points out of a total 524 points : 149
Accuracy on test set: 0.7156488549618321


array([[223,  20,   3],
       [ 74, 105,   8],
       [ 25,  19,  47]], dtype=int64)

In [262]:
from sklearn.svm import SVC

In [263]:
# Support-vector classifier with a small regularisation constant and gamma='auto'.
clf = SVC(
    C=0.003,
    gamma='auto',
)

In [264]:
# Fit the SVC on the current X_train / Y_train; the fitted estimator is the cell output.
clf.fit(X_train, Y_train)


SVC(C=0.003, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [265]:
# Fit the SVC, predict the test split, then report errors, accuracy and confusion matrix.
clf.fit(X_train, Y_train)
ypred = clf.predict(X_test)
n_test_rows = X_test.shape[0]
n_mislabeled = (y_test != ypred).sum()
print("Number of mislabeled points out of a total %d points : %d" % (n_test_rows, n_mislabeled))
svc_accuracy = clf.score(X_test, y_test)
print("Accuracy on test set: " + str(svc_accuracy))
confusion_matrix(y_test, ypred)

Number of mislabeled points out of a total 524 points : 278
Accuracy on test set: 0.46946564885496184


array([[246,   0,   0],
       [187,   0,   0],
       [ 91,   0,   0]], dtype=int64)

In [266]:
# Inspect the current shape of the training matrix.
X_train.shape

(2071, 3)

In [267]:
# Inspect the current shape of the test matrix.
X_test.shape

(524, 3)

In [268]:
# Display the current contents of the training matrix.
X_train

array([[2., 0., 0.],
       [2., 0., 0.],
       [2., 0., 0.],
       ...,
       [2., 0., 0.],
       [2., 0., 0.],
       [2., 0., 0.]])

In [383]:
from sklearn.model_selection import StratifiedKFold

In [384]:
# Five-way stratified splitter used for the cross-validation folds.
skf = StratifiedKFold(n_splits=5)

In [385]:
# Confirm how many splits the splitter reports.
skf.get_n_splits(X_train, Y_train)

5

In [386]:
# Materialise the stratified folds produced by `skf`. Each stored fold is one
# array whose last column is the label and whose other columns are the features.
# The class distribution of every fold is printed along the way.
train_folds, valid_folds = [], []
for fit_idx, held_idx in skf.split(X_train, Y_train):
    print("TRAIN:", fit_idx.shape, "TEST:", held_idx.shape)
    fold_parts = (
        ("train set distribution", X_train[fit_idx], Y_train[fit_idx], train_folds),
        ("validation set distribution", X_train[held_idx], Y_train[held_idx], valid_folds),
    )
    for title, fold_x, fold_y, bucket in fold_parts:
        # Count rows per class and list only the classes that actually occur.
        counts = np.bincount(fold_y)
        present = np.nonzero(counts)[0]
        print(title)
        print(list(zip(present, counts[present])))
        # Append the labels as a trailing column.
        bucket.append(np.concatenate((fold_x, fold_y.reshape(-1, 1)), axis=1))
    print("----------------------------")

TRAIN: (1656,) TEST: (415,)
train set distribution
[(0, 793), (1, 620), (2, 243)]
validation set distribution
[(0, 199), (1, 155), (2, 61)]
----------------------------
TRAIN: (1656,) TEST: (415,)
train set distribution
[(0, 793), (1, 620), (2, 243)]
validation set distribution
[(0, 199), (1, 155), (2, 61)]
----------------------------
TRAIN: (1657,) TEST: (414,)
train set distribution
[(0, 794), (1, 620), (2, 243)]
validation set distribution
[(0, 198), (1, 155), (2, 61)]
----------------------------
TRAIN: (1657,) TEST: (414,)
train set distribution
[(0, 794), (1, 620), (2, 243)]
validation set distribution
[(0, 198), (1, 155), (2, 61)]
----------------------------
TRAIN: (1658,) TEST: (413,)
train set distribution
[(0, 794), (1, 620), (2, 244)]
validation set distribution
[(0, 198), (1, 155), (2, 60)]
----------------------------


In [387]:
# Refit the logistic regression on each training fold. For every fold, print the
# number of errors on that fold's validation part and the accuracy on the
# held-out test split; the test accuracies are collected in `running_acc`.
# max_iter is cast to int because scikit-learn requires an integer there and the
# value may have been defined as a float (e.g. 10e2).
model = LogisticRegression(penalty=penalty, C=C, max_iter=int(max_iter), multi_class='auto', solver="liblinear")
running_acc=[]
for k,l in zip(train_folds,valid_folds):
    # The label is the last column of each fold array; stacking it with the
    # float features made it float, so cast it back to int.
    fold_y_train = k[:, -1].astype(int)
    fold_y_valid = l[:, -1].astype(int)
    ypred = model.fit(k[:,:-1], fold_y_train).predict(l[:,:-1])
    print("Number of mislabeled points out of a total %d points : %d" % (l[:,:-1].shape[0], (fold_y_valid != ypred).sum()))
    # Score once and reuse the value for both the printout and the running list.
    test_acc = model.score(X_test,y_test)
    print("Accuracy on test set: "+str(test_acc))
    running_acc.append(test_acc)

Number of mislabeled points out of a total 415 points : 100
Accuracy on test set: 0.6946564885496184
Number of mislabeled points out of a total 415 points : 100
Accuracy on test set: 0.6927480916030534
Number of mislabeled points out of a total 414 points : 86
Accuracy on test set: 0.7099236641221374
Number of mislabeled points out of a total 414 points : 94
Accuracy on test set: 0.7061068702290076
Number of mislabeled points out of a total 413 points : 109
Accuracy on test set: 0.7290076335877863


In [345]:
# Mean of the accuracies collected in running_acc.
sum(running_acc)/len(running_acc)

0.4694656488549619

In [367]:
# Mean of the accuracies collected in running_acc (same expression as the previous cell).
sum(running_acc)/len(running_acc)

0.4694656488549619