In [1]:
import sklearn
import numpy as np
import pandas as pd

In [3]:
# Read the Emergent train and test splits from CSV; the "Unnamed: 0" column
# present in both files is dropped as part of the same expression.
train = pd.read_csv(
    '../../data/raw/Emergent_NAACL2016/emergent/url-versions-2015-06-14-clean-train.csv'
).drop(columns=["Unnamed: 0"])
test = pd.read_csv(
    '../../data/raw/Emergent_NAACL2016/emergent/url-versions-2015-06-14-clean-test.csv'
).drop(columns=["Unnamed: 0"])

In [5]:
# Read the feature tables stored as CSV files under data/processed/features.
# The names on the left are bound, in order, to the files listed on the right.
(
    align_feature,
    cosine_feature,
    bow_feature,
    qmark_feature,
    neg_alignment_feature,
    root_dist_feature,
    svo_feature,
) = [
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/processed/features/alignment_feature.csv',
        '../../data/processed/features/cosine_feature.csv',
        '../../data/processed/features/headline_BoW.csv',
        '../../data/processed/features/headline_Qmark.csv',
        '../../data/processed/features/neg_alignment_feature.csv',
        '../../data/processed/features/root_dist_min.csv',
        '../../data/processed/features/svo_Lexical.csv',
    )
]

In [6]:
# Integer code for each stance label, numbered in the order the labels are listed.
target_map = {
    label: code
    for code, label in enumerate(("for", "observing", "against"))
}

In [7]:
# Swap the stance labels in the training split for their codes from target_map.
train = train.replace(to_replace={"articleHeadlineStance": target_map})

In [8]:
# Swap the stance labels in the test split for their codes from target_map.
test = test.replace(to_replace={"articleHeadlineStance": target_map})

In [9]:
# Per-class row counts of the encoded stance column: (train counts, test counts).
tuple(split["articleHeadlineStance"].value_counts() for split in (train, test))

(0    992
 1    775
 2    304
 Name: articleHeadlineStance, dtype: int64,
 0    246
 1    187
 2     91
 Name: articleHeadlineStance, dtype: int64)

In [10]:
# Attach each feature table to the training rows, joining on articleId.
# The tables are merged one after another in the order listed.
train = (
    train
    .merge(align_feature, on="articleId")
    .merge(cosine_feature, on="articleId")
    .merge(bow_feature, on="articleId")
    .merge(qmark_feature, on="articleId")
    .merge(neg_alignment_feature, on="articleId")
    .merge(root_dist_feature, on="articleId")
    .merge(svo_feature, on="articleId")
)

In [11]:
# Attach each feature table to the test rows, joining on articleId.
# The tables are merged one after another in the order listed.
test = (
    test
    .merge(align_feature, on="articleId")
    .merge(cosine_feature, on="articleId")
    .merge(bow_feature, on="articleId")
    .merge(qmark_feature, on="articleId")
    .merge(neg_alignment_feature, on="articleId")
    .merge(root_dist_feature, on="articleId")
    .merge(svo_feature, on="articleId")
)

In [12]:
# Store the training stance column with an integer dtype.
train = train.astype({"articleHeadlineStance": "int"})

In [13]:
# Store the test stance column with an integer dtype.
test = test.astype({"articleHeadlineStance": "int"})

In [14]:
# Shuffle the rows of each split with a fixed random seed so the order is
# the same on every run. The two splits stay separate.
seed = 1234

train, test = (
    sklearn.utils.shuffle(split, random_state=seed) for split in (train, test)
)


In [15]:
# Build the model inputs (X) and the stance labels (Y) for both splits.
# NOTE(review): the `5:` slice presumably skips the non-feature columns that
# come from the raw CSV — confirm against the column order after the merges.

X_train = train.to_numpy()[:, 5:]
Y_train = train["articleHeadlineStance"].to_numpy().reshape(-1)

X_test = test.to_numpy()[:, 5:]
Y_test = test["articleHeadlineStance"].to_numpy().reshape(-1)

In [16]:
# Fix the dtypes handed to the estimator: float features, integer labels.
X_train = X_train.astype(float)
X_test = X_test.astype(float)
Y_train = Y_train.astype(int)
# Cast the test labels under the same upper-case name used for the training
# labels, so Y_test does not silently stay uncast.
Y_test = Y_test.astype(int)
# The evaluation cell refers to the test labels as `y_test`; keep that
# spelling as an alias of the cast array so both names agree.
y_test = Y_test

In [17]:
from sklearn.metrics import confusion_matrix

In [18]:
from sklearn.linear_model import LogisticRegression

# Hyper-parameters for the logistic-regression model.
# max_iter must be an integer: a float such as 10e2 (== 1000.0) fails
# parameter validation in recent scikit-learn releases.
max_iter = 1000
C=1
penalty='l1'

# Normal Train and Test
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases —
# confirm against the pinned version before upgrading.
model = LogisticRegression(penalty=penalty, C=C, max_iter = max_iter, multi_class='auto', solver="liblinear")
# Fit on the training split, then predict the stance code of every test row.
ypred = model.fit(X_train,Y_train).predict(X_test)

print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != ypred).sum()))
print("Accuracy on test set: "+str(model.score(X_test,y_test)))
# Last expression of the cell, so the matrix is shown as the cell output
# (rows: true labels, columns: predicted labels).
confusion_matrix(y_test, ypred)

Number of mislabeled points out of a total 524 points : 148
Accuracy on test set: 0.7175572519083969


array([[225,  20,   1],
       [ 79, 100,   8],
       [ 24,  16,  51]])