In [2]:
import sklearn
import numpy as np
import pandas as pd

In [3]:
# Raw Emergent train / test splits, one CSV each.
train = pd.read_csv(
    '../data/raw/Emergent_NAACL2016/emergent/url-versions-2015-06-14-clean-train.csv'
)
test = pd.read_csv(
    '../data/raw/Emergent_NAACL2016/emergent/url-versions-2015-06-14-clean-test.csv'
)

# Pre-computed headline feature tables; exact duplicate rows are dropped on load.
feat_Qmark = (
    pd.read_csv('../data/processed/features/headline_Qmark.csv')
    .drop_duplicates()
)

# NOTE(review): the variable is named "BoGF" but the file read is headline_BoWF.csv —
# confirm which spelling is intended before renaming anything.
feat_BoGF = (
    pd.read_csv('../data/processed/features/headline_BoWF.csv')
    .drop_duplicates()
)


In [4]:
# Integer code assigned to each stance label.
target_map = {
    "for": 0,
    "observing": 1,
    "against": 2,
}

In [5]:
# Swap the stance strings in the training split for their integer codes
# (the nested dict restricts the replacement to the stance column).
train_stance_codes = {"articleHeadlineStance": target_map}
train = train.replace(train_stance_codes)

In [6]:
# Swap the stance strings in the test split for their integer codes
# (the nested dict restricts the replacement to the stance column).
test_stance_codes = {"articleHeadlineStance": target_map}
test = test.replace(test_stance_codes)

In [7]:
# Rows per encoded stance class in the training split (before the feature merge).
train_stance = train["articleHeadlineStance"]
train_stance.value_counts()

0    992
1    775
2    304
Name: articleHeadlineStance, dtype: int64

In [8]:
def _attach_headline_features(frame):
    """Join both headline feature tables onto `frame` by articleId.

    The question-mark features are merged first, then the bag-of-words
    features; the leftover "Unnamed: 0" column is dropped from the result.
    """
    with_qmark = frame.merge(feat_Qmark, on="articleId")
    with_all = with_qmark.merge(feat_BoGF, on="articleId")
    return with_all.drop("Unnamed: 0", axis=1)


train = _attach_headline_features(train)
test = _attach_headline_features(test)

In [9]:
# Force the encoded stance column of the training frame to an integer dtype.
train_label_col = "articleHeadlineStance"
train[train_label_col] = train[train_label_col].astype("int")

In [10]:
# Force the encoded stance column of the test frame to an integer dtype.
test_label_col = "articleHeadlineStance"
test[test_label_col] = test[test_label_col].astype("int")

In [11]:
# Shuffle each split's rows independently, reusing one fixed seed so the
# resulting row order is reproducible.
seed = 1234

train, test = (
    sklearn.utils.shuffle(split, random_state=seed) for split in (train, test)
)


In [12]:
# Build feature matrices and label vectors from the merged frames.

def _features_and_labels(frame, label_col=2, first_feature_col=5):
    """Split a merged frame into (feature matrix, label vector) by column position.

    The frame is converted to NumPy exactly once. The original code called
    `to_numpy()` twice per split, materialising the full wide array twice.

    Parameters
    ----------
    frame : DataFrame to split.
    label_col : positional index of the label column.
        NOTE(review): presumably `articleHeadlineStance` — confirm against
        the CSV column order.
    first_feature_col : positional index of the first feature column; every
        column from here to the end is treated as a feature.

    Returns
    -------
    (X, Y) : `X` is the 2-D slice of feature columns, `Y` the 1-D label
        column. Both are views into the converted array and keep its dtype.
    """
    values = frame.to_numpy()
    # A single-column slice of a 2-D array is already 1-D, so no reshape is needed.
    return values[:, first_feature_col:], values[:, label_col]


X_train, Y_train = _features_and_labels(train)
X_test, Y_test = _features_and_labels(test)

In [13]:
# Training labels as a NumPy integer array.
Y_train = Y_train.astype(dtype=int)

In [14]:
# Training features as a NumPy float array.
X_train = X_train.astype(dtype=float)

In [15]:
# Test features as a NumPy float array.
X_test = X_test.astype(dtype=float)

In [16]:
# Cast the test labels to int, mirroring the Y_train cast.
# The original assigned the result to lowercase `y_test`, so `Y_test` itself
# was never converted and kept its pre-cast dtype.
Y_test = Y_test.astype(int)
# Alias kept so any code that already refers to the lowercase name still works.
y_test = Y_test

In [17]:
# Sanity check: print the dimensions of every feature matrix / label vector,
# in the order X_train, Y_train, X_test, Y_test.
for split_array in (X_train, Y_train, X_test, Y_test):
    print(split_array.shape)

(2071, 20552)
(2071,)
(524, 20552)
(524,)


In [18]:
from sklearn.model_selection import StratifiedKFold

In [19]:
# Rows per encoded stance class in the training frame after merging and shuffling.
merged_train_stance = train["articleHeadlineStance"]
merged_train_stance.value_counts()

0    992
1    775
2    304
Name: articleHeadlineStance, dtype: int64

In [20]:
# Share of each stance class in the training frame, as a percentage.
train["articleHeadlineStance"].value_counts(normalize=True).mul(100)

0    47.899565
1    37.421535
2    14.678899
Name: articleHeadlineStance, dtype: float64

In [21]:
# Share of each stance class in the test frame, as a percentage.
test['articleHeadlineStance'].value_counts(normalize=True).mul(100)

0    46.946565
1    35.687023
2    17.366412
Name: articleHeadlineStance, dtype: float64

In [22]:
# Five-fold stratified splitter; every other option is left at its default.
skf = StratifiedKFold(n_splits=5)

In [23]:
# Display how many folds the splitter will produce for the training data.
skf.get_n_splits(X=X_train, y=Y_train)

5

In [66]:
def _print_label_counts(heading, labels):
    """Print `heading`, then the (label, count) pairs for each label present in `labels`."""
    counts = np.bincount(labels)
    present = np.nonzero(counts)[0]
    print(heading)
    print(list(zip(present, counts[present])))


# One array per fold; each array is the fold's features with its labels
# appended as the last column.
train_folds = []
valid_folds = []
for train_index, test_index in skf.split(X_train, Y_train):
    print("TRAIN:", train_index.shape, "TEST:", test_index.shape)

    # `test_index` selects the held-out (validation) rows of this fold.
    x_train_fold, y_train_fold = X_train[train_index], Y_train[train_index]
    x_valid_fold, y_valid_fold = X_train[test_index], Y_train[test_index]

    _print_label_counts("train set distribution", y_train_fold)
    _print_label_counts("validation set distribution", y_valid_fold)
    print("----------------------------")

    # Turn the 1-D label vector into a single column so it can be joined to the features.
    train_folds.append(
        np.concatenate((x_train_fold, y_train_fold[:, None]), axis=1)
    )
    valid_folds.append(
        np.concatenate((x_valid_fold, y_valid_fold[:, None]), axis=1)
    )

TRAIN: (1656,) TEST: (415,)
train set distribution
[(0, 793), (1, 620), (2, 243)]
validation set distribution
[(0, 199), (1, 155), (2, 61)]
----------------------------
TRAIN: (1657,) TEST: (414,)
train set distribution
[(0, 794), (1, 620), (2, 243)]
validation set distribution
[(0, 198), (1, 155), (2, 61)]
----------------------------
TRAIN: (1657,) TEST: (414,)
train set distribution
[(0, 794), (1, 620), (2, 243)]
validation set distribution
[(0, 198), (1, 155), (2, 61)]
----------------------------
TRAIN: (1657,) TEST: (414,)
train set distribution
[(0, 794), (1, 620), (2, 243)]
validation set distribution
[(0, 198), (1, 155), (2, 61)]
----------------------------
TRAIN: (1657,) TEST: (414,)
train set distribution
[(0, 793), (1, 620), (2, 244)]
validation set distribution
[(0, 199), (1, 155), (2, 60)]
----------------------------


In [31]:
# Write every training fold into a single .npz archive, one array per fold.
train_folds_path = "../data/processed/train/train_folds.npz"
np.savez(train_folds_path, *train_folds)

In [42]:
# Write every validation fold into a single .npz archive, one array per fold.
valid_folds_path = "../data/processed/valid/valid_folds.npz"
np.savez(valid_folds_path, *valid_folds)

In [46]:
# Re-open the saved validation folds to check the round trip.
saved_valid_folds_path = '../data/processed/valid/valid_folds.npz'
folds = np.load(saved_valid_folds_path)

In [64]:
# Check the shape of every saved validation fold after splitting it back
# into features (all but the last column) and labels (the last column).
for k in folds:
    # Read each array from the archive once; the original indexed `folds[k]`
    # twice, and an .npz archive re-reads the array on every access.
    fold = folds[k]
    valid_x = fold[:, :-1]
    valid_y = fold[:, -1:]  # the `-1:` slice keeps the labels 2-D
    print(valid_x.shape)
    print(valid_y.shape)

(415, 20552)
(415, 1)
(414, 20552)
(414, 1)
(414, 20552)
(414, 1)
(414, 20552)
(414, 1)
(414, 20552)
(414, 1)
