In [1]:
import sklearn as sk
import numpy as np
import pandas as pd
import os

from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords, wordnet

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler

from preprocessing import preprocess, create_dataframe_for_training
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Bidirectional, Dropout, Concatenate, concatenate, Input, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

seed = 123
np.random.seed(seed)

PREPROCESSING = False
REMOVE_STOP_WORDS = False
THRESHOLD_INFREQUENT_WORDS = 0.01

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  from ._conv import register_converters as _register_converters


In [2]:
PROJECT_DIR = os.getcwd()
DATA_DIR = os.path.join(PROJECT_DIR, 'data')
PREPROC_FILEPATH = os.path.join(DATA_DIR, 'preprocessed_training_dataframe.pkl')
DATA_FILEPATH = os.path.join(DATA_DIR, 'metadata_articles_dataframe.pkl')

data = pd.read_pickle(DATA_FILEPATH)
#preproc = pd.read_pickle(PREPROC_FILEPATH)
#data = data[:1000]

def generate_feature_matrix(X):
    
    #X = create_dataframe_for_training(X)

    vectorizer = CountVectorizer(min_df=0.01)
    X_article_fe = vectorizer.fit_transform(X["article_content"])
    X_claim_fe = vectorizer.transform(X["claim"])
    return X_claim_fe.toarray(), X_article_fe.toarray()


# X_claim_fe, X_article_fe = generate_feature_matrix(data)

# print("X_claim_fe matrix shape: " + str(X_claim_fe.shape))
# print("X_article_fe matrix shape: " + str(X_article_fe.shape))

In [3]:
df = data.reset_index(drop=True)
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text. substitute the matched string in REPLACE_BY_SPACE_RE with space.
    text = BAD_SYMBOLS_RE.sub('', text) # remove symbols which are in BAD_SYMBOLS_RE from text. substitute the matched string in BAD_SYMBOLS_RE with nothing. 
    text = text.replace('x', '')
#    text = re.sub(r'\W+', '', text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # remove stopwors from text
    return text

df['article_content'] = df['article_content'].apply(clean_text)
df['article_content'] = df['article_content'].str.replace('\d+', '')

df['claim'] = df['claim'].apply(clean_text)
df['claim'] = df['claim'].str.replace('\d+', '')

df.loc[df['claimant'] == "", "claimant"] = "unknown"
df["num_related_articles"] = df["related_articles"].apply(lambda x: len(x))
df['num_date'] = pd.to_numeric(df['date'].dt.strftime("%Y%m%d"))

In [5]:
# The maximum number of words to be used. (most frequent)
MAX_NB_WORDS = 10000
# Max N words in each complaint.
MAX_SEQUENCE_LENGTH = 1000

EMBEDDING_DIM = 100

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(df['article_content'].values)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 1131145 unique tokens.


In [6]:
X = tokenizer.texts_to_sequences(df['article_content'].values)
X_article = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)
X = tokenizer.texts_to_sequences(df['claim'].values)
X_claim = pad_sequences(X, maxlen=MAX_SEQUENCE_LENGTH)

# X_claim = np.array(X_claim_fe)
# X_article = np.array(X_article_fe)

X_num_articles = df["num_related_articles"].to_numpy().reshape(-1,1)
X_claimant = pd.get_dummies(df['claimant']).values
X_date = df["num_date"].to_numpy().reshape(-1,1)

X_numeric = np.concatenate((X_num_articles, X_claimant, X_date), axis=1)
scaler = StandardScaler()
X_numeric = scaler.fit_transform(X_numeric)

#X_final = np.concatenate((X_article, X_claim, X_num_articles, X_claimant, X_date), axis=1)
#print('Shape of data tensor:', X_final.shape)

In [7]:
Y = pd.get_dummies(df['label']).values
print('Shape of label tensor:', Y.shape)

Shape of label tensor: (15555, 3)


In [8]:
modeling_idx = np.where(np.logical_or(data["fold"] == "train", data["fold"] == "development"))
#dev_idx = np.where(data["fold"] == "development")
test_idx = np.where(data["fold"] == "test")

In [9]:
# define the sets of inputs
numeric_input = Input(shape=(X_numeric.shape[1],))
claim = Input(shape=(X_claim.shape[1],))
article = Input(shape=(X_article.shape[1],))
  
y = Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X_claim.shape[1])(claim)
z = Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=X_article.shape[1])(article)

combined_bilstm = concatenate([y, z])
combined_bilstm = SpatialDropout1D(0.4)(combined_bilstm)
combined_bilstm = Bidirectional(LSTM(64))(combined_bilstm)
combined_bilstm = Dropout(0.5)(combined_bilstm)
combined_bilstm = Dense(9, activation='relu')(combined_bilstm)

numeric_feat = Dense(64, activation="relu")(numeric_input)
numeric_feat = Dense(3, activation="relu")(numeric_feat)

final_model = concatenate([numeric_feat, combined_bilstm])
final_model = Dense(10, activation='relu')(final_model)
final_model = Dense(3, activation='softmax')(final_model)

model = Model(inputs=[numeric_input, claim, article], outputs=final_model)

# balance target for better F1 score prediction

In [10]:
true_idx = np.where(np.logical_and(df['label']==0, np.logical_or(data["fold"] == "train", data["fold"] == "development")))
partly_true_idx = np.where(np.logical_and(df['label']==1, np.logical_or(data["fold"] == "train", data["fold"] == "development")))
false_idx = np.where(np.logical_and(df['label']==2, np.logical_or(data["fold"] == "train", data["fold"] == "development")))

In [11]:
rebal_modeling_idx = np.concatenate((true_idx[0][:len(false_idx[0])], partly_true_idx[0][:len(false_idx[0])], false_idx[0]), axis=0)

In [12]:
epochs = 2
batch_size = 256

# try using different optimizers and different optimizer configs
#adam = Adam(lr=0.01, beta_1=0.9, beta_2=0.999, amsgrad=True)
model.compile(loss='categorical_crossentropy', optimizer="Nadam", metrics=['accuracy'])

print('Train on original dataset')
history = model.fit([X_numeric[modeling_idx], X_claim[modeling_idx], X_article[modeling_idx]], Y[modeling_idx], epochs=epochs, batch_size=batch_size,validation_data=([X_numeric[test_idx], X_claim[test_idx], X_article[test_idx]], Y[test_idx]) ,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])
#print('Train on rebalanced dataset')
#history2 = model.fit([X_numeric[rebal_modeling_idx], X_claim[rebal_modeling_idx], X_article[rebal_modeling_idx]], Y[rebal_modeling_idx], epochs=epochs, batch_size=batch_size,validation_data=([X_numeric[test_idx], X_claim[test_idx], X_article[test_idx]], Y[test_idx]) ,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Train on original dataset


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 13222 samples, validate on 2333 samples
Epoch 1/2
Epoch 2/2


In [16]:
train_rebalanced = True
if train_rebalanced:   
    epochs = 2
    batch_size = 256

    # try using different optimizers and different optimizer configs
    #adam = Adam(lr=0.01, beta_1=0.9, beta_2=0.999, amsgrad=True)
    model.compile(loss='categorical_crossentropy', optimizer="Nadam", metrics=['accuracy'])

    print('Train on rebalanced dataset')
    history2 = model.fit([X_numeric[rebal_modeling_idx], X_claim[rebal_modeling_idx], X_article[rebal_modeling_idx]], Y[rebal_modeling_idx], epochs=epochs, batch_size=batch_size,validation_data=([X_numeric[test_idx], X_claim[test_idx], X_article[test_idx]], Y[test_idx]) ,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

Train on rebalanced dataset
Train on 4347 samples, validate on 2333 samples
Epoch 1/2
Epoch 2/2


In [None]:
import matplotlib.pyplot as plt

plt.title('Loss')
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.xlabel("epoch")
plt.legend()
plt.show()

In [None]:
plt.title('Accuracy')
plt.plot(history.history['acc'], label='train')
plt.plot(history.history['val_acc'], label='test')
plt.xlabel("epoch")
plt.legend()
plt.show()

In [17]:
pred = model.predict([X_numeric[test_idx], X_claim[test_idx], X_article[test_idx]])

pred_class = np.argmax(pred, axis=1)
pd.DataFrame(pred_class).to_pickle(os.path.join(PROJECT_DIR, "predictions\\predictions_bilstm_final_rebal.pkl"))

In [18]:
pd.DataFrame(pred_class)[0].value_counts()

0    1683
1     574
2      76
Name: 0, dtype: int64