In [17]:
import pandas as pd
import numpy as np
import torch
import spacy as sy
import torch.nn as nn
import tqdm
from collections import Counter

from gensim.models import Word2Vec
from gensim.models.fasttext import FastText

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import *

import xgboost as xgb

seed = 2000
np.random.seed(seed)

import tensorflow as tf
import keras.backend as K
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import LSTM, MaxPooling1D, Dropout, Flatten, Dense, Bidirectional
from tensorflow.keras.preprocessing import sequence

import warnings
warnings.filterwarnings("ignore")

nlp_en = sy.load('en_core_web_sm')
all_stopwords = nlp_en.Defaults.stop_words

num_i = 2

In [18]:
train_df = pd.read_csv('/Users/nitanshjain/Documents/Projects/CASE/codefiles/subtask_1_data/train_subtask1_preprocessed_{}.csv'.format(num_i))
print(train_df.head())

               index                                               text  label
0     train_01_0_892  the state alleged they hacked sabata petros ch...      1
1    train_01_1_2714  chale was allegedly chased group about thirty ...      0
2   train_01_10_2619  the farmworkers strike resumed tuesday when th...      1
3  train_01_100_2680  demonstrators have filed for permit hold rally...      1
4  train_01_101_3090  footage the attack which included pregnant wom...      1


In [19]:
dev_df = pd.read_csv('/Users/nitanshjain/Documents/Projects/CASE/codefiles/subtask_1_data/dev_subtask1_preprocessed_{}.csv'.format(num_i))
dev_df.head()

dev_df_labels = pd.read_csv('/Users/nitanshjain/Documents/Projects/CASE/tanfiona CausalNewsCorpus master data-V2/dev_subtask1.csv')
labels = dev_df_labels['label'].values
del(dev_df_labels)
print(labels)
print(dev_df.head())

[1 1 0 1 1 1 1 1 0 1 0 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1 0 1 0
 0 1 1 1 0 1 1 0 0 1 0 0 0 1 1 0 1 1 0 1 0 1 1 0 0 0 1 0 0 0 0 1 1 1 1 0 0
 1 0 0 1 1 0 1 1 0 1 0 1 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 1 1 0 0 1 1 1 1 0 1
 0 1 0 1 1 0 1 0 1 1 1 0 1 0 0 1 1 1 0 0 0 1 1 1 1 1 1 0 1 1 1 0 0 0 1 0 1
 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 0 0 1 1 0 0 1 0 1 0 0 0 0 1 1 1 0 0 0
 0 1 1 1 0 1 1 0 1 0 0 1 1 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 1 1 1 1 0 1 0 1 0
 0 1 1 1 0 0 0 1 1 1 1 1 0 0 1 0 1 0 0 0 0 1 0 1 1 0 1 0 0 1 1 1 0 0 0 1 1
 0 1 1 0 0 1 0 0 0 1 1 0 0 1 1 1 0 1 0 1 0 1 1 1 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 1 1 0 1 1 0 0 1 0 1 0 0 0 1 0 0 1 1 1 1 1 1 0 0 1 0 1 0 1 0 0 1 1 1 1 0
 0 1 0 1 1 1 0]
               index                                               text
0    train_10_0_2136  the movement was catapulted into the headlines...
1     train_10_1_350  several thousand protesters took the streets a...
2   train_10_10_3104  the protest not just about saving medha life b...
3  train_10_100_1188 

In [20]:
def generate_pos_tags(text, min_threshold):
    doc = nlp_en(text)
    pos_tags = [(i.tag_) for i in doc]
    pos_tags = Counter(pos_tags)
    pos_tags = {x: count for x, count in pos_tags.items() if count > min_threshold}
    return pos_tags

def generate_tokens_prob_freq(tokens):
    dct={}
    for i in tokens:
        dct[i]=0
    for i in tokens:
        dct[i]+=1
    prob_freq = {key:float(value)/sum(dct.values()) for (key,value) in dct.items()}
    
    return prob_freq

train_df['pos_tags'] = train_df['text'].apply(lambda x: generate_pos_tags(x, 2))
dev_df['pos_tags'] = train_df['text'].apply(lambda x : generate_pos_tags(x, 2))
train_df['pos_tags_prob'] = train_df['pos_tags'].apply(lambda x: generate_tokens_prob_freq(x))
dev_df['pos_tags_prob'] = dev_df['pos_tags'].apply(lambda x: generate_tokens_prob_freq(x))
train_df.head()

Unnamed: 0,index,text,label,pos_tags,pos_tags_prob
0,train_01_0_892,the state alleged they hacked sabata petros ch...,1,"{'NN': 6, 'NNP': 7, 'CD': 7}","{'NN': 0.3333333333333333, 'NNP': 0.3333333333..."
1,train_01_1_2714,chale was allegedly chased group about thirty ...,0,"{'NN': 4, 'NNS': 3}","{'NN': 0.5, 'NNS': 0.5}"
2,train_01_10_2619,the farmworkers strike resumed tuesday when th...,1,{},{}
3,train_01_100_2680,demonstrators have filed for permit hold rally...,1,"{'NNS': 8, 'NN': 6, 'NNP': 4, 'VBD': 3, 'CC': 3}","{'NNS': 0.2, 'NN': 0.2, 'NNP': 0.2, 'VBD': 0.2..."
4,train_01_101_3090,footage the attack which included pregnant wom...,1,"{'NN': 8, 'VBD': 3, 'JJ': 3, 'VBG': 7, 'NNS': ...","{'NN': 0.14285714285714285, 'VBD': 0.142857142..."


In [21]:
train_df_pos_prob = pd.json_normalize(train_df['pos_tags_prob'])
dev_df_pos_prob = pd.json_normalize(dev_df['pos_tags_prob'])
train_df_pos_prob.replace(np.nan, 0, inplace=True)
dev_df_pos_prob.replace(np.nan, 0, inplace=True)
print(train_df_pos_prob.head())
print(dev_df_pos_prob.head())

         NN       NNP        CD       NNS       VBD        CC        JJ  \
0  0.333333  0.333333  0.333333  0.000000  0.000000  0.000000  0.000000   
1  0.500000  0.000000  0.000000  0.500000  0.000000  0.000000  0.000000   
2  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
3  0.200000  0.200000  0.000000  0.200000  0.200000  0.200000  0.000000   
4  0.142857  0.000000  0.000000  0.142857  0.142857  0.142857  0.142857   

        VBG        IN   DT  ...   VB   RB  VBP  VBN  VBZ  PRP$  HYPH  PRP  \
0  0.000000  0.000000  0.0  ...  0.0  0.0  0.0  0.0  0.0   0.0   0.0  0.0   
1  0.000000  0.000000  0.0  ...  0.0  0.0  0.0  0.0  0.0   0.0   0.0  0.0   
2  0.000000  0.000000  0.0  ...  0.0  0.0  0.0  0.0  0.0   0.0   0.0  0.0   
3  0.000000  0.000000  0.0  ...  0.0  0.0  0.0  0.0  0.0   0.0   0.0  0.0   
4  0.142857  0.142857  0.0  ...  0.0  0.0  0.0  0.0  0.0   0.0   0.0  0.0   

    MD  JJR  
0  0.0  0.0  
1  0.0  0.0  
2  0.0  0.0  
3  0.0  0.0  
4  0.0  0.0  

[

In [22]:
print(train_df_pos_prob.shape)
print(dev_df_pos_prob.shape)
print(train_df_pos_prob.columns)
print(dev_df_pos_prob.columns)

columns = list(set(train_df_pos_prob.columns) - set(dev_df_pos_prob.columns))
print(columns)

for col_name in columns:
    if col_name not in train_df_pos_prob.columns:
        train_df_pos_prob[col_name]=0

    if col_name not in dev_df_pos_prob.columns:
        dev_df_pos_prob[col_name]=0
        
print(train_df_pos_prob.shape)
print(dev_df_pos_prob.shape)

(3075, 21)
(340, 18)
Index(['NN', 'NNP', 'CD', 'NNS', 'VBD', 'CC', 'JJ', 'VBG', 'IN', 'DT', 'WDT',
       'VB', 'RB', 'VBP', 'VBN', 'VBZ', 'PRP$', 'HYPH', 'PRP', 'MD', 'JJR'],
      dtype='object')
Index(['NN', 'NNP', 'CD', 'NNS', 'VBD', 'CC', 'JJ', 'VBG', 'IN', 'DT', 'WDT',
       'VB', 'RB', 'VBP', 'VBN', 'VBZ', 'PRP$', 'HYPH'],
      dtype='object')
['PRP', 'JJR', 'MD']
(3075, 21)
(340, 21)


In [38]:
scaler = MinMaxScaler()
x = train_df_pos_prob.values
x = scaler.fit_transform(x)
y = train_df['label'].values

x_dev = dev_df_pos_prob.values
y_dev = labels

# Calculating Classweights
class_weights = compute_class_weight(
    class_weight = "balanced",
    classes = np.unique(y),
    y = y
)
class_weights = dict(zip(np.unique(y), class_weights))

count_0 = np.unique(y, return_counts=True)[1][0]
count_1 = np.unique(y, return_counts=True)[1][1]
estimate = count_0/count_1

cv = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)

print(class_weights, estimate)

{0: 1.0596140592694694, 1: 0.9467364532019704} 0.8934729064039408


In [39]:
xgb_model = xgb.XGBClassifier(scale_pos_weight=estimate)

parameters = {
            'objective':['binary:logistic'],
            'learning_rate': [0.1, 0.01, 0.001, 0.0001], 
            'max_depth': [5, 6, 7, 8],
            'n_estimators': [1000], #number of trees, change it to 1000 for better results
            'seed': [1337]
        }

clf = GridSearchCV(xgb_model, parameters, n_jobs=5, 
                   cv=cv, 
                   verbose=0, refit=True)

clf.fit(x, y)
print(clf.best_params_, clf.best_score_)

y_pred = clf.predict(x_dev)
print(classification_report(y_dev, y_pred))
print(f1_score(y_dev, y_pred))
print(recall_score(y_dev, y_pred))
print(precision_score(y_dev, y_pred))
print(matthews_corrcoef(y_dev, y_pred))

{'learning_rate': 0.001, 'max_depth': 5, 'n_estimators': 1000, 'objective': 'binary:logistic', 'seed': 1337} 0.6081300813008129
              precision    recall  f1-score   support

           0       0.44      0.51      0.47       155
           1       0.52      0.45      0.48       185

    accuracy                           0.48       340
   macro avg       0.48      0.48      0.48       340
weighted avg       0.48      0.48      0.48       340

0.48255813953488375
0.4486486486486487
0.5220125786163522
-0.041598564339664246


In [40]:
mnb = MultinomialNB()

parameters = {
            'fit_prior': [True, False],
            'class_prior': [None, [0.5, 0.5], [0.6, 0.4], [0.4, 0.6]]
        }

mnb_gsc = GridSearchCV(mnb, parameters, n_jobs=5, 
                   cv=cv, 
                   verbose=0, refit=True)

mnb_gsc.fit(x, y)
print(mnb_gsc.best_params_, mnb_gsc.best_score_)

y_pred = mnb_gsc.predict(x_dev)
print(classification_report(y_dev, y_pred))
print(f1_score(y_dev, y_pred))
print(recall_score(y_dev, y_pred))
print(precision_score(y_dev, y_pred))
print(matthews_corrcoef(y_dev, y_pred))

{'class_prior': None, 'fit_prior': False} 0.5899186991869918
              precision    recall  f1-score   support

           0       0.45      0.52      0.48       155
           1       0.53      0.45      0.49       185

    accuracy                           0.49       340
   macro avg       0.49      0.49      0.49       340
weighted avg       0.49      0.49      0.49       340

0.489795918367347
0.4540540540540541
0.5316455696202531
-0.023332369848381523


In [26]:
def binary_f1(y_true, y_pred):
    
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2 * (precision * recall)/(precision + recall + K.epsilon())
    
    return f1_val

x = x[:,:,None]

red_lr = tf.keras.callbacks.ReduceLROnPlateau(
            monitor="val_loss", 
            factor=0.6,
            patience=2, 
            min_lr=0.0001,
            verbose=1)



In [27]:
model_lstm = Sequential()

model_lstm.add(LSTM(64, input_shape = x.shape[1:], return_sequences = True))
model_lstm.add(LSTM(64))

model_lstm.add(Dense(64, activation = 'relu'))
model_lstm.add(Flatten())
model_lstm.add(Dense(1, activation='sigmoid'))

model_lstm.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='binary_crossentropy', metrics = [binary_f1])

model_lstm.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_4 (LSTM)               (None, 21, 64)            16896     
                                                                 
 lstm_5 (LSTM)               (None, 64)                33024     
                                                                 
 dense_4 (Dense)             (None, 64)                4160      
                                                                 
 flatten_2 (Flatten)         (None, 64)                0         
                                                                 
 dense_5 (Dense)             (None, 1)                 65        
                                                                 
Total params: 54,145
Trainable params: 54,145
Non-trainable params: 0
_________________________________________________________________


In [28]:
batch_size = 128

model_lstm.fit(x, y,
            batch_size=batch_size,
            epochs=20,
            validation_data=(x_dev, y_dev),
            class_weight=class_weights,
            shuffle=True, 
            callbacks=[red_lr])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 3: ReduceLROnPlateau reducing learning rate to 0.0006000000284984708.
Epoch 4/20
Epoch 5/20
Epoch 5: ReduceLROnPlateau reducing learning rate to 0.0003600000170990825.
Epoch 6/20
Epoch 7/20
Epoch 7: ReduceLROnPlateau reducing learning rate to 0.00021600000327453016.
Epoch 8/20
Epoch 9/20
Epoch 9: ReduceLROnPlateau reducing learning rate to 0.00012960000021848827.
Epoch 10/20
Epoch 11/20
Epoch 11: ReduceLROnPlateau reducing learning rate to 0.0001.
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x169a4baf0>

In [33]:
y_pred = model_lstm.predict(x_dev)
y_pred_final = np.where(y_pred > 0.5, 1, 0)
print(classification_report(y_dev, y_pred_final))
print(f1_score(y_dev, y_pred_final))
print(recall_score(y_dev, y_pred_final))
print(precision_score(y_dev, y_pred_final))
print(matthews_corrcoef(y_dev, y_pred_final))

              precision    recall  f1-score   support

           0       0.53      0.13      0.21       155
           1       0.55      0.90      0.69       185

    accuracy                           0.55       340
   macro avg       0.54      0.52      0.45       340
weighted avg       0.54      0.55      0.47       340

0.6858316221765914
0.9027027027027027
0.5529801324503312
0.05016425989944509


In [30]:
model_bilstm  =  Sequential()

model_bilstm.add(Bidirectional(LSTM(64, input_shape = x.shape[1:], return_sequences=True)))
model_bilstm.add(Bidirectional(LSTM(64)))

model_bilstm.add(Dense(64, activation = 'relu'))
model_bilstm.add(Flatten())
model_bilstm.add(Dense(1, activation = 'sigmoid'))

model_bilstm.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss='binary_crossentropy', metrics = [binary_f1])
model_bilstm.build(input_shape=x.shape)
model_bilstm.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_2 (Bidirectio  (3075, 21, 128)          33792     
 nal)                                                            
                                                                 
 bidirectional_3 (Bidirectio  (3075, 128)              98816     
 nal)                                                            
                                                                 
 dense_6 (Dense)             (3075, 64)                8256      
                                                                 
 flatten_3 (Flatten)         (3075, 64)                0         
                                                                 
 dense_7 (Dense)             (3075, 1)                 65        
                                                                 
Total params: 140,929
Trainable params: 140,929
Non-tr

In [31]:
batch_size = 128

model_bilstm.fit(x, y,
            batch_size=batch_size,
            epochs=20,
            validation_data=(x_dev, y_dev),
            class_weight=class_weights,
            shuffle=True,
            callbacks=[red_lr])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 4: ReduceLROnPlateau reducing learning rate to 0.0006000000284984708.
Epoch 5/20
Epoch 6/20
Epoch 6: ReduceLROnPlateau reducing learning rate to 0.0003600000170990825.
Epoch 7/20
Epoch 8/20
Epoch 8: ReduceLROnPlateau reducing learning rate to 0.00021600000327453016.
Epoch 9/20
Epoch 10/20
Epoch 10: ReduceLROnPlateau reducing learning rate to 0.00012960000021848827.
Epoch 11/20
Epoch 12/20
Epoch 12: ReduceLROnPlateau reducing learning rate to 0.0001.
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x16273fee0>

In [35]:
y_pred = model_bilstm.predict(x_dev)
y_pred_final = np.where(y_pred > 0.5, 1, 0)
print(classification_report(y_dev, y_pred_final))
print(f1_score(y_dev, y_pred_final))
print(recall_score(y_dev, y_pred_final))
print(precision_score(y_dev, y_pred_final))
print(matthews_corrcoef(y_dev, y_pred_final))

              precision    recall  f1-score   support

           0       0.48      0.40      0.44       155
           1       0.56      0.64      0.60       185

    accuracy                           0.53       340
   macro avg       0.52      0.52      0.52       340
weighted avg       0.52      0.53      0.52       340

0.595959595959596
0.6378378378378379
0.5592417061611374
0.03883666618024915
