In [1]:
import pandas as pd
import numpy as np
import nltk
import tensorflow

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

seed = 2000
np.random.seed(seed)
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import BatchNormalization, Dropout, Flatten, Dense, Embedding
from tensorflow.keras.layers import LSTM, Conv1D, MaxPooling1D
from tensorflow.keras.preprocessing import sequence

import warnings
warnings.filterwarnings("ignore")

2023-01-07 12:40:01.777107: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
train_sub1_df = pd.read_csv('/Users/nitanshjain/Documents/Miscellaneous/SemEval/semeval2023task3/preprocessed_data/subtask1/train_subtask_1.csv')
print(train_sub1_df.shape)
train_sub1_df.head()
print(train_sub1_df.genre.value_counts())

(433, 10)
opinion      382
reporting     41
satire        10
Name: genre, dtype: int64


In [3]:
le = LabelEncoder()
train_sub1_df['genre'] = le.fit_transform(train_sub1_df['genre'])
print(train_sub1_df.genre.value_counts())
train_sub1_df.head()

0    382
1     41
2     10
Name: genre, dtype: int64


Unnamed: 0,id,genre,headlines,articles,preprocessed_headlines,pos_tags_headlines,er_tags_headlines,preprocessed_articles,pos_tags_articles,er_tags_articles
0,833042063,2,Chelsea Handler Admits She’s ‘Very Sexually At...,Far-left comedienne Chelsea Handler has admitt...,chelsea handler admit she s very sexually attr...,"[(Chelsea, 'NNP'), (Handler, 'NNP'), (admit, '...","[(Chelsea Handler, 'PERSON', 380), (Robert Mue...",far leave comedienne chelsea handler have admi...,"[(far, 'RB'), (leave, 'VB'), (comedienne, 'NNP...","[(Chelsea Handler, 'PERSON', 380), (FBI Specia..."
1,832959523,2,How Theresa May Botched\n,Those were the times…\nThe Times page 1 is of ...,how theresa may botch,"[(how, 'WRB'), (Theresa, 'NNP'), (May, 'NNP'),...","[(Theresa, 'GPE', 384), (May, 'DATE', 391)]",those be the time the times page 1 be of janua...,"[(those, 'DT'), (be, 'VBP'), (the, 'DT'), (tim...","[(Times, 'ORG', 383), (1, 'CARDINAL', 397), (J..."
2,833039623,2,Robert Mueller III Rests His Case—Dems NEVER W...,Carload of crazies headed to the White House w...,robert mueller iii rest his case dems never will,"[(Robert, 'NNP'), (Mueller, 'NNP'), (III, 'NNP...","[(Robert Mueller III, 'PERSON', 380), (Dems, '...",carload of crazy head to the white house want ...,"[(Carload, 'NNP'), (of, 'IN'), (crazy, 'JJ'), ...","[(the White House, 'ORG', 383), (Barack Obama,..."
3,833032367,2,Robert Mueller Not Recommending Any More Indic...,"But of course, this makes no difference to the...",robert mueller not recommend any more indictment,"[(Robert, 'NNP'), (Mueller, 'NNP'), (not, 'RB'...","[(Robert Mueller, 'PERSON', 380)]",but of course this make no difference to the p...,"[(but, 'CC'), (of, 'IN'), (course, 'NN'), (thi...","[(the New York Times, 'ORG', 383), (late Frida..."
4,814777937,2,The Far Right Is Trying to Co-opt the Yellow V...,"This weekend in Manchester, England, Yellow Ve...",the far right be try to co opt the yellow vests,"[(the, 'DT'), (Far, 'NNP'), (right, 'NN'), (be...",[],this weekend in manchester england yellow vest...,"[(this, 'DT'), (weekend, 'NN'), (in, 'IN'), (M...","[(this weekend, 'DATE', 391), (Manchester, 'GP..."


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer 

x = train_sub1_df.preprocessed_articles + train_sub1_df.preprocessed_headlines
y = train_sub1_df.genre

tfidf_vectorizer = TfidfVectorizer(use_idf = True) 
tfidf_vectorizer.fit(x)

x_tfidf = tfidf_vectorizer.transform(x).toarray()
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=seed)

# KNN

In [5]:
from sklearn.neighbors import KNeighborsClassifier
knn_params = {
    'n_neighbors':range(1,15),
    'weights':['uniform', 'distance']
}

knn = KNeighborsClassifier()
clf = GridSearchCV(knn, knn_params, cv=cv)
clf.fit(x_tfidf, y)

print('Best Accuracy Score ', clf.best_score_)
print('Best Parameters', clf.best_params_)

Best Accuracy Score  0.8922128259337563
Best Parameters {'n_neighbors': 6, 'weights': 'uniform'}


# Logistic Regression

In [6]:
from sklearn.linear_model import LogisticRegression
lr_params = {
    'C':np.logspace(-2,2,7), 
    'penalty':['l1', 'l2', 'elasticnet', None]
}

lr = LogisticRegression()
clf = GridSearchCV(lr, lr_params, cv=cv)
clf.fit(x_tfidf, y)

print('Best Accuracy Score ', clf.best_score_)
print('Best Parameters', clf.best_params_)

Best Accuracy Score  0.887632135306554
Best Parameters {'C': 4.6415888336127775, 'penalty': 'l2'}


# Decison Tree

In [7]:
from sklearn.tree import DecisionTreeClassifier

dt_params = {
    'max_depth':range(2,15),
    'criterion':['gini', 'entropy', 'log_loss'],
    'splitter':['best', 'random']
}

dt = DecisionTreeClassifier()
clf = GridSearchCV(dt, dt_params, cv=cv)
clf.fit(x_tfidf, y)

print('Best Accuracy Score ', clf.best_score_)
print('Best Parameters', clf.best_params_)

Best Accuracy Score  0.8845313601127557
Best Parameters {'criterion': 'gini', 'max_depth': 2, 'splitter': 'random'}


# Random Forest Classifier

In [8]:
from sklearn.ensemble import RandomForestClassifier

rfc_params = {
    'max_depth':range(3,15),
    'criterion':['gini', 'entropy', 'log_loss'],
}

rfc = RandomForestClassifier()
clf = GridSearchCV(rfc, rfc_params, cv=cv)
clf.fit(x_tfidf, y)

print('Best Accuracy Score ', clf.best_score_)
print('Best Parameters', clf.best_params_)

Best Accuracy Score  0.8822410147991546
Best Parameters {'criterion': 'gini', 'max_depth': 3}


# Multinomial Naive Bayes

In [9]:
%%time
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(x_tfidf, y)

print('Train Accuracy Score', mnb.score(x_tfidf, y))

Train Accuracy Score 0.8822170900692841
CPU times: user 78.8 ms, sys: 25.6 ms, total: 104 ms
Wall time: 49.2 ms


# SVM

In [10]:
from sklearn.svm import SVC

svc_params = {
    'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
    'degree':range(2,5)
}

svc = SVC()
clf = GridSearchCV(svc, svc_params, cv=cv)
clf.fit(x_tfidf, y)

print('Best Accuracy Score ', clf.best_score_)
print('Best Parameters', clf.best_params_)

Best Accuracy Score  0.8853241719520792
Best Parameters {'degree': 2, 'kernel': 'poly'}


# Ada-Boosting

In [11]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier()
ada.fit(x_tfidf, y)

print('Train Accuracy Score', ada.score(x_tfidf, y))

Train Accuracy Score 0.8822170900692841


# LSTM

In [None]:
x_tfidf = x_tfidf[:,:,None]

In [32]:
from keras import Sequential
from keras.layers import LSTM

model_lstm = Sequential()

model_lstm.add(LSTM(64, input_shape = x_tfidf.shape[1:], return_sequences = True))
model_lstm.add(MaxPooling1D(pool_size = (5), padding = 'same'))
model_lstm.add(Dropout(rate=0.8))

model_lstm.add(LSTM(32, return_sequences=True))
model_lstm.add(MaxPooling1D(pool_size = (5), padding = 'same'))
model_lstm.add(Dropout(rate=0.5))

model_lstm.add(LSTM(16, return_sequences=True))
model_lstm.add(MaxPooling1D(pool_size = (5), padding = 'same'))
model_lstm.add(Dropout(rate=0.5))

model_lstm.add(LSTM(8, return_sequences=True))
model_lstm.add(MaxPooling1D(pool_size = (5), padding = 'same'))
model_lstm.add(Dropout(rate=0.5))

model_lstm.add(LSTM(1, return_sequences=True, name='output'))
model_lstm.add(MaxPooling1D(pool_size = (5), padding = 'same'))
model_lstm.add(Dropout(rate=0.5))

model_lstm.add(Dense(64, activation = 'relu'))
model_lstm.add(Flatten())
model_lstm.add(Dense(1, activation='softmax'))

model_lstm.compile(optimizer='adam', loss='categorical_crossentropy', metrics = ['accuracy'])

model_lstm.summary()

2022-12-27 14:13:43.797353: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 11720, 64)         16896     
                                                                 
 dropout (Dropout)           (None, 11720, 64)         0         
                                                                 
 lstm_1 (LSTM)               (None, 11720, 32)         12416     
                                                                 
 dropout_1 (Dropout)         (None, 11720, 32)         0         
                                                                 
 lstm_2 (LSTM)               (None, 11720, 16)         3136      
                                                                 
 dropout_2 (Dropout)         (None, 11720, 16)         0         
                                                                 
 lstm_3 (LSTM)               (None, 11720, 8)          8

In [33]:
batch_size = 32

model_lstm.fit(x_tfidf, y,
            batch_size=batch_size,
            epochs=10,
            shuffle=True)

Epoch 1/5

# CNN LSTM

In [None]:
from keras import Sequential
from keras.layers import LSTM

model_clstm  =  Sequential()

model_clstm.add(Conv1D(32, (3), padding = 'same', activation = 'relu', input_shape = x_tfidf.shape[1:]))
model_clstm.add(MaxPooling1D(pool_size = (5), padding = 'same'))
model_clstm.add(Dropout(rate = 0.8))

model_clstm.add(Conv1D(16, (3), padding = 'same', activation = 'relu', input_shape = x_tfidf.shape[1:]))
model_clstm.add(MaxPooling1D(pool_size = (5), padding = 'same'))
model_clstm.add(Dropout(rate = 0.8))

model_clstm.add(Conv1D(8, (3), padding = 'same', activation = 'relu', input_shape = x_tfidf.shape[1:]))
model_clstm.add(MaxPooling1D(pool_size = (5), padding = 'same'))
model_clstm.add(Dropout(rate = 0.8))

model_clstm.add(LSTM(32, return_sequences = True))
model_clstm.add(MaxPooling1D(pool_size = (5), padding = 'same'))
model_clstm.add(Dropout(rate = 0.5))

model_clstm.add(LSTM(16, return_sequences = True))
model_clstm.add(MaxPooling1D(pool_size = (5), padding = 'same'))
model_clstm.add(Dropout(rate = 0.5))

model_clstm.add(LSTM(1, return_sequences = True))
model_clstm.add(MaxPooling1D(pool_size = (5), padding = 'same'))
model_clstm.add(Dropout(rate = 0.5))

model_clstm.add(Dense(64, activation = 'relu'))
model_clstm.add(Flatten())
model_clstm.add(Dense(1, activation = 'softmax'))

model_clstm.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

model_clstm.summary()

In [None]:
batch_size = 32

model_clstm.fit(x_tfidf, y,
            batch_size=batch_size,
            epochs=10,
            shuffle=True)

# Bi-LSTM

In [None]:

from keras import Sequential
from keras.layers import LSTM, Embedding, Bidirectional


model_bilstm  =  Sequential()

model_bilstm.add(Bidirectional(LSTM(32, input_shape = x_tfidf.shape[1:], return_sequences=True)))
model_bilstm.add(MaxPooling1D(pool_size = (5), padding = 'same'))
model_bilstm.add(Dropout(rate = 0.5))

model_bilstm.add(Bidirectional(LSTM(16, return_sequences=True)))
model_bilstm.add(MaxPooling1D(pool_size = (5), padding = 'same'))
model_bilstm.add(Dropout(rate = 0.5))

model_bilstm.add(Bidirectional(LSTM(1, return_sequences=True)))
model_bilstm.add(MaxPooling1D(pool_size = (5), padding = 'same'))
model_bilstm.add(Dropout(rate = 0.5))

model_bilstm.add(Dense(64, activation = 'relu'))
model_bilstm.add(Flatten())
model_bilstm.add(Dense(1, activation = 'softmax'))

model_bilstm.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

model_bilstm.summary()

In [None]:
batch_size = 32

model_bilstm.fit(x_tfidf, y,
            batch_size=batch_size,
            epochs=10,
            shuffle=True)