In [1]:
import pandas as pd
import numpy as np
import tensorflow

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RepeatedStratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

seed = 2000
np.random.seed(seed)
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import BatchNormalization, Dropout, Flatten, Dense, Embedding
from tensorflow.keras.layers import LSTM, Conv1D, MaxPooling1D
from tensorflow.keras.preprocessing import sequence

from gensim.models import Word2Vec
from gensim.parsing.porter import PorterStemmer

import nltk
from nltk.tokenize import TweetTokenizer

import warnings
warnings.filterwarnings("ignore")

2023-01-04 12:34:41.636516: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
train_sub1_df = pd.read_csv('/Users/nitanshjain/Documents/Miscellaneous/SemEval/semeval2023task3/preprocessed_data/subtask1/train_subtask_1.csv')
print(train_sub1_df.shape)
train_sub1_df.head()
print(train_sub1_df.genre.value_counts())

(433, 10)
opinion      382
reporting     41
satire        10
Name: genre, dtype: int64


In [3]:
le = LabelEncoder()
train_sub1_df['genre'] = le.fit_transform(train_sub1_df['genre'])
print(train_sub1_df.genre.value_counts())
train_sub1_df.head()

0    382
1     41
2     10
Name: genre, dtype: int64


Unnamed: 0,id,genre,headlines,articles,preprocessed_headlines,pos_tags_headlines,er_tags_headlines,preprocessed_articles,pos_tags_articles,er_tags_articles
0,833042063,2,Chelsea Handler Admits She’s ‘Very Sexually At...,Far-left comedienne Chelsea Handler has admitt...,chelsea handler admit she s very sexually attr...,"[(Chelsea, 'NNP'), (Handler, 'NNP'), (admit, '...","[(Chelsea Handler, 'PERSON', 380), (Robert Mue...",far leave comedienne chelsea handler have admi...,"[(far, 'RB'), (leave, 'VB'), (comedienne, 'NNP...","[(Chelsea Handler, 'PERSON', 380), (FBI Specia..."
1,832959523,2,How Theresa May Botched\n,Those were the times…\nThe Times page 1 is of ...,how theresa may botch,"[(how, 'WRB'), (Theresa, 'NNP'), (May, 'NNP'),...","[(Theresa, 'GPE', 384), (May, 'DATE', 391)]",those be the time the times page 1 be of janua...,"[(those, 'DT'), (be, 'VBP'), (the, 'DT'), (tim...","[(Times, 'ORG', 383), (1, 'CARDINAL', 397), (J..."
2,833039623,2,Robert Mueller III Rests His Case—Dems NEVER W...,Carload of crazies headed to the White House w...,robert mueller iii rest his case dems never will,"[(Robert, 'NNP'), (Mueller, 'NNP'), (III, 'NNP...","[(Robert Mueller III, 'PERSON', 380), (Dems, '...",carload of crazy head to the white house want ...,"[(Carload, 'NNP'), (of, 'IN'), (crazy, 'JJ'), ...","[(the White House, 'ORG', 383), (Barack Obama,..."
3,833032367,2,Robert Mueller Not Recommending Any More Indic...,"But of course, this makes no difference to the...",robert mueller not recommend any more indictment,"[(Robert, 'NNP'), (Mueller, 'NNP'), (not, 'RB'...","[(Robert Mueller, 'PERSON', 380)]",but of course this make no difference to the p...,"[(but, 'CC'), (of, 'IN'), (course, 'NN'), (thi...","[(the New York Times, 'ORG', 383), (late Frida..."
4,814777937,2,The Far Right Is Trying to Co-opt the Yellow V...,"This weekend in Manchester, England, Yellow Ve...",the far right be try to co opt the yellow vests,"[(the, 'DT'), (Far, 'NNP'), (right, 'NN'), (be...",[],this weekend in manchester england yellow vest...,"[(this, 'DT'), (weekend, 'NN'), (in, 'IN'), (M...","[(this weekend, 'DATE', 391), (Manchester, 'GP..."


In [4]:
def creating_tokens(df):
    tokens = list()
    tokenizer = TweetTokenizer()
    
    for tweets in df.loc[:, 'preprocessed_headlines']:
        # print(len(tokenizer.tokenize(tweets)))
        tokens.append(tokenizer.tokenize(tweets))
    
    df['tokens_headlines'] = tokens
    
    porter_stemmer = PorterStemmer()
    # Get the stemmed_tokens
    df['stemmed_tokens_headlines'] = [[porter_stemmer.stem(word) for word in tokens] for tokens in df['tokens_headlines']]
    
    return df

In [5]:
train_sub1_df = creating_tokens(train_sub1_df)
x_tokens = train_sub1_df.stemmed_tokens_headlines
x_tokens = x_tokens.to_frame().reset_index()

y = train_sub1_df.genre

OUTPUT_FOLDER = '/Users/nitanshjain/Documents/Miscellaneous/SemEval/semeval2023task3/codefiles/subtask1/word2vec/skipgram/headlines/'

tokens = pd.Series(train_sub1_df.stemmed_tokens_headlines).values
# print(tokens)
word2vec_model_file = OUTPUT_FOLDER + 'word2vec_subtask1_' + str(200) + '.model'

w2v_model = Word2Vec(tokens, min_count=1, vector_size=200, window=5, workers=4, sg=2)
w2v_model.train(tokens, epochs=10, total_examples=len(tokens))
w2v_model.save(word2vec_model_file)

In [6]:
def create_file(create_file, model_file, x):
    sg_w2v_model = Word2Vec.load(model_file)
    
    with open(create_file, 'w+') as word2vec_file:
        for index, row in x.iterrows():
            model_vector = (np.mean([sg_w2v_model.wv[token] for token in row['stemmed_tokens_headlines']], axis=0)).tolist()
            if index == 0:
                header = ",".join(str(ele) for ele in range(200))
                word2vec_file.write(header)
                word2vec_file.write("\n")
            
            if type(model_vector) is list:
                line1 = ",".join( [str(vector_element) for vector_element in model_vector] )
            else:
                line1 = ",".join([str(0) for i in range(200)])
            word2vec_file.write(line1)
            word2vec_file.write('\n')
    
    df = pd.read_csv(create_file)
    return df
        

In [7]:
word2vec_train_filename = OUTPUT_FOLDER + 'word2vec_subtask1_train_' + str(200) + '.csv'
word2vec_train_df = create_file(word2vec_train_filename, word2vec_model_file, x_tokens)
print(word2vec_train_df.shape)
word2vec_train_df.head()

(433, 200)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.007295,-0.01874,-0.009705,0.04104,0.100159,-0.089804,0.047175,0.205242,-0.145307,0.062558,...,0.11526,-0.041857,-0.118474,-0.077384,0.095471,0.082259,0.038398,-0.112743,-0.014278,-0.028109
1,0.009126,-0.014582,-0.009387,0.02982,0.076703,-0.068332,0.034962,0.157911,-0.109272,0.046406,...,0.090417,-0.030808,-0.088585,-0.056323,0.071191,0.062683,0.028655,-0.086166,-0.012166,-0.01796
2,0.011011,-0.023,-0.011841,0.04798,0.11037,-0.105264,0.051632,0.232545,-0.164374,0.07059,...,0.131883,-0.049441,-0.133366,-0.092944,0.108925,0.093157,0.040405,-0.127402,-0.018062,-0.032354
3,0.011746,-0.023324,-0.012114,0.048869,0.110648,-0.101139,0.052997,0.234277,-0.161939,0.071568,...,0.129691,-0.047879,-0.134935,-0.092938,0.10696,0.093106,0.042178,-0.128909,-0.016888,-0.026392
4,0.008064,-0.016475,-0.011208,0.038,0.091253,-0.082179,0.043899,0.188566,-0.133965,0.057028,...,0.107118,-0.035528,-0.108976,-0.073549,0.089265,0.077006,0.035224,-0.104844,-0.015905,-0.023591


In [8]:
from sklearn.preprocessing import MinMaxScaler
mm = MinMaxScaler()

x_scaled = pd.DataFrame(mm.fit_transform(word2vec_train_df))
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=seed)
x_scaled.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,190,191,192,193,194,195,196,197,198,199
0,0.642934,0.370563,0.518074,0.636729,0.653978,0.298604,0.639727,0.694178,0.332021,0.684815,...,0.657444,0.316452,0.298153,0.295782,0.618413,0.688417,0.613756,0.295067,0.499121,0.334288
1,0.714476,0.557636,0.533225,0.410322,0.452471,0.527458,0.405271,0.479546,0.55801,0.423136,...,0.460378,0.553733,0.541473,0.54563,0.405894,0.470103,0.395571,0.51883,0.600845,0.627679
2,0.78815,0.178872,0.416575,0.776773,0.741701,0.133832,0.725283,0.81799,0.212444,0.814944,...,0.789305,0.153581,0.176924,0.111188,0.736178,0.809949,0.658707,0.171654,0.316878,0.21155
3,0.816877,0.164312,0.403606,0.794708,0.744083,0.177797,0.751474,0.825841,0.22772,0.830791,...,0.771914,0.187122,0.164148,0.111254,0.718977,0.809387,0.698431,0.158967,0.373437,0.383925
4,0.672975,0.472448,0.446642,0.575379,0.577472,0.379875,0.576838,0.618557,0.403149,0.595229,...,0.592862,0.452382,0.375473,0.341275,0.564092,0.629836,0.542673,0.361577,0.42078,0.464888


# KNN

In [9]:
from sklearn.neighbors import KNeighborsClassifier
knn_params = {
    'n_neighbors':range(1,15),
    'weights':['uniform', 'distance']
}

knn = KNeighborsClassifier()
clf = GridSearchCV(knn, knn_params, cv=cv)
clf.fit(x_scaled, y)

print('Best Accuracy Score ', clf.best_score_)
print('Best Parameters', clf.best_params_)

Best Accuracy Score  0.886804087385483
Best Parameters {'n_neighbors': 6, 'weights': 'uniform'}


# Logistic Regression

In [10]:
from sklearn.linear_model import LogisticRegression
lr_params = {
    'C':np.logspace(-2,2,7), 
    'penalty':['l1', 'l2', 'elasticnet', None]
}

lr = LogisticRegression()
clf = GridSearchCV(lr, lr_params, cv=cv)
clf.fit(x_scaled, y)

print('Best Accuracy Score ', clf.best_score_)
print('Best Parameters', clf.best_params_)

Best Accuracy Score  0.8822410147991546
Best Parameters {'C': 0.01, 'penalty': 'l2'}


# Decison Tree

In [11]:
from sklearn.tree import DecisionTreeClassifier

dt_params = {
    'max_depth':range(2,15),
    'criterion':['gini', 'entropy', 'log_loss'],
    'splitter':['best', 'random']
}

dt = DecisionTreeClassifier()
clf = GridSearchCV(dt, dt_params, cv=cv)
clf.fit(x_scaled, y)

print('Best Accuracy Score ', clf.best_score_)
print('Best Parameters', clf.best_params_)

Best Accuracy Score  0.8791754756871039
Best Parameters {'criterion': 'entropy', 'max_depth': 3, 'splitter': 'random'}


# Random Forest Classifier

In [12]:
from sklearn.ensemble import RandomForestClassifier

rfc_params = {
    'max_depth':range(3,15),
    'criterion':['gini', 'entropy', 'log_loss'],
}

rfc = RandomForestClassifier()
clf = GridSearchCV(rfc, rfc_params, cv=cv)
clf.fit(x_scaled, y)

print('Best Accuracy Score ', clf.best_score_)
print('Best Parameters', clf.best_params_)

Best Accuracy Score  0.8822410147991546
Best Parameters {'criterion': 'gini', 'max_depth': 3}


# Multinomial Naive Bayes

In [13]:
%%time
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(x_scaled, y)

print('Train Accuracy Score', mnb.score(x_scaled, y))

Train Accuracy Score 0.8568129330254042
CPU times: user 13.6 ms, sys: 10.2 ms, total: 23.9 ms
Wall time: 22.9 ms


# SVM

In [14]:
from sklearn.svm import SVC

svc_params = {
    'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
    'degree':range(2,5)
}

svc = SVC()
clf = GridSearchCV(svc, svc_params, cv=cv)
clf.fit(x_scaled, y)

print('Best Accuracy Score ', clf.best_score_)
print('Best Parameters', clf.best_params_)

Best Accuracy Score  0.8822410147991546
Best Parameters {'degree': 2, 'kernel': 'linear'}


# Ada-Boosting

In [15]:
from sklearn.ensemble import AdaBoostClassifier

ada = AdaBoostClassifier()
ada.fit(x_scaled, y)

print('Train Accuracy Score', ada.score(x_scaled, y))

Train Accuracy Score 0.766743648960739


# LSTM

In [None]:
x_scaled = x_scaled[:,:,None]

In [None]:
from keras import Sequential
from keras.layers import LSTM

model_lstm = Sequential()

model_lstm.add(LSTM(64, input_shape = x_scaled.shape[1:], return_sequences = True))
model_lstm.add(MaxPooling1D(pool_size = (5), padding = 'same'))
model_lstm.add(Dropout(rate=0.8))

model_lstm.add(LSTM(32, return_sequences=True))
model_lstm.add(MaxPooling1D(pool_size = (5), padding = 'same'))
model_lstm.add(Dropout(rate=0.5))

model_lstm.add(LSTM(16, return_sequences=True))
model_lstm.add(MaxPooling1D(pool_size = (5), padding = 'same'))
model_lstm.add(Dropout(rate=0.5))

model_lstm.add(LSTM(8, return_sequences=True))
model_lstm.add(MaxPooling1D(pool_size = (5), padding = 'same'))
model_lstm.add(Dropout(rate=0.5))

model_lstm.add(LSTM(1, return_sequences=True, name='output'))
model_lstm.add(MaxPooling1D(pool_size = (5), padding = 'same'))
model_lstm.add(Dropout(rate=0.5))

model_lstm.add(Dense(64, activation = 'relu'))
model_lstm.add(Flatten())
model_lstm.add(Dense(1, activation='softmax'))

model_lstm.compile(optimizer='adam', loss='categorical_crossentropy', metrics = ['accuracy'])

model_lstm.summary()

2022-12-27 14:13:43.797353: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 11720, 64)         16896     
                                                                 
 dropout (Dropout)           (None, 11720, 64)         0         
                                                                 
 lstm_1 (LSTM)               (None, 11720, 32)         12416     
                                                                 
 dropout_1 (Dropout)         (None, 11720, 32)         0         
                                                                 
 lstm_2 (LSTM)               (None, 11720, 16)         3136      
                                                                 
 dropout_2 (Dropout)         (None, 11720, 16)         0         
                                                                 
 lstm_3 (LSTM)               (None, 11720, 8)          8

In [None]:
batch_size = 32

model_lstm.fit(x_scaled, y,
            batch_size=batch_size,
            epochs=10,
            shuffle=True)

Epoch 1/5

# CNN LSTM

In [None]:
from keras import Sequential
from keras.layers import LSTM

model_clstm  =  Sequential()

model_clstm.add(Conv1D(32, (3), padding = 'same', activation = 'relu', input_shape = x_scaled.shape[1:]))
model_clstm.add(MaxPooling1D(pool_size = (5), padding = 'same'))
model_clstm.add(Dropout(rate = 0.8))

model_clstm.add(Conv1D(16, (3), padding = 'same', activation = 'relu', input_shape = x_scaled.shape[1:]))
model_clstm.add(MaxPooling1D(pool_size = (5), padding = 'same'))
model_clstm.add(Dropout(rate = 0.8))

model_clstm.add(Conv1D(8, (3), padding = 'same', activation = 'relu', input_shape = x_scaled.shape[1:]))
model_clstm.add(MaxPooling1D(pool_size = (5), padding = 'same'))
model_clstm.add(Dropout(rate = 0.8))

model_clstm.add(LSTM(32, return_sequences = True))
model_clstm.add(MaxPooling1D(pool_size = (5), padding = 'same'))
model_clstm.add(Dropout(rate = 0.5))

model_clstm.add(LSTM(16, return_sequences = True))
model_clstm.add(MaxPooling1D(pool_size = (5), padding = 'same'))
model_clstm.add(Dropout(rate = 0.5))

model_clstm.add(LSTM(1, return_sequences = True))
model_clstm.add(MaxPooling1D(pool_size = (5), padding = 'same'))
model_clstm.add(Dropout(rate = 0.5))

model_clstm.add(Dense(64, activation = 'relu'))
model_clstm.add(Flatten())
model_clstm.add(Dense(1, activation = 'softmax'))

model_clstm.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

model_clstm.summary()

In [None]:
batch_size = 32

model_clstm.fit(x_scaled, y,
            batch_size=batch_size,
            epochs=10,
            shuffle=True)

# Bi-LSTM

In [None]:
from keras import Sequential
from keras.layers import LSTM, Embedding, Bidirectional


model_bilstm  =  Sequential()

model_bilstm.add(Bidirectional(LSTM(32, input_shape = x_scaled.shape[1:], return_sequences=True)))
model_bilstm.add(MaxPooling1D(pool_size = (5), padding = 'same'))
model_bilstm.add(Dropout(rate = 0.5))

model_bilstm.add(Bidirectional(LSTM(16, return_sequences=True)))
model_bilstm.add(MaxPooling1D(pool_size = (5), padding = 'same'))
model_bilstm.add(Dropout(rate = 0.5))

model_bilstm.add(Bidirectional(LSTM(1, return_sequences=True)))
model_bilstm.add(MaxPooling1D(pool_size = (5), padding = 'same'))
model_bilstm.add(Dropout(rate = 0.5))

model_bilstm.add(Dense(64, activation = 'relu'))
model_bistm.add(Flatten())
model_bilstm.add(Dense(1, activation = 'softmax'))

model_bilstm.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

model_bilstm.summary()

In [None]:
batch_size = 32

model_bilstm.fit(x_scaled, y,
            batch_size=batch_size,
            epochs=10,
            shuffle=True)