In [1]:
import pandas as pd
import numpy as np
import torch
import spacy as sy
import torch.nn as nn
import tqdm
from collections import Counter

from gensim.models import Word2Vec
from gensim.models.fasttext import FastText

from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import *

import xgboost as xgb

seed = 2000
np.random.seed(seed)

import tensorflow as tf
import keras.backend as K
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import LSTM, MaxPooling1D, Dropout, Flatten, Dense, Bidirectional
from tensorflow.keras.preprocessing import sequence

import warnings
warnings.filterwarnings("ignore")

nlp_en = sy.load('en_core_web_sm')
all_stopwords = nlp_en.Defaults.stop_words

num_i = 4

2023-07-16 01:56:49.122223: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
train_df = pd.read_csv('/Users/nitanshjain/Documents/Projects/CASE/subtask_1/subtask_1_data/train_subtask1_test_preprocessed_{}.csv'.format(num_i))
print(train_df.shape)
print(train_df.head())

(3415, 3)
               index                                               text  label
0     train_01_0_892  the state alleged they hacked sabata petros ch...      1
1    train_01_1_2714  chale was allegedly chased group about thirty ...      0
2   train_01_10_2619  the farmworkers strike resumed tuesday when th...      1
3  train_01_100_2680  demonstrators have filed for permit hold rally...      1
4  train_01_101_3090  footage the attack which included pregnant wom...      1


In [4]:
test_df = pd.read_csv('/Users/nitanshjain/Documents/Projects/CASE/subtask_1/subtask_1_data/test_subtask1_test_preprocessed_{}.csv'.format(num_i))
test_df.head()


Unnamed: 0,index,text
0,test_01_A_0_271,more than twenty associate degree students mar...
1,test_01_A_1_215,more than ten people from the rights associati...
2,test_01_A_10_198,the organisation has been spearheading agitati...
3,test_01_A_100_277,not believe politicians that they will achieve...
4,test_01_A_101_152,for that matter haragopal himself took proacti...


In [5]:
def generate_pos_tags(text, min_threshold):
    doc = nlp_en(text)
    pos_tags = [(i.tag_) for i in doc]
    pos_tags = Counter(pos_tags)
    pos_tags = {x: count for x, count in pos_tags.items() if count > min_threshold}
    return pos_tags

def generate_tokens_prob_freq(tokens):
    dct={}
    for i in tokens:
        dct[i]=0
    for i in tokens:
        dct[i]+=1
    prob_freq = {key:float(value)/sum(dct.values()) for (key,value) in dct.items()}
    
    return prob_freq

train_df['pos_tags'] = train_df['text'].apply(lambda x: generate_pos_tags(x, 2))
test_df['pos_tags'] = train_df['text'].apply(lambda x : generate_pos_tags(x, 2))
train_df['pos_tags_prob'] = train_df['pos_tags'].apply(lambda x: generate_tokens_prob_freq(x))
test_df['pos_tags_prob'] = test_df['pos_tags'].apply(lambda x: generate_tokens_prob_freq(x))
train_df.head()

Unnamed: 0,index,text,label,pos_tags,pos_tags_prob
0,train_01_0_892,the state alleged they hacked sabata petros ch...,1,"{'NN': 6, 'NNP': 7, 'CD': 7}","{'NN': 0.3333333333333333, 'NNP': 0.3333333333..."
1,train_01_1_2714,chale was allegedly chased group about thirty ...,0,"{'NN': 4, 'NNS': 3}","{'NN': 0.5, 'NNS': 0.5}"
2,train_01_10_2619,the farmworkers strike resumed tuesday when th...,1,{},{}
3,train_01_100_2680,demonstrators have filed for permit hold rally...,1,"{'NNS': 8, 'NN': 6, 'NNP': 4, 'VBD': 3, 'CC': 3}","{'NNS': 0.2, 'NN': 0.2, 'NNP': 0.2, 'VBD': 0.2..."
4,train_01_101_3090,footage the attack which included pregnant wom...,1,"{'NN': 8, 'VBD': 3, 'JJ': 3, 'VBG': 7, 'NNS': ...","{'NN': 0.14285714285714285, 'VBD': 0.142857142..."


In [6]:
train_df_pos_prob = pd.json_normalize(train_df['pos_tags_prob'])
test_df_pos_prob = pd.json_normalize(test_df['pos_tags_prob'])
train_df_pos_prob.replace(np.nan, 0, inplace=True)
test_df_pos_prob.replace(np.nan, 0, inplace=True)
print(train_df_pos_prob.head())
print(test_df_pos_prob.head())

         NN       NNP        CD       NNS       VBD        CC        JJ  \
0  0.333333  0.333333  0.333333  0.000000  0.000000  0.000000  0.000000   
1  0.500000  0.000000  0.000000  0.500000  0.000000  0.000000  0.000000   
2  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
3  0.200000  0.200000  0.000000  0.200000  0.200000  0.200000  0.000000   
4  0.142857  0.000000  0.000000  0.142857  0.142857  0.142857  0.142857   

        VBG        IN   DT  ...   RB  VBP  VBN  VBZ  PRP$  HYPH  PRP   MD  \
0  0.000000  0.000000  0.0  ...  0.0  0.0  0.0  0.0   0.0   0.0  0.0  0.0   
1  0.000000  0.000000  0.0  ...  0.0  0.0  0.0  0.0   0.0   0.0  0.0  0.0   
2  0.000000  0.000000  0.0  ...  0.0  0.0  0.0  0.0   0.0   0.0  0.0  0.0   
3  0.000000  0.000000  0.0  ...  0.0  0.0  0.0  0.0   0.0   0.0  0.0  0.0   
4  0.142857  0.142857  0.0  ...  0.0  0.0  0.0  0.0   0.0   0.0  0.0  0.0   

   JJR  WRB  
0  0.0  0.0  
1  0.0  0.0  
2  0.0  0.0  
3  0.0  0.0  
4  0.0  0.0  

[

In [7]:
print(train_df_pos_prob.shape)
print(test_df_pos_prob.shape)
print(train_df_pos_prob.columns)
print(test_df_pos_prob.columns)

columns = list(set(train_df_pos_prob.columns) - set(test_df_pos_prob.columns))
print(columns)

for col_name in columns:
    if col_name not in train_df_pos_prob.columns:
        train_df_pos_prob[col_name]=0

    if col_name not in test_df_pos_prob.columns:
        test_df_pos_prob[col_name]=0
        
print(train_df_pos_prob.shape)
print(test_df_pos_prob.shape)

(3415, 22)
(352, 18)
Index(['NN', 'NNP', 'CD', 'NNS', 'VBD', 'CC', 'JJ', 'VBG', 'IN', 'DT', 'WDT',
       'VB', 'RB', 'VBP', 'VBN', 'VBZ', 'PRP$', 'HYPH', 'PRP', 'MD', 'JJR',
       'WRB'],
      dtype='object')
Index(['NN', 'NNP', 'CD', 'NNS', 'VBD', 'CC', 'JJ', 'VBG', 'IN', 'DT', 'WDT',
       'VB', 'RB', 'VBP', 'VBN', 'VBZ', 'PRP$', 'HYPH'],
      dtype='object')
['JJR', 'PRP', 'MD', 'WRB']
(3415, 22)
(352, 22)


In [8]:
scaler = MinMaxScaler()
x = train_df_pos_prob.values
x = scaler.fit_transform(x)
y = train_df['label'].values

x_test = test_df_pos_prob.values
# y_test = labels

# Calculating Classweights
class_weights = compute_class_weight(
    class_weight = "balanced",
    classes = np.unique(y),
    y = y
)
class_weights = dict(zip(np.unique(y), class_weights))

count_0 = np.unique(y, return_counts=True)[1][0]
count_1 = np.unique(y, return_counts=True)[1][1]
estimate = count_0/count_1

cv = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)

print(class_weights, estimate)

{0: 1.063200498132005, 1: 0.9438916528468767} 0.8877833056937534


In [11]:
xgb_model = xgb.XGBClassifier(scale_pos_weight=estimate)

parameters = {
            'objective':['binary:logistic'],
            'learning_rate': [0.1, 0.01, 0.001, 0.0001], 
            'max_depth': [5, 6, 7, 8],
            'n_estimators': [1000], #number of trees, change it to 1000 for better results
            'seed': [1337]
        }

clf = GridSearchCV(xgb_model, parameters, n_jobs=5, 
                   cv=cv, 
                   verbose=0, refit=True)

clf.fit(x, y)
print(clf.best_params_, clf.best_score_)

y_pred = clf.predict(x_test)

# index = pd.Series(list(range(0,340)))
index = pd.Series(test_df['index'])
prediction = pd.Series(y_pred.reshape(352))
submission_df = pd.concat([index,prediction], axis=1)
submission_df.rename(columns = {0:'prediction'}, inplace = True)
# submission_json = submission_df.to_json()

converted_data = []
for row in submission_df.itertuples(index=False):
    entry = {
        'index': row.index,
        'prediction': row.prediction
    }
    converted_data.append(entry)
converted_data

# # Syntax of json.dumps() function
import json
with open(f'/Users/nitanshjain/Documents/Projects/CASE/subtask_1/subtask_1_results/submission_pos_xgb_1.json', 'w') as fp:
    fp.write('\n'.join(json.dumps(i) for i in converted_data))

In [12]:
mnb = MultinomialNB()

parameters = {
            'fit_prior': [True, False],
            'class_prior': [None, [0.5, 0.5], [0.6, 0.4], [0.4, 0.6]]
        }

mnb_gsc = GridSearchCV(mnb, parameters, n_jobs=5, 
                   cv=cv, 
                   verbose=0, refit=True)

mnb_gsc.fit(x, y)
print(mnb_gsc.best_params_, mnb_gsc.best_score_)

y_pred = mnb_gsc.predict(x_test)

# index = pd.Series(list(range(0,340)))
index = pd.Series(test_df['index'])
prediction = pd.Series(y_pred.reshape(352))
submission_df = pd.concat([index,prediction], axis=1)
submission_df.rename(columns = {0:'prediction'}, inplace = True)
# submission_json = submission_df.to_json()

converted_data = []
for row in submission_df.itertuples(index=False):
    entry = {
        'index': row.index,
        'prediction': row.prediction
    }
    converted_data.append(entry)
converted_data

# # Syntax of json.dumps() function
import json
with open(f'/Users/nitanshjain/Documents/Projects/CASE/subtask_1/subtask_1_results/submission_pos_mnb_1.json', 'w') as fp:
    fp.write('\n'.join(json.dumps(i) for i in converted_data))

{'class_prior': None, 'fit_prior': False} 0.5827234652746811
