In [78]:
import nltk, re, torch
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
import goslate
import glob
import pandas as pd
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np
from sklearn.naive_bayes import MultinomialNB

In [79]:
# Function to check if a value is a number
def is_number(s):
    """Return True when ``s`` can be converted by ``float()`` to a non-NaN value.

    Parameters
    ----------
    s : object
        Value to test (for example a CSV cell: str, int, float or None).

    Returns
    -------
    bool
        False when ``float(s)`` raises ``ValueError`` (unparsable text) or
        ``TypeError`` (None, lists, ...), or when the parsed value is NaN;
        True otherwise.
    """
    import math  # local import: the notebook's import cell does not provide math

    try:
        value = float(s)
    except (TypeError, ValueError):
        # TypeError covers None and other objects float() cannot accept at all.
        return False
    # float() happily parses NaN (and pandas uses NaN for empty cells), but a
    # missing value is not a usable number.
    return not math.isnan(value)

# Merge the raw per-split CSV shards (./<PART>_*.csv) into one filtered CSV
# per split (<PART>.csv).
for PART in ['dev', 'test', 'train']:
    # glob returns matches in arbitrary, OS-dependent order; sorting keeps the
    # row order of the merged file reproducible between runs and machines.
    # (recursive=True was dropped: it only has an effect with a '**' pattern.)
    files = sorted(glob.glob(f'./{PART}_*.csv'))
    if not files:
        # Fail here with a clear message instead of writing an empty CSV that
        # only breaks later when it is read back.
        raise FileNotFoundError(f"No input files matching './{PART}_*.csv' were found")

    # One filtered DataFrame per input file
    filtered_frames = []

    for file in files:
        df = pd.read_csv(file)

        # Keep the columns at positions 1..3; the column at position 0 is skipped.
        subset = df.iloc[:, 1:4]

        # Keep only the rows whose last kept column holds a number. Filtering
        # the whole column at once avoids building one Series per row.
        keep = subset.iloc[:, -1].map(is_number).astype(bool)
        filtered_frames.append(subset[keep])

    # Stack the per-file results into a single DataFrame
    new_df = pd.concat(filtered_frames, ignore_index=True)

    # Determine the output filename
    output_filename = f'{PART}.csv'

    # Store the new DataFrame to a separate filtered CSV file
    new_df.to_csv(output_filename, index=False)

    # Print the shape of the new DataFrame
    print(f"Filtered data for {PART} stored to {output_filename} with shape:", new_df.shape)


Filtered data for dev stored to dev.csv with shape: (2000, 3)
Filtered data for test stored to test.csv with shape: (2000, 3)
Filtered data for train stored to train.csv with shape: (49184, 3)


In [80]:
# Load the filtered training split.
# Rows with any missing value (sentence or label) are dropped *before* the
# integer cast. Filling missing labels with 0 first would silently turn
# unlabeled rows into class-0 training examples.
train = pd.read_csv("train.csv").dropna().astype({'label': int})
train.head()


Unnamed: 0,sentence1,sentence2,label
0,"In Paris , in October 1560 , he secretly met t...","In October 1560 , he secretly met with the Eng...",0
1,The NBA season of 1975 -- 76 was the 30th seas...,The 1975 -- 76 season of the National Basketba...,1
2,"There are also specific discussions , public p...","There are also public discussions , profile sp...",0
3,When comparable rates of flow can be maintaine...,The results are high when comparable flow rate...,1
4,It is the seat of Zerendi District in Akmola R...,It is the seat of the district of Zerendi in A...,1


In [81]:
# Load the filtered test split (test.csv) and preview its first five rows.
test = pd.read_csv("test.csv")
test.head(n=5)

Unnamed: 0,sentence1,sentence2,label
0,The exception was between late 2005 and 2009 w...,"The exception was between late 2005 and 2009 ,...",1
1,The Tabaci River is a tributary of the River L...,The Leurda River is a tributary of the River T...,0
2,He played with the A-level Kane County Cougars...,He played in 1993 with the A - Level Portland ...,0
3,"Winarsky is a member of the IEEE , Phi Beta Ka...","Winarsky is a member of ACM , the IEEE , the P...",1
4,In 1938 he became the government anthropologis...,In 1938 he became the Government Anthropologis...,0


In [82]:
# Load the filtered dev split (dev.csv) and preview its first five rows.
# NOTE(review): `dev` does not appear to be used by the later cells visible in
# this notebook (validation data is carved out of `train`) — confirm intent.
dev = pd.read_csv("dev.csv")
dev.head(n=5)

Unnamed: 0,sentence1,sentence2,label
0,From the merger of the Four Rivers Council and...,Shawnee Trails Council was formed from the mer...,1
1,Kathy and her husband Pete Beale ( Peter Dean ...,Kathy and her husband Peter Dean ( Pete Beale ...,1
2,Timora diarhoda is a species of moth of the No...,Diarhoda is a kind of moth of the Noctuidae fa...,1
3,Joe R. Campa Jr. is a former sailor of the Uni...,Joe R. Campa Jr. is a former U.S. Navy Matrose...,1
4,"Cook Pond , also known as the South Watuppa Po...","Cook Pond , also formerly known as Laurel Lake...",0


In [83]:
# Show the dtype of each column of the training frame.
train.dtypes

sentence1    object
sentence2    object
label         int64
dtype: object

In [84]:
# Fetch the NLTK resources needed by the text-cleaning step.
nltk.download('punkt')
# Recent NLTK releases (3.9+) load the 'punkt_tab' resource in word_tokenize
# instead of the pickled 'punkt' models; without it tokenization raises
# LookupError on a fresh environment. Downloading both keeps old and new
# NLTK versions working.
nltk.download('punkt_tab')
nltk.download('stopwords')

# English stop-word list and stemmer shared by process_text.
stop = stopwords.words('english')
snow = SnowballStemmer('english')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/niravjivani/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/niravjivani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [85]:
def process_text(texts):
    """Clean, tokenize and stem a collection of sentences.

    Each entry is lower-cased, stripped, has runs of whitespace collapsed to a
    single space and ``<...>`` markup removed. It is then tokenized with
    ``word_tokenize``; tokens that are numeric, have two characters or fewer,
    or appear in the module-level ``stop`` list are discarded, and the
    remaining tokens are stemmed with the module-level ``snow`` stemmer.

    Parameters
    ----------
    texts : iterable
        Sentences to clean. Entries that are not ``str`` (for example NaN for
        a missing value) are treated as the empty string.

    Returns
    -------
    list of str
        One space-joined string of stemmed tokens per input entry, in the
        same order as ``texts``.
    """
    # Compile the patterns once per call instead of once per sentence. Raw
    # strings avoid the invalid '\s' escape warning raised by newer Pythons.
    whitespace_re = re.compile(r'\s+')
    html_tag_re = re.compile(r'<.*?>')
    # Set membership is O(1); the stop-word list would be scanned per token.
    stop_words = set(stop)

    final_text_list = []
    for sent in texts:
        # Treat missing / non-string values as empty text
        if not isinstance(sent, str):
            sent = ''

        sent = sent.lower().strip()          # Lowercase, trim both ends
        sent = whitespace_re.sub(' ', sent)  # Collapse spaces, tabs, newlines
        sent = html_tag_re.sub('', sent)     # Remove HTML tags/markup

        # Keep tokens that are not numeric, longer than 2 characters and not
        # stop words; stem what is kept.
        filtered_sentence = [
            snow.stem(w)
            for w in word_tokenize(sent)
            if not w.isnumeric() and len(w) > 2 and w not in stop_words
        ]
        final_text_list.append(" ".join(filtered_sentence))
    return final_text_list

In [86]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training rows as a validation set. The rows are shuffled
# with a fixed seed so the split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    train[['sentence1', 'sentence2']],  # features: the two sentence columns
    train['label'],                     # target
    shuffle=True,
    random_state=324,
    test_size=0.10,
)

Processing the sentence1 fields
Processing the sentence2 fields


In [88]:
# Name of the target column.
model_target = 'label'

# Free-text input columns. model_features refers to the very same list object,
# so any in-place change to one is visible through the other.
text_features = ['sentence1', 'sentence2']
model_features = text_features

In [89]:
### TEXT CLEANING ###
#####################
# Work on copies: assigning cleaned columns directly would overwrite the raw
# `test` frame (X_test used to be an alias of it) and can trigger pandas'
# SettingWithCopyWarning on the frames returned by train_test_split.
# NOTE: X_train / X_val are replaced by arrays at the end of this cell, so the
# split cell has to be re-run before this cell can be executed again.
X_train = X_train.copy()
X_val = X_val.copy()
X_test = test.copy()
for split in (X_train, X_val, X_test):
    for column in text_features:
        split[column] = process_text(split[column].tolist())

### COLUMN_TRANSFORMER ###
##########################
# One binary bag-of-words vectorizer per sentence column, each limited to the
# 100 most frequent terms of that column.
text_processor_0 = Pipeline([
    ('text_vect_0', CountVectorizer(binary=True, max_features=100))
])
text_processor_1 = Pipeline([
    ('text_vect_1', CountVectorizer(binary=True, max_features=100))
])
data_preprocessor = ColumnTransformer([
    ('text_pre_0', text_processor_0, text_features[0]),
    ('text_pre_1', text_processor_1, text_features[1])
])

### DATA PREPROCESSING ###
##########################
print('Datasets shapes before processing: ', X_train.shape, X_val.shape, X_test.shape)
# Fit the vocabularies on the training split only, then reuse them for the
# validation and test splits.
X_train = data_preprocessor.fit_transform(X_train).toarray()
X_val = data_preprocessor.transform(X_val).toarray()
X_test = data_preprocessor.transform(X_test).toarray()
print('Datasets shapes after processing: ', X_train.shape, X_val.shape, X_test.shape)

Datasets shapes before processing:  (44265, 2) (4919, 2) (2000, 3)
Datasets shapes after processing:  (44265, 200) (4919, 200) (2000, 200)


In [71]:
# Print the first row of X_train as a quick look at one encoded example.
print(X_train[0])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0]


In [58]:
# Positions of the training examples whose label is NaN
missing_label = np.isnan(np.asarray(y_train))
nan_indices = np.flatnonzero(missing_label)

# Drop those positions from the features and from the labels so both stay
# aligned (np.delete returns plain NumPy arrays).
X_train = np.delete(X_train, nan_indices, axis=0)
y_train = np.delete(y_train, nan_indices)

# Fit a logistic-regression classifier with default settings
model = LogisticRegression().fit(X_train, y_train)

# Predict both held-out sets, then report accuracy on the validation split
val_predictions = model.predict(X_val)
test_predictions = model.predict(X_test)
print("Validation Accuracy:", accuracy_score(y_val, val_predictions))

Validation Accuracy: 0.5556007318560683


In [57]:
from sklearn.naive_bayes import MultinomialNB

# Create and fit a Multinomial Naive Bayes classifier on the training features
nlp_model = MultinomialNB().fit(X_train, y_train)

# Predict the validation and test sets
val_predictions = nlp_model.predict(X_val)
test_predictions = nlp_model.predict(X_test)

# Report accuracy on the validation split
print("Validation Accuracy:", accuracy_score(y_val, val_predictions))

Validation Accuracy: 0.5436064240699329
