## Training on the train set

In [1]:
import pandas as pd 
import numpy as np
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

import re

In [2]:
# Load the tab-separated training pairs. Column names are supplied explicitly,
# and quoting=3 (csv.QUOTE_NONE) keeps quote characters inside the sentences
# as literal text instead of treating them as field delimiters.
train_sentences = pd.read_csv(
    'train_with_label.txt',
    sep = '\t',
    names = ['instance_id', 'sentence1', 'sentence2', 'label'],
    quoting = 3,
)

# Frame used by the following cells.
train_df = pd.DataFrame(train_sentences)

In [3]:
def get_data(df):
    """Return the two sentence columns and the label column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``sentence1``, ``sentence2`` and ``label`` columns.

    Returns
    -------
    tuple
        ``(sentence1, sentence2, label)``, each selected from ``df`` by
        column name.
    """
    wanted = ('sentence1', 'sentence2', 'label')
    return tuple(df[column] for column in wanted)

# Split the training frame into its two sentence columns and its label column.
train_sent1, train_sent2, train_labels = get_data(train_df)

def get_data_test(df):
    """Return the two sentence columns and the id column of an unlabeled frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``sentence1``, ``sentence2`` and ``instance_id``
        columns.

    Returns
    -------
    tuple
        ``(sentence1, sentence2, instance_id)``, each selected from ``df`` by
        column name.
    """
    wanted = ('sentence1', 'sentence2', 'instance_id')
    return tuple(df[column] for column in wanted)

In [4]:
# Tokenise both sentences of every training pair.
# A token is either a run of word characters, apostrophes and parentheses,
# or a single period; every token is lower-cased after extraction.
rx =  r"['()\w]+|\."

tokall1 = [
    [token.lower() for token in re.findall(rx, train_sent1[row])]
    for row in range(len(train_df))
]
tokall2 = [
    [token.lower() for token in re.findall(rx, train_sent2[row])]
    for row in range(len(train_df))
]

In [5]:
# Flatten the per-sentence token lists into one flat token list per column.
toktotal1 = []
for sentence_tokens in tokall1:
    toktotal1.extend(sentence_tokens)

toktotal2 = []
for sentence_tokens in tokall2:
    toktotal2.extend(sentence_tokens)

# All tokens of the first column followed by all tokens of the second column.
tokevery = toktotal1 + toktotal2

In [6]:
# Bag-of-words vectoriser configured to drop scikit-learn's built-in English stop words.
vectorizer = CountVectorizer(stop_words = 'english')

In [7]:
# Learn the vocabulary from the flat token list; each token is passed to the
# vectoriser as its own "document".
# NOTE(review): `model` receives the count matrix returned by fit_transform (not
# an estimator) and is not read by any cell shown here.
model = vectorizer.fit_transform(tokevery)

In [8]:
# Turn every tokenised training sentence into one fixed-length vector: each
# token is transformed as its own document and the resulting rows are averaged
# over the tokens of the sentence.
#
# The vectors are collected in NEW lists. Binding `vecall1 = tokall1` and then
# assigning into it would overwrite the token lists in place (both names would
# refer to the same list object), destroying the tokens and making this cell
# fail or produce garbage when re-run.
vecall1 = [
    vectorizer.transform(tokens).toarray().mean(axis = 0)
    for tokens in tokall1
]
vecall2 = [
    vectorizer.transform(tokens).toarray().mean(axis = 0)
    for tokens in tokall2
]

In [9]:
# Materialise the training labels as a plain Python list before fitting.
train_labels = list(train_labels)

In [10]:
# Feature matrix: for each pair, the sentence1 vector followed by the sentence2
# vector on the same row.
# NOTE: despite its name, `diffs` is a concatenation, not a difference.
diffs = np.concatenate(
    (np.array(vecall1), np.array(vecall2)),
    axis = 1,
)

# L2-penalised logistic regression with class weights balanced against the
# label frequencies.
log_model = LogisticRegression(
    penalty = 'l2',
    solver = 'newton-cg',
    class_weight = 'balanced',
    max_iter = 500,
)

In [11]:
# Fit the classifier on the concatenated sentence features and the training labels.
log_model.fit(diffs, train_labels)

LogisticRegression(class_weight='balanced', max_iter=500, solver='newton-cg')

In [12]:
# Accuracy on the same rows the model was fitted on (training accuracy, not a
# held-out estimate).
log_model.score(diffs, train_labels)

0.7956831003188619

In [13]:
# Predict labels for the training features and report accuracy against the
# training labels; this reproduces the value returned by `log_model.score` on
# the same inputs.
preds = log_model.predict(diffs)

print(accuracy_score(train_labels, preds))

0.7956831003188619


## Evaluating the train-set model on the dev set

In [14]:
# Load the tab-separated dev pairs with the same column layout as the training
# file; quoting=3 (csv.QUOTE_NONE) keeps quote characters as literal text.
dev_sentences = pd.read_csv(
    'dev_with_label.txt',
    sep = '\t',
    names = ['instance_id', 'sentence1', 'sentence2', 'label'],
    quoting = 3,
)

# Frame used by the following cells.
dev_df = pd.DataFrame(dev_sentences)

# Sentence columns and labels of the dev split.
dev_sent1, dev_sent2, dev_labels = get_data(dev_df)

In [15]:
# Tokenise both sentences of every dev pair.
# A token is either a run of word characters, apostrophes and parentheses,
# or a single period; every token is lower-cased after extraction.
rx =  r"['()\w]+|\."

dev_tokall1 = [
    [token.lower() for token in re.findall(rx, dev_sent1[row])]
    for row in range(len(dev_df))
]
dev_tokall2 = [
    [token.lower() for token in re.findall(rx, dev_sent2[row])]
    for row in range(len(dev_df))
]

In [16]:
# Turn every tokenised dev sentence into one fixed-length vector using the
# already-fitted `vectorizer`: each token is transformed as its own document
# and the resulting rows are averaged over the tokens of the sentence.
#
# The vectors are collected in NEW lists. Binding `dev_vecall1 = dev_tokall1`
# and assigning into it would overwrite the token lists in place (both names
# would refer to the same list object), so the cell could not be re-run.
dev_vecall1 = [
    vectorizer.transform(tokens).toarray().mean(axis = 0)
    for tokens in dev_tokall1
]
dev_vecall2 = [
    vectorizer.transform(tokens).toarray().mean(axis = 0)
    for tokens in dev_tokall2
]

In [17]:
# Coerce the dev labels to ints. A label that cannot be converted (missing,
# NaN, non-numeric text) falls back to 0, as before; only conversion errors are
# caught, so interrupts and unrelated failures are no longer swallowed.
dev_labels = list(dev_labels)

for i in range(len(dev_labels)):
    try:
        dev_labels[i] = int(dev_labels[i])
    except (TypeError, ValueError, OverflowError):
        dev_labels[i] = 0

# Feature matrix: sentence1 vector followed by sentence2 vector on each row,
# matching the layout the model was trained on.
dev_diffs = np.concatenate((np.array(dev_vecall1), np.array(dev_vecall2)),
                           axis = 1)

dev_preds = log_model.predict(dev_diffs)

# Accuracy of the train-set model on the dev split.
print(accuracy_score(dev_labels, dev_preds))

0.606353591160221


## Predicting the test set with a model fitted on the dev set

In [18]:
# Reload the dev split; the model in this section is fitted on it.
dev_sentences = pd.read_csv('dev_with_label.txt', sep = '\t',
                            names = ['instance_id', 'sentence1', 'sentence2', 'label'],
                            quoting = 3)

# The frame must be built from `dev_sentences`. Building it from
# `train_sentences` leaves the dev file just read unused and makes every
# `dev_*` variable below hold training rows.
dev_df = pd.DataFrame(dev_sentences)

# Sentence columns and labels of the dev split.
dev_sent1, dev_sent2, dev_labels = get_data(dev_df)

In [19]:
# Tokenise both sentences of every row in `dev_df`.
# A token is either a run of word characters, apostrophes and parentheses,
# or a single period; every token is lower-cased after extraction.
rx =  r"['()\w]+|\."

tokall1 = [
    [token.lower() for token in re.findall(rx, dev_sent1[row])]
    for row in range(len(dev_df))
]
tokall2 = [
    [token.lower() for token in re.findall(rx, dev_sent2[row])]
    for row in range(len(dev_df))
]

In [20]:
# Flatten the per-sentence token lists into one flat token list per column.
toktotal1 = []
for sentence_tokens in tokall1:
    toktotal1.extend(sentence_tokens)

toktotal2 = []
for sentence_tokens in tokall2:
    toktotal2.extend(sentence_tokens)

# All tokens of the first column followed by all tokens of the second column.
tokevery = toktotal1 + toktotal2

In [21]:
# Fresh bag-of-words vectoriser for the second model (English stop words removed).
# NOTE(review): this rebinds `vectorizer`, replacing the instance fitted for the
# first model; cells that transform data for `log_model` cannot be re-run after
# this point without refitting that earlier vectoriser.
vectorizer = CountVectorizer(stop_words = 'english')

In [22]:
# Learn the vocabulary from the flat token list; each token is passed to the
# vectoriser as its own "document".
# NOTE(review): `model` receives the count matrix returned by fit_transform (not
# an estimator) and is not read by any cell shown here.
model = vectorizer.fit_transform(tokevery)

In [23]:
# Turn every tokenised sentence into one fixed-length vector: each token is
# transformed as its own document and the resulting rows are averaged over the
# tokens of the sentence.
#
# The vectors are collected in NEW lists. Binding `vecall1 = tokall1` and then
# assigning into it would overwrite the token lists in place (both names would
# refer to the same list object), destroying the tokens and making this cell
# fail or produce garbage when re-run.
vecall1 = [
    vectorizer.transform(tokens).toarray().mean(axis = 0)
    for tokens in tokall1
]
vecall2 = [
    vectorizer.transform(tokens).toarray().mean(axis = 0)
    for tokens in tokall2
]

In [24]:
# Materialise the labels as a plain Python list before fitting.
# NOTE(review): unlike the dev-evaluation cell, no int coercion is applied
# here — confirm the label column is clean.
dev_labels = list(dev_labels)

In [25]:
# Feature matrix: for each pair, the sentence1 vector followed by the sentence2
# vector on the same row.
# NOTE: despite its name, `diffs` is a concatenation, not a difference.
diffs = np.concatenate(
    (np.array(vecall1), np.array(vecall2)),
    axis = 1,
)

# Second classifier, configured identically to the first: L2-penalised logistic
# regression with class weights balanced against the label frequencies.
log_model2 = LogisticRegression(
    penalty = 'l2',
    solver = 'newton-cg',
    class_weight = 'balanced',
    max_iter = 500,
)

In [26]:
# Fit the second classifier on the concatenated sentence features and `dev_labels`.
log_model2.fit(diffs, dev_labels)

LogisticRegression(class_weight='balanced', max_iter=500, solver='newton-cg')

In [27]:
# Load the unlabeled, tab-separated test pairs (three columns, no label);
# quoting=3 (csv.QUOTE_NONE) keeps quote characters as literal text.
test_sentences = pd.read_csv(
    'test_without_label.txt',
    sep = '\t',
    names = ['instance_id', 'sentence1', 'sentence2'],
    quoting = 3,
)

# Frame used by the following cells.
test_df = pd.DataFrame(test_sentences)

# Sentence columns and instance ids of the test split.
test_sent1, test_sent2, test_id = get_data_test(test_df)

In [28]:
# Tokenise both sentences of every test pair.
# A token is either a run of word characters, apostrophes and parentheses,
# or a single period; every token is lower-cased after extraction.
rx =  r"['()\w]+|\."

test_tokall1 = [
    [token.lower() for token in re.findall(rx, test_sent1[row])]
    for row in range(len(test_df))
]
test_tokall2 = [
    [token.lower() for token in re.findall(rx, test_sent2[row])]
    for row in range(len(test_df))
]

In [29]:
# Turn every tokenised test sentence into one fixed-length vector using the
# currently fitted `vectorizer`: each token is transformed as its own document
# and the resulting rows are averaged over the tokens of the sentence.
#
# The vectors are collected in NEW lists. Binding `test_vecall1 = test_tokall1`
# and assigning into it would overwrite the token lists in place (both names
# would refer to the same list object), so the cell could not be re-run.
test_vecall1 = [
    vectorizer.transform(tokens).toarray().mean(axis = 0)
    for tokens in test_tokall1
]
test_vecall2 = [
    vectorizer.transform(tokens).toarray().mean(axis = 0)
    for tokens in test_tokall2
]

# Plain list of instance ids, indexed by position when writing predictions.
test_id = list(test_id)

In [30]:
def print_predictions(predictions, test_id):
    """Write the predictions to ``MaryClay_test_result.txt`` as a TSV file.

    The file starts with the header ``instance_id<TAB>predicted_label`` and
    then has one ``<id><TAB><label>`` line per instance, in the column order
    the header announces. The file is overwritten on every call, so re-running
    the notebook does not pile up duplicate headers and rows.

    Parameters
    ----------
    predictions : sequence
        Predicted labels, one per test instance.
    test_id : sequence
        Instance ids, aligned position-by-position with ``predictions``.

    Raises
    ------
    ValueError
        If ``predictions`` and ``test_id`` differ in length.
    """
    if len(predictions) != len(test_id):
        raise ValueError(
            "predictions and test_id must have the same length "
            "(got %d and %d)" % (len(predictions), len(test_id))
        )

    # "w" rather than "a": each run must produce exactly one header and one
    # row per instance.
    with open("MaryClay_test_result.txt", "w") as f:
        print("instance_id\tpredicted_label", file=f)
        for instance_id, label in zip(test_id, predictions):
            # sep="\t" gives a bare tab between the fields; passing "\t" as a
            # positional argument would surround it with spaces.
            print(instance_id, label, sep="\t", file=f)

In [31]:
# Feature matrix for the test pairs: sentence1 vector followed by sentence2
# vector on each row, the same layout `log_model2` was fitted on.
test_diffs = np.concatenate((np.array(test_vecall1), np.array(test_vecall2)),
                           axis = 1)

# Predict with the second model and write the results file.
preds = log_model2.predict(test_diffs)
print_predictions(preds, test_id)