In [None]:
import csv

from tqdm.notebook import tqdm
from typing import Dict, List, Tuple
from random import choice
from urllib.request import urlopen


In [None]:
def read_pp_examples(file_url: str) -> List[Dict]:
  """Reads the PP samples from a remote url and loads them into a list of dicts.

  Every non-blank line must hold exactly five whitespace-separated fields.
  The first four are stored together as the sample's keywords and the fifth
  is stored as its answer.

  Args:
      file_url (str): a url to load the dataset from

  Returns:
      List[Dict]: one dictionary per sample with two keys: 'answer' (the
      fifth field) and 'keywords' (a tuple with the first four fields)

  Raises:
      ValueError: if a non-blank line does not contain exactly five fields
  """
  pp_examples = []

  # The context manager closes the response even if parsing fails midway.
  with urlopen(file_url) as response:
    for line_number, raw_line in enumerate(tqdm(response), start=1):
      fields = raw_line.decode("utf-8").split()
      if not fields:
        # Tolerate blank lines, e.g. an empty line at the end of the file.
        continue
      if len(fields) != 5:
        # Explicit check instead of `assert`, which is removed under `python -O`.
        raise ValueError(
            f"Line {line_number}: expected 5 fields, got {len(fields)}: {fields!r}"
        )
      v, n1, p, n2, answer = fields
      pp_examples.append({'answer': answer, 'keywords': (v, n1, p, n2)})
  return pp_examples

In [None]:
# Raw GitHub URL of the PP samples text file that read_pp_examples downloads.
pp_samples_url = 'https://raw.githubusercontent.com/liadmagen/NLP-Course/master/dataset/pp_examples.txt'

In [None]:
# Download and parse the dataset into a list of {'answer', 'keywords'} dicts.
pp_examples = read_pp_examples(pp_samples_url)


0it [00:00, ?it/s]

In [None]:
# Sanity check: report how many samples were loaded.
print(f"There are {len(pp_examples)} samples in the dataset")

There are 25858 samples in the dataset


In [None]:
# Show nine randomly drawn samples to get a feel for the data.
for _draw in range(9):
  print(choice(pp_examples))

{'answer': 'N', 'keywords': ('opened', 'store', 'of', 'own')}
{'answer': 'V', 'keywords': ('posted', 'increase', 'despite', 'decline')}
{'answer': 'N', 'keywords': ('started', 'number', 'of', 'businesses')}
{'answer': 'N', 'keywords': ('offers', 'concessions', 'by', 'KKR')}
{'answer': 'V', 'keywords': ('were', 'sharecroppers', 'in', 'Arkansas')}
{'answer': 'V', 'keywords': ('increase', 'speed', 'of', 'work')}
{'answer': 'V', 'keywords': ('assimilate', 'status', 'as', 'project')}
{'answer': 'V', 'keywords': ('fell', 'yesterday', 'to', '$')}
{'answer': 'V', 'keywords': ('store', 'kids', 'during', 'day')}


In [None]:
# Draw one random sample and display its (v, n1, p, n2) keyword tuple.
random_example = choice(pp_examples)
random_example['keywords']

('opened', 'mind', 'to', 'politics')

In [None]:
# Display the answer label stored for that same sample.
random_example['answer']

'V'

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

import re

import nltk
from nltk.corpus import brown
nltk.download('brown')
from nltk import ngrams

!pip install gensim
from gensim.models import Word2Vec
import gensim
import warnings
warnings.filterwarnings(action='ignore')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!




In [None]:
def scores(y_test, y_pred):
  """Prints precision, recall and accuracy for a set of predictions.

  Precision and recall are computed with "V" as the positive label, and
  every metric is rounded to three decimals before it is printed.

  Args:
      y_test: the true labels
      y_pred: the predicted labels

  Returns:
      None: the metrics are only printed
  """
  rounded = {
      'precision': round(precision_score(y_test, y_pred, pos_label="V"), 3),
      'recall': round(recall_score(y_test, y_pred, pos_label="V"), 3),
      'accuracy': round(accuracy_score(y_test, y_pred), 3),
  }

  report = 'Precision: {} / Recall: {} / Accuracy: {}'.format(
      rounded['precision'], rounded['recall'], rounded['accuracy']
  )
  print(report)

In [None]:
data = pp_examples

# One space-joined keyword string per sample, plus its answer label.
X = [' '.join(sample['keywords']) for sample in data]
y = [sample['answer'] for sample in data]

SVC classifier without features

In [None]:
# Hold out 20% of the samples for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Count word 1- to 4-grams; the vocabulary is learned from the training split only.
vectorizer = CountVectorizer(ngram_range=(1, 4))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# fit() returns the estimator itself, so construction and training are chained.
classifier = SVC(kernel='linear').fit(X_train_vec, y_train)
y_pred = classifier.predict(X_test_vec)

scores(y_test, y_pred)

Precision: 0.811 / Recall: 0.841 / Accuracy: 0.838


Naive Bayes without features

In [None]:
# Hold out 20% of the samples for evaluation.
# NOTE(review): random_state=43 here differs from the 42 used by the SVC and
# decision-tree cells, so these scores come from a different split - confirm
# whether that is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=43,
)

# Count word 1- to 4-grams; the vocabulary is learned from the training split only.
vectorizer = CountVectorizer(ngram_range=(1, 4))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# fit() returns the estimator itself, so construction and training are chained.
classifier = MultinomialNB().fit(X_train_vec, y_train)
y_pred = classifier.predict(X_test_vec)

scores(y_test, y_pred)

Precision: 0.821 / Recall: 0.787 / Accuracy: 0.827


Naive Bayes with corpus features

In [None]:
# Keyword tuples are ordered (v, n1, p, n2).
keyword_tuples = [instance['keywords'] for instance in data]

# Single words.
v_unigram = [kw[0] for kw in keyword_tuples]
n1_unigram = [kw[1] for kw in keyword_tuples]
p_unigram = [kw[2] for kw in keyword_tuples]
n2_unigram = [kw[3] for kw in keyword_tuples]

# Word pairs built around the preposition.
vp_bigram = [' '.join((kw[0], kw[2])) for kw in keyword_tuples]
n1p_bigram = [' '.join(kw[1:3]) for kw in keyword_tuples]
pn2_bigram = [' '.join(kw[2:]) for kw in keyword_tuples]

# Word triples.
vn1p_trigram = [' '.join(kw[:3]) for kw in keyword_tuples]
vpn2_trigram = [' '.join((kw[0], kw[2], kw[3])) for kw in keyword_tuples]
n1pn2_trigram = [' '.join(kw[1:]) for kw in keyword_tuples]

# The full four-word tuple.
vnpn2_4gram = [' '.join(kw) for kw in keyword_tuples]

# The whole Brown corpus as a single space-separated string.
big_corpus = ' '.join(brown.words())

In [None]:
# Count literal, non-overlapping occurrences of each "v p" bigram in the corpus text.
# str.count treats the bigram as plain text; using it as a regex pattern would
# interpret characters such as '$', '.' or '(' as regex syntax and miscount or fail.
vp_in_corpus = [str(big_corpus.count(bigram)) for bigram in vp_bigram]

In [None]:
# Count literal, non-overlapping occurrences of each "n1 p" bigram in the corpus text.
# str.count treats the bigram as plain text; using it as a regex pattern would
# interpret characters such as '$', '.' or '(' as regex syntax and miscount or fail.
n1p_in_corpus = [str(big_corpus.count(bigram)) for bigram in n1p_bigram]

In [None]:
# Count literal, non-overlapping occurrences of each "n1 p n2" trigram in the corpus text.
# str.count treats the trigram as plain text; using it as a regex pattern would
# interpret characters such as '$', '.' or '(' as regex syntax and miscount or fail.
n1pn2_in_corpus = [str(big_corpus.count(trigram)) for trigram in n1pn2_trigram]

In [None]:
# Count literal, non-overlapping occurrences of each "v p n2" trigram in the corpus text.
# str.count treats the trigram as plain text; using it as a regex pattern would
# interpret characters such as '$', '.' or '(' as regex syntax and miscount or fail.
vpn2_in_corpus = [str(big_corpus.count(trigram)) for trigram in vpn2_trigram]

In [None]:
# Merge every n-gram view of a sample into one comma-separated string.
ngram_columns = (
    v_unigram, n1_unigram, p_unigram, n2_unigram,
    vp_bigram, n1p_bigram, pn2_bigram,
    vn1p_trigram, vpn2_trigram, n1pn2_trigram,
    vnpn2_4gram,
)
X_combined = [', '.join(row) for row in zip(*ngram_columns)]

# Append the four corpus counts (already converted to strings) to each sample's text.
corpus_count_columns = (vp_in_corpus, n1p_in_corpus, n1pn2_in_corpus, vpn2_in_corpus)
X_combined_with_corpus_features = [
    ', '.join(row) for row in zip(X_combined, *corpus_count_columns)
]

In [None]:
# Hold out 20% of the samples; same seed as the plain Naive Bayes cell.
X_train, X_test, y_train, y_test = train_test_split(
    X_combined_with_corpus_features,
    y,
    test_size=0.2,
    random_state=43,
)

# NOTE(review): the corpus counts reach the vectorizer as text inside each sample
# string. CountVectorizer's default token_pattern is documented to keep only
# tokens of 2+ word characters, so single-digit counts may be dropped - confirm.
vectorizer = CountVectorizer(ngram_range=(1, 4))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# fit() returns the estimator itself, so construction and training are chained.
classifier = MultinomialNB().fit(X_train_vec, y_train)
y_pred = classifier.predict(X_test_vec)

scores(y_test, y_pred)

Precision: 0.784 / Recall: 0.828 / Accuracy: 0.82


Naive Bayes with cosine similarity features

In [None]:
# Each sample string is split into its words and used as one training sentence.
sentences = [sentence.split() for sentence in X]

# A fixed seed plus a single worker thread makes the embeddings repeatable, so the
# similarity features derived from them do not change between runs. Per the
# gensim documentation, full reproducibility across interpreter launches also
# requires setting the PYTHONHASHSEED environment variable.
model = gensim.models.Word2Vec(
    sentences,
    min_count=1,
    vector_size=100,
    window=5,
    seed=42,
    workers=1,
)

In [None]:
def cos(string):
  """Appends pairwise word-vector similarities to a four-word sample string.

  The string is split into verb, noun1, prep and noun2, and the similarity
  of each of the six word pairs is looked up in the global Word2Vec `model`.

  Args:
      string: a sample made of four whitespace-separated words

  Returns:
      str: the original string followed by the six similarities, in the
      order verb-prep, verb-noun1, verb-noun2, noun1-prep, noun1-noun2,
      prep-noun2, all joined with ", "
  """
  verb, noun1, prep, noun2 = string.split()
  word_pairs = [
      (verb, prep),
      (verb, noun1),
      (verb, noun2),
      (noun1, prep),
      (noun1, noun2),
      (prep, noun2),
  ]
  similarities = [model.wv.similarity(left, right) for left, right in word_pairs]
  return ", ".join(str(part) for part in [string, *similarities])

# Extend every sample string with its pairwise similarity values.
X_features = list(map(cos, X))

In [None]:
# Hold out 20% of the similarity-augmented samples for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X_features,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# NOTE(review): cos() serialized the similarity values into each sample's text,
# so the vectorizer tokenizes them like words rather than using them as numbers -
# confirm this is the intended way to feed them to the classifier.
vectorizer = CountVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# fit() returns the estimator itself, so construction and training are chained.
classifier = MultinomialNB().fit(X_train_vec, y_train)
y_pred = classifier.predict(X_test_vec)

scores(y_test, y_pred)

Precision: 0.831 / Recall: 0.794 / Accuracy: 0.832


Decision Tree without features

In [None]:
# Hold out 20% of the samples for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Count word 1- to 4-grams; the vocabulary is learned from the training split only.
vectorizer = CountVectorizer(ngram_range=(1, 4))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# fit() returns the estimator itself, so construction and training are chained.
classifier = DecisionTreeClassifier().fit(X_train_vec, y_train)
y_pred = classifier.predict(X_test_vec)

scores(y_test, y_pred)

Precision: 0.79 / Recall: 0.831 / Accuracy: 0.822
