
Commit
Changes for code review comments.
1. Added a compute_correlation_coefficients method to utils and separated it
from the predict method to preserve single responsibility.
2. Added tests accordingly.
3. In the notebook, added a scrapbook scrap to track predictions and
assert them in tests.
4. Added extra documentation explaining what the predict method does.
5. Minor fix to stop training at max_epoch.
AbhiramE committed Jul 25, 2019
1 parent ddf810f commit c6ab656
Showing 6 changed files with 142 additions and 49 deletions.
123 changes: 84 additions & 39 deletions scenarios/sentence_similarity/gensen_local.ipynb
@@ -58,7 +58,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 11,
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -84,16 +84,16 @@
"from utils_nlp.dataset.preprocess import to_lowercase, to_nltk_tokens\n",
"from utils_nlp.dataset import snli, preprocess\n",
"from scenarios.sentence_similarity.gensen_wrapper import GenSenClassifier\n",
"from utils_nlp.models.pretrained_embeddings.glove import download_and_extract \n",
"from utils_nlp.models.pretrained_embeddings.glove import download_and_extract\n",
"import scrapbook as sb\n",
"\n",
"\n",
"print(\"System version: {}\".format(sys.version))\n",
"BASE_DATA_PATH = '../../data'"
"print(\"System version: {}\".format(sys.version))"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"metadata": {
"tags": [
"parameters"
@@ -102,7 +102,9 @@
"outputs": [],
"source": [
"max_epoch = None\n",
"config_filepath = 'gensen_config.json'"
"config_filepath = 'gensen_config.json'\n",
"base_data_path = '../../data'\n",
"nrows = None"
]
},
{
@@ -142,7 +144,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 3,
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -327,15 +329,15 @@
"4 2267923837.jpg#2r1e entailment NaN NaN NaN NaN "
]
},
"execution_count": 4,
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"train\")\n",
"dev = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"dev\")\n",
"test = snli.load_pandas_df(BASE_DATA_PATH, file_split=\"test\")\n",
"train = snli.load_pandas_df(base_data_path, file_split=\"train\", nrows=nrows)\n",
"dev = snli.load_pandas_df(base_data_path, file_split=\"dev\", nrows=nrows)\n",
"test = snli.load_pandas_df(base_data_path, file_split=\"test\", nrows=nrows)\n",
"\n",
"train.head()"
]
@@ -351,7 +353,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 4,
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -380,7 +382,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 5,
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -490,7 +492,7 @@
"4 [two, kids, at, a, ballgame, wash, their, hand... "
]
},
"execution_count": 6,
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
@@ -527,7 +529,7 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 14,
"metadata": {
"pycharm": {
"name": "#%%\n"
@@ -543,7 +545,7 @@
}
],
"source": [
"pretrained_embedding_path = download_and_extract(BASE_DATA_PATH)"
"pretrained_embedding_path = download_and_extract(base_data_path)"
]
},
{
@@ -555,27 +557,18 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 15,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n"
]
}
],
"outputs": [],
"source": [
"clf = GenSenClassifier(config_file = config_filepath, \n",
" pretrained_embedding_path = pretrained_embedding_path,\n",
" learning_rate = 0.0001, \n",
" cache_dir=BASE_DATA_PATH,\n",
" cache_dir=base_data_path,\n",
" max_epoch=max_epoch)"
]
},
@@ -588,13 +581,46 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 8,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/data/anaconda/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/modules/rnn.py:46: UserWarning: dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1, but got dropout=0.8 and num_layers=1\n",
" \"num_layers={}\".format(dropout, num_layers))\n",
"../../scenarios/sentence_similarity/gensen_train.py:431: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_.\n",
" torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)\n",
"../../utils_nlp/models/gensen/utils.py:364: UserWarning: volatile was removed and now has no effect. Use `with torch.no_grad():` instead.\n",
" Variable(torch.LongTensor(sorted_src_lens), volatile=True)\n",
"/data/anaconda/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/functional.py:1332: UserWarning: nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.\n",
" warnings.warn(\"nn.functional.sigmoid is deprecated. Use torch.sigmoid instead.\")\n",
"/data/anaconda/envs/nlp_gpu/lib/python3.6/site-packages/torch/nn/functional.py:1320: UserWarning: nn.functional.tanh is deprecated. Use torch.tanh instead.\n",
" warnings.warn(\"nn.functional.tanh is deprecated. Use torch.tanh instead.\")\n",
"../../scenarios/sentence_similarity/gensen_train.py:523: UserWarning: torch.nn.utils.clip_grad_norm is now deprecated in favor of torch.nn.utils.clip_grad_norm_.\n",
" torch.nn.utils.clip_grad_norm(model.parameters(), 1.0)\n",
"/data/anaconda/envs/nlp_gpu/lib/python3.6/site-packages/horovod/torch/__init__.py:163: UserWarning: optimizer.step(synchronize=True) called after optimizer.synchronize(). This can cause training slowdown. You may want to consider using optimizer.step(synchronize=False) if you use optimizer.synchronize() in your code.\n",
" warnings.warn(\"optimizer.step(synchronize=True) called after \"\n",
"../../scenarios/sentence_similarity/gensen_train.py:243: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n",
" f.softmax(class_logits).data.cpu().numpy().argmax(axis=-1)\n",
"../../scenarios/sentence_similarity/gensen_train.py:262: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n",
" f.softmax(class_logits).data.cpu().numpy().argmax(axis=-1)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 1h 19min 28s, sys: 22min 1s, total: 1h 41min 30s\n",
"Wall time: 1h 41min 22s\n"
]
}
],
"source": [
"%%time\n",
"clf.fit(train, dev, test)"
@@ -604,27 +630,43 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"### 2.3 Predict"
"### 2.3 Predict\n",
"\n",
"In the predict method we perform Pearson's Correlation computation [\\[2\\]](#References) on the outputs of the model. The predictions of the model can be further improved by hyperparameter tuning which we walk through in the other example [here](gensen_aml_deep_dive.ipynb). "
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 16,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"******** Similarity Score for sentences **************\n",
" 0 1\n",
"0 1.000000 0.966793\n",
"1 0.966793 1.000000\n"
]
}
],
"source": [
"sentences = [\n",
" 'the quick brown fox jumped over the lazy dog',\n",
" 'bright sunshiny day tomorrow.'\n",
" 'The sky is blue and beautiful',\n",
" 'Love this blue and beautiful sky!'\n",
" ]\n",
"\n",
"results = clf.predict(sentences)\n",
"print(\"******** Similarity Score for sentences **************\")\n",
"print(results)"
"print(results)\n",
"\n",
"# Record results with scrapbook for tests\n",
"sb.glue(\"results\", results.to_dict())"
]
},
{
@@ -634,16 +676,19 @@
"## References\n",
"\n",
"1. Subramanian, Sandeep and Trischler, Adam and Bengio, Yoshua and Pal, Christopher J, [*Learning general purpose distributed sentence representations via large scale multi-task learning*](https://arxiv.org/abs/1804.00079), ICLR, 2018.\n",
"3. Semantic textual similarity. url: http://nlpprogress.com/english/semantic_textual_similarity.html"
"2. Pearson's Correlation Coefficient. url: https://en.wikipedia.org/wiki/Pearson_correlation_coefficient\n",
"3. Semantic textual similarity. url: http://nlpprogress.com/english/semantic_textual_similarity.html\n",
"4. Minh-Thang Luong, Quoc V Le, Ilya Sutskever, Oriol Vinyals, and Lukasz Kaiser. [*Multi-task sequence to sequence learning*](https://arxiv.org/abs/1511.06114), 2015.\n",
"5. Bryan McCann, James Bradbury, Caiming Xiong, and Richard Socher. [*Learned in translation: Contextualized word vectors](https://arxiv.org/abs/1708.00107), 2017. "
]
}
],
"metadata": {
"celltoolbar": "Tags",
"kernelspec": {
"display_name": "Python [conda env:nlp_gpu]",
"display_name": "Python (nlp_gpu)",
"language": "python",
"name": "conda-env-nlp_gpu-py"
"name": "nlp_gpu"
},
"language_info": {
"codemirror_mode": {
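The markdown cell above says `predict` computes Pearson correlation coefficients over the model's sentence embeddings. For reference, a minimal, self-contained sketch of that computation (the vectors are made-up stand-ins for GenSen embeddings, not model outputs):

```python
# Hand-rolled Pearson correlation between two illustrative vectors,
# checked against np.corrcoef, which predict() uses under the hood.
import numpy as np

a = np.array([0.2, 0.5, 0.1, 0.9])
b = np.array([0.3, 0.4, 0.2, 0.8])

r = ((a - a.mean()) * (b - b.mean())).sum() / (
    np.sqrt(((a - a.mean()) ** 2).sum()) * np.sqrt(((b - b.mean()) ** 2).sum())
)
assert np.isclose(r, np.corrcoef(a, b)[0, 1])
print(r)  # in [-1, 1]; 1.0 means perfectly linearly related
```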
2 changes: 1 addition & 1 deletion scenarios/sentence_similarity/gensen_train.py
@@ -199,7 +199,7 @@ def evaluate(
)
if (monitor_epoch - min_val_loss_epoch) > config["training"][
"stop_patience"
] or (max_epoch is not None and monitor_epoch > max_epoch):
] or (max_epoch is not None and monitor_epoch >= max_epoch):
logging.info("Saving model ...")
# Save the name with validation loss.
torch.save(
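The one-character fix above (`>` to `>=`) is what actually stops training at `max_epoch` rather than one epoch past it. A minimal sketch of the stopping check, with names mirroring gensen_train.py (the surrounding config and training-loop plumbing is assumed):

```python
# Sketch of the early-stopping condition; stop_patience stands in for
# config["training"]["stop_patience"] in the real code.
def should_stop(monitor_epoch, min_val_loss_epoch, stop_patience, max_epoch=None):
    out_of_patience = (monitor_epoch - min_val_loss_epoch) > stop_patience
    # `>=` halts at the cap; with the old `>`, a run with max_epoch=1
    # would have trained through a second epoch before stopping.
    at_epoch_cap = max_epoch is not None and monitor_epoch >= max_epoch
    return out_of_patience or at_epoch_cap

assert should_stop(1, 0, stop_patience=10, max_epoch=1)      # stops at the cap
assert not should_stop(1, 0, stop_patience=10, max_epoch=2)  # keeps training
```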
18 changes: 9 additions & 9 deletions scenarios/sentence_similarity/gensen_wrapper.py
@@ -3,11 +3,11 @@
import json
import os

import numpy as np
import pandas as pd

from scenarios.sentence_similarity.gensen_train import train
from utils_nlp.models.gensen.create_gensen_model import create_multiseq2seq_model
from utils_nlp.eval.classification import compute_correlation_coefficients
from utils_nlp.models.gensen.create_gensen_model import (
create_multiseq2seq_model,
)
from utils_nlp.models.gensen.gensen import GenSenSingle
from utils_nlp.models.gensen.preprocess_utils import gensen_preprocess

@@ -135,13 +135,13 @@ def predict(self, sentences):
sentences(list) : List of sentences.
Returns
array: A pairwise cosine similarity for the sentences provided based on their gensen
vector representations.
pd.DataFrame: A pairwise Pearson correlation matrix for the sentences
provided, based on their gensen vector representations.
"""

# self.cache_dir = os.path.join(self.cache_dir, "clean/snli_1.0")
self._create_multiseq2seq_model()
# self._create_multiseq2seq_model()

gensen_model = GenSenSingle(
model_folder=os.path.join(
@@ -152,7 +152,7 @@
)

reps_h, reps_h_t = gensen_model.get_representation(
sentences, pool="last", return_numpy=True
sentences, pool="last", return_numpy=True, tokenize=True
)

return pd.DataFrame(np.corrcoef(reps_h_t))
return compute_correlation_coefficients(reps_h_t)
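With this change, `predict` delegates the correlation computation to the shared utility instead of calling `np.corrcoef` inline. A rough sketch of the resulting data flow (the random array is a stand-in for `reps_h_t`; the 2048-dimensional size is illustrative, not taken from the model config):

```python
import numpy as np

from utils_nlp.eval.classification import compute_correlation_coefficients

# Stand-in for the pooled sentence embeddings returned by
# GenSenSingle.get_representation (one row per input sentence).
reps_h_t = np.random.rand(2, 2048)

similarity = compute_correlation_coefficients(reps_h_t)
print(similarity)  # 2x2 DataFrame with 1.0 on the diagonal
```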
9 changes: 9 additions & 0 deletions tests/integration/test_notebooks_sentence_similarity.py
@@ -12,6 +12,7 @@

sys.path.append("../../")
ABS_TOL = 0.2
ABS_TOL_PEARSONS = 0.05


@pytest.fixture(scope="module")
@@ -96,5 +97,13 @@ def test_gensen_local(notebooks):
parameters=dict(
max_epoch=1,
config_filepath="../../scenarios/sentence_similarity/gensen_config.json",
base_data_path="../../data",
),
)

results = sb.read_notebook(OUTPUT_NOTEBOOK).scraps.data_dict["results"]
expected = {"0": {"0": 1, "1": 0.95}, "1": {"0": 0.95, "1": 1}}

for key, value in expected.items():
for k, v in value.items():
assert results[key][k] == pytest.approx(v, abs=ABS_TOL_PEARSONS)
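The assertion depends on the scrapbook round-trip: the notebook glues the similarity DataFrame as a dict, and the test reads it back out of the executed notebook. A sketch of that round-trip (the output path is illustrative):

```python
import scrapbook as sb

# Notebook side (already in the diff above):
#     sb.glue("results", results.to_dict())

# Test side: read the executed notebook and recover the scrap.
nb = sb.read_notebook("output.ipynb")
results = nb.scraps.data_dict["results"]
# Scraps are stored as JSON, which stringifies the DataFrame's integer
# labels; hence the string keys ("0", "1") in the expected dict above.
```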
16 changes: 16 additions & 0 deletions tests/unit/test_eval_classification.py
@@ -0,0 +1,16 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import numpy as np

from utils_nlp.eval.classification import compute_correlation_coefficients


def test_compute():
x = np.random.rand(2, 100)
df = compute_correlation_coefficients(x)
assert df.shape == (2, 2)

y = np.random.rand(2, 100)
df = compute_correlation_coefficients(x, y)
assert df.shape == (4, 4)
23 changes: 23 additions & 0 deletions utils_nlp/eval/classification.py
@@ -8,6 +8,9 @@
f1_score,
)

from numpy import corrcoef
import pandas as pd


def eval_classification(actual, predicted, round_decimals=4):
"""Returns common classification evaluation metrics.
@@ -32,3 +35,23 @@ def eval_classification(actual, predicted, round_decimals=4):
f1_score(actual, predicted, average=None).round(round_decimals)
),
}


def compute_correlation_coefficients(x, y=None):
"""
Compute Pearson product-moment correlation coefficients.
Args:
x: array_like
A 1-D or 2-D array containing multiple variables and observations.
Each row of `x` represents a variable, and each column a single
observation of all those variables.
y: array_like, optional
An additional set of variables and observations. `y` has the same
shape as `x`.
Returns:
pd.DataFrame: A pandas DataFrame of the correlation coefficient matrix of the variables.
"""
return pd.DataFrame(corrcoef(x, y))
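One detail worth noting about the wrapped `np.corrcoef` call: when `y` is given, its rows are stacked beneath `x`'s, so each row of either array counts as one variable. That is why the unit test's two 2×100 inputs yield a 4×4 matrix:

```python
import numpy as np

from utils_nlp.eval.classification import compute_correlation_coefficients

x = np.random.rand(2, 100)
y = np.random.rand(2, 100)

# corrcoef treats each row as a variable: x alone gives 2 variables,
# x stacked with y gives 4, matching the shapes asserted in the tests.
print(compute_correlation_coefficients(x).shape)     # (2, 2)
print(compute_correlation_coefficients(x, y).shape)  # (4, 4)
```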
