# MNLI

## Download original data: [site](https://www.nyu.edu/projects/bowman/multinli/)

## Store the original files in the folder "raw_mnli"

## Transforming data into csv

In [1]:
import pandas as pd

# Locations of the raw MultiNLI text files (train, matched dev, mismatched dev),
# all expected inside the "raw_mnli" folder.
train_path, test_m_path, test_mm_path = (
    "raw_mnli/multinli_1.0_train.txt",
    "raw_mnli/multinli_1.0_dev_matched.txt",
    "raw_mnli/multinli_1.0_dev_mismatched.txt",
)

def mnli2csv(path):
    """Read a raw MultiNLI tab-separated file into a three-column DataFrame.

    Every non-blank line is split on tabs and fields 5, 6 and 0 are kept, in
    that order. The first line must be the header row, i.e. those fields must
    read ``sentence1``, ``sentence2`` and ``gold_label``; it is validated and
    then replaced by the output column names.

    Parameters
    ----------
    path : str or path-like
        Location of the tab-separated text file.

    Returns
    -------
    pandas.DataFrame
        One row per data line, with columns ``premise``, ``hypothesis`` and
        ``label`` holding the raw field strings.

    Raises
    ------
    AssertionError
        If the file contains no rows or its header row does not match.
    IndexError
        If a non-blank line has fewer than seven tab-separated fields.
    """
    expected_header = ('sentence1', 'sentence2', 'gold_label')
    rows = []
    # Decode explicitly as UTF-8 (assumed encoding of the MultiNLI release) so
    # parsing does not depend on the platform's default locale encoding.
    with open(path, "r", encoding="utf-8") as f:
        # Stream line by line instead of materialising the file with readlines().
        for line in f:
            line = line.rstrip("\n")
            if not line:
                # Tolerate blank lines (e.g. a trailing one at end of file).
                continue
            fields = line.split("\t")
            rows.append((fields[5], fields[6], fields[0]))
    header = rows[0] if rows else None
    assert header == expected_header, "unexpected header in %r: %r" % (path, header)
    return pd.DataFrame(rows[1:], columns=("premise", "hypothesis", "label"))

In [2]:
# Parse the raw training split, report (rows, columns), and preview two rows.
df_train = mnli2csv(path=train_path)
print((len(df_train.index), len(df_train.columns)))
df_train.iloc[:2]

(392702, 3)


Unnamed: 0,premise,hypothesis,label
0,Conceptually cream skimming has two basic dime...,Product and geography are what make cream skim...,neutral
1,you know during the season and i guess at at y...,You lose the things to the following level if ...,entailment


In [3]:
# Parse the matched dev split, report (rows, columns), and preview two rows.
df_test_m = mnli2csv(path=test_m_path)
print((len(df_test_m.index), len(df_test_m.columns)))
df_test_m.iloc[:2]

(10000, 3)


Unnamed: 0,premise,hypothesis,label
0,The new rights are nice enough,Everyone really likes the newest benefits,neutral
1,This site includes a list of all award winners...,The Government Executive articles housed on th...,contradiction


In [4]:
# Parse the mismatched dev split, report (rows, columns), and preview two rows.
df_test_mm = mnli2csv(path=test_mm_path)
print((len(df_test_mm.index), len(df_test_mm.columns)))
df_test_mm.iloc[:2]

(10000, 3)


Unnamed: 0,premise,hypothesis,label
0,Your contribution helped make it possible for ...,Your contributions were of no help with our st...,contradiction
1,"The answer has nothing to do with their cause,...",Dictionaries are indeed exercises in bi-unique...,contradiction


## Creating train/dev/dev_plus datasets

In [5]:
# Carve three mutually disjoint random subsets out of the full training frame:
# 50,000 rows, then 10,000 rows from what is left, then another 10,000 rows
# from what is left after that. Fixed seeds keep the split reproducible.
train_in_sample = df_train.sample(50000, random_state=123)
train_in_sample_indexs = train_in_sample.index.values

# Index labels not yet used, kept in df_train order. A vectorised isin() mask
# replaces the per-element `i not in ndarray` test, which rescanned the whole
# array for every one of the training indices.
safe1 = df_train.index[~df_train.index.isin(train_in_sample_indexs)].tolist()
train_not_in_sample = df_train.loc[safe1].sample(10000, random_state=122)
train_not_in_sample_indexs = train_not_in_sample.index.values

# Labels used by neither of the two samples above, again in df_train order.
safe2 = df_train.index[
    ~(
        df_train.index.isin(train_in_sample_indexs)
        | df_train.index.isin(train_not_in_sample_indexs)
    )
].tolist()
dev = df_train.loc[safe2].sample(10000, random_state=126)

# Sanity checks: the three subsets must not share any row.
assert not train_not_in_sample.index.isin(train_in_sample.index).any()
assert not dev.index.isin(train_in_sample.index).any()
assert not dev.index.isin(train_not_in_sample.index).any()

In [6]:
# Shapes of the three sampled subsets, in the order they were created.
tuple(subset.shape for subset in (train_in_sample, train_not_in_sample, dev))

((50000, 3), (10000, 3), (10000, 3))

In [7]:
# Stack the matched and mismatched dev splits into a single test set with a
# fresh 0..n-1 index. `axis` is passed by keyword because pandas 2.x only
# accepts `objs` positionally in concat().
test = pd.concat([df_test_m, df_test_mm], axis=0, ignore_index=True)
test.shape

(20000, 3)

### Saving results

In [8]:
# Write every dataset to its CSV file in the working directory, without the
# index column. The mapping preserves the original write order.
csv_outputs = {
    "train.csv": df_train,
    "train_sample.csv": train_in_sample,
    "train_not_in_sample.csv": train_not_in_sample,
    "dev.csv": dev,
    "test.csv": test,
}
for csv_name, frame in csv_outputs.items():
    frame.to_csv(csv_name, index=False)