# To-Do:
- Get the .json files to Pandas
- Get the texts column, make those individual files

In [1]:
import pandas as pd

# Load the four NER JSON exports (dev/train x judgement/preamble) into DataFrames.
dev_judgement_df, dev_preamble_df, train_judgement_df, train_preamble_df = [
    pd.read_json(json_path)
    for json_path in (
        'NER_DEV_JUDGEMENT.json',
        'NER_DEV_PREAMBLE.json',
        'NER_TRAIN_JUDGEMENT.json',
        'NER_TRAIN_PREAMBLE.json',
    )
]

In [2]:
def textOnly(df):
    """Collect the 'text' entry of every record into a one-column DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'data' column whose entries can be indexed with the
        key 'text'.

    Returns
    -------
    pandas.DataFrame
        A frame with a single 'Text' column holding each record's text
        exactly as stored (no splitting or stripping), in row order.
    """
    texts = [record['text'] for record in df['data']]
    return pd.DataFrame(texts, columns=['Text'])

In [3]:
# Pull out the text column of each dataset; textOnly keeps the original
# whitespace and formatting of every text.
(
    dev_judgement_texts,
    dev_preamble_texts,
    train_judgement_texts,
    train_preamble_texts,
) = map(
    textOnly,
    (dev_judgement_df, dev_preamble_df, train_judgement_df, train_preamble_df),
)

In [4]:
# Preview the first five rows of the extracted dev-judgement texts.
dev_judgement_texts.head(n=5)

Unnamed: 0,Text
0,"True, our Constitution has no 'due process' cl..."
1,(See Principles of Statutory Interpretation by...
2,"Their Lordships have said -- ""It is a sound r..."
3,"In para 13 of the plaint, it has been further ..."
4,Counsel for appellants contended that who is t...


In [5]:
# Write each raw-text frame to its own CSV file (row index included).
for text_frame, csv_name in (
    (dev_judgement_texts, 'NER_Dev_Judgement.csv'),
    (dev_preamble_texts, 'NER_Dev_Preamble.csv'),
    (train_judgement_texts, 'NER_Train_Judgement.csv'),
    (train_preamble_texts, 'NER_Train_Preamble.csv'),
):
    text_frame.to_csv(csv_name)

### Preview of .csv file

```
,Text
0,"True, our Constitution has no 'due process' clause or the VIII Amendment; but, in this branch of law, after R.C. Cooper v. Union of India, (1970) 1 SCC 248 and Maneka Gandhi v. Union of India, (1978) 1 SCC 248, the consequence is the same."
1,"(See Principles of Statutory Interpretation by Justice G.P. Singh, 9th Edn., 2004 at p. 

 438.)."
```

## Alternatively...
- Here is the same thing, except each text is split on whitespace into a list of tokens (the whitespace itself is dropped):
- **Note**: decided not to strip punctuation because it seems like it might complicate getting the 'DATE' tag later

In [6]:
def textOnly2_splitSpace(df):
    """Split every record's text on whitespace and return the token lists.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'data' column whose entries can be indexed with the
        key 'text' to obtain a string.

    Returns
    -------
    pandas.DataFrame
        A frame with a single 'Text' column; each cell is the list of
        whitespace-separated tokens of one text. Punctuation is kept
        attached to the tokens.
    """
    # Each row is a one-element list so the token list lands in a single
    # 'Text' cell instead of being spread across several columns.
    rows = [[record['text'].split()] for record in df['data']]
    return pd.DataFrame(rows, columns=['Text'])

In [7]:
# Tokenised variant of each dataset: every text becomes a list of
# whitespace-separated tokens (the whitespace itself is not kept).
(
    dev_judgement_texts_noSpace,
    dev_preamble_texts_noSpace,
    train_judgement_texts_noSpace,
    train_preamble_texts_noSpace,
) = map(
    textOnly2_splitSpace,
    (dev_judgement_df, dev_preamble_df, train_judgement_df, train_preamble_df),
)

In [8]:
# Preview the first five rows of the tokenised dev-judgement texts.
dev_judgement_texts_noSpace.head(n=5)

Unnamed: 0,Text
0,"[True,, our, Constitution, has, no, 'due, proc..."
1,"[(See, Principles, of, Statutory, Interpretati..."
2,"[Their, Lordships, have, said, --, ""It, is, a,..."
3,"[In, para, 13, of, the, plaint,, it, has, been..."
4,"[Counsel, for, appellants, contended, that, wh..."


In [9]:
# Write each tokenised frame to its own '-listed' CSV file (row index included).
for token_frame, csv_name in (
    (dev_judgement_texts_noSpace, 'NER_Dev_Judgement-listed.csv'),
    (dev_preamble_texts_noSpace, 'NER_Dev_Preamble-listed.csv'),
    (train_judgement_texts_noSpace, 'NER_Train_Judgement-listed.csv'),
    (train_preamble_texts_noSpace, 'NER_Train_Preamble-listed.csv'),
):
    token_frame.to_csv(csv_name)

### Preview of .csv file
```
,Text
0,"['True,', 'our', 'Constitution', 'has', 'no', ""'due"", ""process'"", 'clause', 'or', 'the', 'VIII', 'Amendment;', 'but,', 'in', 'this', 'branch', 'of', 'law,', 'after', 'R.C.', 'Cooper', 'v.', 'Union', 'of', 'India,', '(1970)', '1', 'SCC', '248', 'and', 'Maneka', 'Gandhi', 'v.', 'Union', 'of', 'India,', '(1978)', '1', 'SCC', '248,', 'the', 'consequence', 'is', 'the', 'same.']"
1,"['(See', 'Principles', 'of', 'Statutory', 'Interpretation', 'by', 'Justice', 'G.P.', 'Singh,', '9th', 'Edn.,', '2004', 'at', 'p.', '438.).']"
```