In [1]:
from tqdm import tqdm
import os
import pandas as pd
from sklearn.model_selection import KFold

In [2]:
data = pd.read_csv('/root/projects/feedback_prize/data/process_data/all_train_texts_5.csv')

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,id,text,text_split,entities,fold
0,0,0000D23A521A,"Some people belive that the so called ""face"" o...","['Some', 'people', 'belive', 'that', 'the', 's...",B-Position#I-Position#I-Position#I-Position#I-...,0.0
1,1,00066EA9880D,Driverless cars are exaclty what you would exp...,"['Driverless', 'cars', 'are', 'exaclty', 'what...",B-Lead#I-Lead#I-Lead#I-Lead#I-Lead#I-Lead#I-Le...,4.0
2,2,000E6DE9E817,Dear: Principal\n\nI am arguing against the po...,"['Dear:', 'Principal', 'I', 'am', 'arguing', '...",O#O#B-Position#I-Position#I-Position#I-Positio...,3.0
3,3,001552828BD0,Would you be able to give your car up? Having ...,"['Would', 'you', 'be', 'able', 'to', 'give', '...",B-Lead#I-Lead#I-Lead#I-Lead#I-Lead#I-Lead#I-Le...,0.0
4,4,0016926B079C,I think that students would benefit from learn...,"['I', 'think', 'that', 'students', 'would', 'b...",B-Position#I-Position#I-Position#I-Position#I-...,4.0


In [14]:
IGNORE_INDEX = -100
NON_LABEL = -1
classes = ['Lead', 'Position', 'Claim','Counterclaim', 'Rebuttal','Evidence','Concluding Statement']
OUTPUT_LABELS = ['O', 'B-Lead', 'I-Lead', 'B-Position', 'I-Position', 'B-Claim', 'I-Claim', 'B-Counterclaim', 'I-Counterclaim', 
                 'B-Rebuttal', 'I-Rebuttal', 'B-Evidence', 'I-Evidence', 'B-Concluding Statement', 'I-Concluding Statement']
LABELS_TO_IDS = {v:k for k,v in enumerate(OUTPUT_LABELS)}
IDS_TO_LABELS = {k:v for k,v in enumerate(OUTPUT_LABELS)}

In [15]:
print(LABELS_TO_IDS)

{'O': 0, 'B-Lead': 1, 'I-Lead': 2, 'B-Position': 3, 'I-Position': 4, 'B-Claim': 5, 'I-Claim': 6, 'B-Counterclaim': 7, 'I-Counterclaim': 8, 'B-Rebuttal': 9, 'I-Rebuttal': 10, 'B-Evidence': 11, 'I-Evidence': 12, 'B-Concluding Statement': 13, 'I-Concluding Statement': 14}


In [16]:
print(IDS_TO_LABELS)

{0: 'O', 1: 'B-Lead', 2: 'I-Lead', 3: 'B-Position', 4: 'I-Position', 5: 'B-Claim', 6: 'I-Claim', 7: 'B-Counterclaim', 8: 'I-Counterclaim', 9: 'B-Rebuttal', 10: 'I-Rebuttal', 11: 'B-Evidence', 12: 'I-Evidence', 13: 'B-Concluding Statement', 14: 'I-Concluding Statement'}


In [17]:
def agg_essays(train_flg, data_dir):
    folder = 'train' if train_flg else 'test'
    names, texts =[], []
    for f in tqdm(list(os.listdir(f'{data_dir}/{folder}'))):
        names.append(f.replace('.txt', ''))
        texts.append(open(f'{data_dir}/{folder}/' + f, 'r').read())
        df_texts = pd.DataFrame({'id': names, 'text': texts})

    df_texts['text_split'] = df_texts.text.str.split()
    print('Completed tokenizing texts.')
    return df_texts

In [18]:
def ner(df_texts, df_train):
    all_entities = []
    for _,  row in tqdm(df_texts.iterrows(), total=len(df_texts)):
        total = len(row['text_split'])
        entities = ['O'] * total
        for _, row2 in df_train[df_train['id'] == row['id']].iterrows():
            discourse = row2['discourse_type']
            list_ix = [int(x) for x in row2['predictionstring'].split(' ')]
            entities[list_ix[0]] = f'B-{discourse}'
            for k in list_ix[1:]: 
                entities[k] = f'I-{discourse}'
            entities_str = '#'.join(entities)
        all_entities.append(entities_str)

    df_texts['entities'] = all_entities
    print('Completed mapping discourse to each token.')
    return df_texts

In [19]:
def preprocess(data_dir, df_train = None, remove_wrong_labels=True):
    if df_train is None:
        train_flg = False
    else:
        train_flg = True
    
    df_texts = agg_essays(train_flg, data_dir)

    if train_flg:
        # if remove_wrong_labels:
        #     for wrong_id in wrong_label_discourse_ids:
        #         df_train = df_train.drop(index=(df_train.loc[df_train['discourse_id'] == wrong_id].index))
            
        df_texts = ner(df_texts, df_train)
        
    return df_texts

In [20]:
data_dir = '/root/projects/feedback_prize/data'
df_train = pd.read_csv('/root/projects/feedback_prize/data/train.csv')
train_texts = preprocess(data_dir,df_train)

100%|██████████| 15594/15594 [00:20<00:00, 754.40it/s]


Completed tokenizing texts.


100%|██████████| 15594/15594 [01:52<00:00, 138.62it/s]

Completed mapping discourse to each token.





In [21]:
train_texts.head()

Unnamed: 0,id,text,text_split,entities
0,0000D23A521A,"Some people belive that the so called ""face"" o...","[Some, people, belive, that, the, so, called, ...",B-Position#I-Position#I-Position#I-Position#I-...
1,00066EA9880D,Driverless cars are exaclty what you would exp...,"[Driverless, cars, are, exaclty, what, you, wo...",B-Lead#I-Lead#I-Lead#I-Lead#I-Lead#I-Lead#I-Le...
2,000E6DE9E817,Dear: Principal\n\nI am arguing against the po...,"[Dear:, Principal, I, am, arguing, against, th...",O#O#B-Position#I-Position#I-Position#I-Positio...
3,001552828BD0,Would you be able to give your car up? Having ...,"[Would, you, be, able, to, give, your, car, up...",B-Lead#I-Lead#I-Lead#I-Lead#I-Lead#I-Lead#I-Le...
4,0016926B079C,I think that students would benefit from learn...,"[I, think, that, students, would, benefit, fro...",B-Position#I-Position#I-Position#I-Position#I-...


In [22]:
ids = train_texts['id'].unique()

In [24]:
kf = KFold(n_splits=5, shuffle = True, random_state=42)

In [45]:
ka = kf.split(ids)

In [46]:
j = -1
for i in ka:
    print(j + 1)
    print(len(i[0]))
    print(len(i[1]))

0
12475
3119
0
12475
3119
0
12475
3119
0
12475
3119
0
12476
3118


In [35]:
for i_fold, (_, valid_index) in enumerate(kf.split(ids)):
    print(_)
    df_train.loc[valid_index,'fold'] = i_fold

[    1     2     4 ... 15590 15591 15593]
[    0     1     2 ... 15591 15592 15593]
[    0     1     2 ... 15588 15591 15592]
[    0     1     3 ... 15591 15592 15593]
[    0     2     3 ... 15590 15592 15593]
