In [1]:
import os
import re
import pandas as pd
import json
import xml.etree.ElementTree as ET
from collections import Counter
from sklearn.model_selection import train_test_split
from nltk.tokenize import sent_tokenize

In [2]:
def parse(lines, filename):
    # extract and store the start and end index of entity in .txt file
    entity_name = {"UNC", "NEG", "SEV", "COU", "CON", "drug","treatment", "problem", "test"}
    annotation = {}
    semantic_p = re.compile('semantic=(.*)')
    entity_p = re.compile('ne=(.*)')
    for line in lines:
        each = line.split("\t") 
        if each[0] == "NamedEntity" and semantic_p.findall(each[3])[0] in entity_name:
            entity = entity_p.findall(each[-1])[0]
            annotation[each[1] + '-' + each[2]] = {"Semantic": semantic_p.findall(each[3])[0], "Entity":entity.strip()}
    
    # extract info from .xmi file with the help of .txt file
    relation_name = {"CON_Of", "COU_Of", "NEG_Of", "SEV_Of", "UNC_Of"}
    entity_dict = {}
    relation_dict = {"NEG_Of":[], "negation_count": 0, 
                     "SEV_Of":[], "severity_count":0,
                     "UNC_Of":[], "uncertainty_count": 0, 
                     "CON_Of":[], 
                     "COU_Of":[], "dynamic_count": 0,}
    tree = ET.parse(filename[:-4] + '.xmi')
    root = tree.getroot()
    for child in root:
        if 'ClampNameEntityUIMA' in child.tag and child.attrib['semanticTag'] in entity_name:
            entity_dict[child.attrib['{http://www.omg.org/XMI}id']] = {"begin-end": child.attrib['begin'] + '-' + child.attrib['end'], "Semantic": child.attrib['semanticTag']}
            
        if 'ClampRelationUIMA' in child.tag and child.attrib['semanticTag'] in relation_name:
            fromEntity = annotation[entity_dict[child.attrib['entTo']]["begin-end"]]["Entity"]
            toEntity = annotation[entity_dict[child.attrib['entFrom']]["begin-end"]]["Entity"]
            if child.attrib['semanticTag'] == "CON_Of":
                relation = fromEntity + ' something ->' + toEntity
            else:
                relation = fromEntity + '->' + toEntity
            relation_dict[child.attrib['semanticTag']].append(relation)
    relation_dict["severity_count"] = 1 if (len(relation_dict["SEV_Of"]) > 0) else 0 
    relation_dict["negation_count"] = 1 if (len(relation_dict["NEG_Of"]) > 0) else 0 
    relation_dict["uncertainty_count"] = 1 if len(relation_dict["UNC_Of"]) else 0
    relation_dict["dynamic_count"] = 1 if len(relation_dict["COU_Of"]) else 0
    return relation_dict

In [3]:
whole_dict = {}
raw_text_path = "/Volumes/E/ClampCmd_1.5.1/input"
entity_annotation_path = "/Volumes/E/ClampCmd_1.5.1/output"


for files in os.listdir(raw_text_path):
    if '.' not in files:
        print(files)
        for filename in os.listdir(raw_text_path + '/' + files):
            if filename[-4:] == ".txt":
                text_f = open(raw_text_path + '/' + files + '/' + filename, 'r')
                endIndex = re.search('-',filename).span()[0]
                group = filename[:endIndex]
                if not group in whole_dict:
                    whole_dict[group] = {'annotation': {'NEG': [], 'CON':[], 'UNC':[], 'COU': [], 'treatment': []}, 'entry':{}}
                    whole_dict[group]['entry'][filename[:-4]] = {"text": text_f.readline()}
                else:
                    whole_dict[group]['entry'][filename[:-4]] = {"text": text_f.readline()}
                text_f.close()

                anno_f = open(entity_annotation_path + '/' + filename[0] + "/" + filename, 'r')
                anotattion = parse(anno_f.readlines(), (entity_annotation_path + '/' + filename[0] + "/" + filename))
                whole_dict[group]['entry'][filename[:-4]].update(anotattion)
                if anotattion['NEG_Of'] != []:
                    for each in anotattion['NEG_Of']:
                        whole_dict[group]['annotation']['NEG'].append(each)
                if anotattion['CON_Of'] != []:
                    for each in anotattion['CON_Of']:
                        whole_dict[group]['annotation']['CON'].append(each)
                if anotattion['UNC_Of'] != []:
                    for each in anotattion['UNC_Of']:
                        whole_dict[group]['annotation']['UNC'].append(each)
                if anotattion['COU_Of'] != []:
                    for each in anotattion['COU_Of']:
                        whole_dict[group]['annotation']['COU'].append(each)
                anno_f.close()

A
B
C
D
E
F
G
H
I
K
L
M
N
O
P
Q
R
S
T
U
V
W


In [213]:
with open('patient_text_profile.json', 'w') as f:
    json.dump(whole_dict, f)

In [454]:
# Load stats data
stat = {}
f = open("group_profiling.txt",'r')
for each in f.readlines():
    data = json.loads(each)
    stat[data['group']] = {'members': data['member'], 'post': data['post']}

In [455]:
# Load unique poster stats data
stat_unique = {}
with open("unique_poster.txt") as json_file:
    data = json.load(json_file)
for key, value in data.items():
    stat_unique[key] = {'unique_poster': value}

In [456]:
# merge stats and all ratio metrics
profiling = {}
for group, files in whole_dict.items():
    negation_count = 0
    uncertainty_count = 0
    severity_count = 0
    dynamic_count = 0
    count = Counter()
    for each in files['entry'].values():
        negation_count += each['negation_count']
        uncertainty_count += each['uncertainty_count']
        severity_count += each['severity_count']
        dynamic_count += each['dynamic_count']
        for word in each['SEV_Of']:
            p = re.compile('(.*)->\w+')
            count[p.findall(word)[0]] += 1
    profiling[group] = {}
    profiling[group].update(stat[group])
    profiling[group].update(stat_unique[group])
    profiling[group]['severity_top'] = count.most_common(1)[0][0]
    profiling[group]['negation_ratio'] = negation_count/profiling[group]['post']
    profiling[group]['uncertainty_ratio'] = uncertainty_count/profiling[group]['post']
    profiling[group]['severity_ratio'] = severity_count/profiling[group]['post']
    profiling[group]['dynamic_ratio'] = dynamic_count/profiling[group]['post']
    profiling[group].update(whole_dict[group]['annotation'])
df = pd.read_json(json.dumps(profiling), orient='index')
df = df[['members', 'unique_poster','post','severity_top','severity_ratio', 'uncertainty_ratio', 'dynamic_ratio', 'negation_ratio', 'UNC', 'COU', 'CON', 'NEG']]

# Statistical profiling of groups in terms of Linguistics

In [445]:
df = pd.read_csv('groups_profiling.csv')
df.head(30)

# df.sort_values(by=['dynamic_ratio' ], ascending=False)

Unnamed: 0.1,Unnamed: 0,members,unique_poster,post,severity_top,severity_ratio,uncertainty_ratio,dynamic_ratio,negation_ratio
0,Abdominal_Disorders,6468,4950,40915,severe,0.044287,0.099401,0.024612,0.080288
1,Abscess_Non_dental,409,223,1095,severe,0.03379,0.109589,0.027397,0.115068
2,Accidents_and_Injuries,1801,1170,7246,severe,0.028982,0.060033,0.025393,0.067486
3,ACE_Inhibitors,229,82,352,severe,0.042614,0.068182,0.019886,0.065341
4,Acne,654,273,1127,severe,0.047028,0.044366,0.023957,0.047915
5,Adrenal_Disorders,501,324,2204,severe,0.032214,0.104809,0.028584,0.077586
6,Alcohol_Consumption,1529,1191,32247,severe,0.010265,0.02577,0.008652,0.036593
7,Alendronic_Acid,278,138,1522,severe,0.026938,0.052562,0.023653,0.061761
8,Allergic_Disorders,646,335,1472,severe,0.030571,0.10462,0.034647,0.080163
9,Alopecia_and_Hair_Disorders,528,280,1200,severe,0.015,0.064167,0.018333,0.0525


In [438]:
df.to_csv('groups_profiling.csv')

# WordCould of Specified Details

In [398]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
stopwords = set(STOPWORDS)
stopwords.update(["problems","problem", "not", 'without', 'never','pain', 'symptom', 'symptoms', 'issue', 'issues', 'increase', 'improve', 'worse', 'reduce', 'low', 'slightly', 'severe', 'lower', 'bad', 'damage', 'drop', 'less', 'much', ''])
import matplotlib.pyplot as plt

In [399]:
def collect(whole_dict, target_groups, concept):
    text = ""
    for each in whole_dict[target_groups]['annotation'][concept]:
        filtered_each = re.findall('(.*)->(.*)', each)
        text += ' ' + filtered_each[0][1]
    return text

In [None]:
target_group = 'Abdominal_Disorders'
target_group = 'Hip_Replacement'
target_group = 'Knee_Problems'

target_groups = whole_dict.keys() 
for target_group in target_groups:
    target_concepts = ['UNC','COU', 'CON','NEG']
    fig, _axs = plt.subplots(nrows=2, ncols=2, figsize=(50,25))
    axs = _axs.flatten()
    for index, each in enumerate(target_concepts):
        text = collect(whole_dict, target_group, each)

        wordcloud = WordCloud(stopwords=stopwords, background_color="black").generate(text)
        axs[index].set_title(target_group + ' : ' + each, fontsize=50)
        axs[index].imshow(wordcloud, interpolation='bilinear')
        axs[index].axis("off")
    fig.savefig('WordCould_of_Groups_in_Linguistics/'+ target_group + '.png')
# plt.show()



# New corpus for annotation

In [164]:
def isDelays(sentence):
    if len(sentence) < 100 and ('?' in sentence or ' if ' in sentence):
        return False
    if ' wait ' in sentence \
    or ' delay ' in sentence \
    or ' slow ' in sentence \
    or ' for so long ' in sentence \
    or ' be so long ' in sentence \
    or ' take so long ' in sentence \
    or ' long time ' in sentence \
    or ' longer ' in sentence \
    or 'postpone' in sentence \
    or ' reschedule ' in sentence \
    or ' numerous time' in sentence \
    or ' quicker ' in sentence \
    or '  take forever' in sentence \
    or ' prolong ' in sentence:
        return True
    if (' ridiculous ' in sentence 
        or ' unacceptable 'in sentence) and \
    ('months' in sentence \
     or 'days' in sentence \
     or 'weeks' in sentence \
     or 'hours' in sentence \
     or 'years' in sentence \
     or 'month' in sentence \
     or 'day' in sentence \
     or 'week' in sentence \
     or 'hour' in sentence \
     or 'year' in sentence):
        return True
    return False

def isCosts(sentence):
    if len(sentence) < 100 and ('?' in sentence or ' if ' in sentence):
        return False
    if ' cost ' in sentence \
    or ' costly ' in sentence \
    or ' pay for ' in sentence \
    or 'payment' in sentence \
    or 'afford' in sentence \
    or 'expensive' in sentence \
    or 'cheap' in sentence \
    or ' money ' in sentence \
    or 'limit' in sentence \
    or ' overprice ' in sentence \
    or ' price' in sentence \
    or ' copay' in sentence \
    or ' bill ' in sentence \
    or ' co-pay ' in sentence \
    or ' fee ' in sentence \
    or ' charge ' in sentence \
    or ' deductible ' in sentence \
    or ' premium ' in sentence \
    or 'out-of-network' in sentence \
    or 'out of network' in sentence \
    or 'in-network' in sentence \
    or 'in network' in sentence:
        return True
    return False

def isAccess(sentence):
    if len(sentence) < 100 and ('?' in sentence or ' if ' in sentence):
        return False
    if ' access to ' in sentence \
    or ' be refuse ' in sentence \
    or ' accept new patient ' in sentence \
    or ' be cancel ' in sentence \
    or 'consultation' in sentence \
    or 'registration' in sentence \
    or 'shortage' in sentence \
    or ' crowd ' in sentence \
    or ' reschedule ' in sentence \
    or 'cancellation' in sentence \
    or ' ward ' in sentence \
    or 'available' in sentence \
    or 'reject' in sentence \
    or 'to service' in sentence \
    or 'admission' in sentence \
    or 'out-of-network' in sentence \
    or 'out of network' in sentence \
    or 'in-network' in sentence \
    or 'in network' in sentence \
    or 'limitation' in sentence \
    or ' nearby hospital' in sentence \
    or ' nearby clinic ' in sentence:
        return True
    if ' limit ' in sentence \
    and 'access' in sentence:
        return True
    if ' treat' in sentence \
    and ' refus' in sentence:
        return True
    return False

     
def isErrors(sentence):
    if len(sentence) < 100 and ('?' in sentence or ' if ' in sentence):
        return False
    if ' inept ' in sentence and 'doctor' in sentence:
        return True
    if ' miss diagnosis ' in sentence \
    or ' mis-diagnosis ' in sentence \
    or ' misdiagnose ' in sentence \
    or ' misdiagnosis ' in sentence \
    or ' clerical ' in sentence \
    or ' erroneous ' in sentence \
    or ' wrong disease ' in sentence \
    or ' typo ' in sentence \
    or ' wrong medi' in sentence \
    or ' medical accident' in sentence \
    or ' incorrect ' in sentence \
    or ' inconclusive ' in sentence \
    or ' fault ' in sentence \
    or ' mislead ' in sentence \
    or ' examination ' in sentence \
    or ' incompetent ' in sentence \
    or 'judgement' in sentence \
    or 'judgment' in sentence \
    or ' miss ' in sentence:
        return True
    if ' not work ' in sentence:
        return True
    if ('make' and 'mistake' in sentence) \
    or ' be a mistake' in sentence:
        return True
    return False

def isTreatments(sentence):
    if len(sentence) < 100 and ('?' in sentence or ' if ' in sentence):
        return False
    if ' get better ' in sentence \
    and ' you ' not in sentence \
    and 'hope' not in sentence \
    and 'life ' not in sentence \
    and ' will ' not in sentence \
    and '?' not in sentence \
    and ' if ' not in sentence \
    and ' to get better ' not in sentence \
    and ' can get better ' not in sentence \
    and ' it ll ' not in sentence \
    and 'i' in sentence:
        return True
    if ' feel better ' in sentence \
    and ' you ' not in sentence \
    and 'hope' not in sentence \
    and ' will ' not in sentence \
    and '?' not in sentence \
    and ' if ' not in sentence \
    and ' can feel better ' not in sentence \
    and ' would ' not in sentence \
    and ' should feel better' not in sentence \
    and 'i' in sentence:
        return True
    if ' get worse ' in sentence \
    and ' you ' not in sentence \
    and 'hope' not in sentence \
    and 'life ' not in sentence \
    and ' will ' not in sentence \
    and '?' not in sentence \
    and ' if ' not in sentence \
    and ' to get worse ' not in sentence \
    and ' can get worse ' not in sentence \
    and ' it ll ' not in sentence \
    and 'i' in sentence:
        return True
    if ' feel worse ' in sentence \
    and ' you ' not in sentence \
    and 'hope' not in sentence \
    and ' will ' not in sentence \
    and '?' not in sentence \
    and ' if ' not in sentence \
    and ' can feel worse ' not in sentence \
    and ' would ' not in sentence \
    and ' should feel worse' not in sentence \
    and 'i' in sentence:
        return True

    if ' treatment ' in sentence \
    and 'hope' not in sentence:
        return True

    if ' be never end ' in sentence \
    or ' relapse ' in sentence \
    or ' still sick' in sentence \
    or ' still ill' in sentence \
    or ' recurrent ' in sentence \
    or ' constant pain ' in sentence \
    or ' be cure ' in sentence \
    or ' assessment ' in sentence \
    or ' rehabilitation ' in sentence \
    or ' good therapy ' in sentence \
    or ' great therapy ' in sentence \
    or ' excellent therapy ' in sentence \
    or ' prognosis ' in sentence \
    or (' therapy '  in sentence and ' not work '  in sentence):
        return True
    return False



def isStaffAndTrust(sentence):
    if len(sentence) < 100 and ('?' in sentence or ' if ' in sentence):
        return False
    # staff attitude and patient trust
    if 'manager' in sentence \
    or 'staff' in sentence \
    or ' nurse' in sentence \
    or 'therapist' in sentence \
    or 'attitude' in sentence \
    or 'unethical' in sentence \
    or 'ethical' in sentence \
    or ' be care' in sentence \
    or 'uncaring' in sentence \
    or 'obnoxious' in sentence \
    or 'lovely' in sentence \
    or 'dismissive' in sentence \
    or ' so kind' in sentence \
    or 'cruel' in sentence \
    or 'disrespectful' in sentence \
    or ' respectful ' in sentence \
    or ' empathetic ' in sentence \
    or ' empathy ' in sentence \
    or ' sympathy ' in sentence \
    or 'sympathetic' in sentence \
    or ' trust' in sentence \
    or 'lack of concern' in sentence \
    or ' rude' in sentence \
    or ' dismiss me' in sentence \
    or 'lack of respect' in sentence \
    or 'polite' in sentence \
    or ' uncaring ' in sentence \
    or ' manner' in sentence \
    or 'so mean' in sentence \
    or 'aggressive' in sentence \
    or 'insensitive' in sentence \
    or 'neglect' in sentence \
    or 'lack of understand' in sentence \
    or 'confidence' in sentence \
    or 'privacy' in sentence:
        return True
    if ' be understand ' in sentence \
    and 'husband' not in sentence \
    and 'wife' not in sentence \
    and 'employer' not in sentence \
    and 'boyfriend' not in sentence \
    and 'girlfriend' not in sentence \
    and 'i be understand' not in sentence \
    and 'family' not in sentence:
        return True

    # staff’s professional skills and conduct
    if ' skill' in sentence \
    or 'treat well' in sentence \
    or 'treat bad' in sentence \
    or 'unclear' in sentence \
    or ' good treat ' in sentence \
    or ' great treat ' in sentence \
    or ' poor treat ' in sentence \
    or ' bad treat ' in sentence \
    or ' worst treat ' in sentence \
    or ' good care' in sentence \
    or ' great care' in sentence \
     or ' excellent care' in sentence \
    or ' poor care' in sentence \
    or ' bad care' in sentence \
    or ' inept ' in sentence \
    or 'poor communication' in sentence \
    or ' expertise' in sentence \
    or 'fob off' in sentence \
    or 'helpful' in sentence \
    or 'unprofessional' in sentence \
    or 'shout at' in sentence \
    or 'irresponsible' in sentence \
    or 'unqualified' in sentence \
    or 'qualify' in sentence \
    or 'lack of train' in sentence \
    or 'no train' in sentence \
    or 'incapable' in sentence \
    or 'unwilling' in sentence \
    or 'corrupt' in sentence \
    or 'ignore' in sentence \
    or 'mislead' in sentence \
    or 'lack of knowledge' in sentence \
    or ' bad behavi' in sentence \
    or 'not listen' in sentence \
    or 'inappropriate' in sentence \
    or 'dedicate' in sentence \
    or 'very professional' in sentence \
    or 'so professional' in sentence:
        return True
    if ' look after me' in sentence \
    and 'friend' not in sentence \
    and 'husband' not in sentence \
    and 'wife' not in sentence:
        return True
    return False

def hasCourse(sentence, cou_realtions):
    if len(cou_realtions) > 0 \
    and 'hope' not in sentence \
    and '?' not in sentence \
    and ' will ' not in sentence \
    and ' if ' not in sentence \
    and ' you ' not in sentence:
        return True
    return False

In [165]:
def parseSentense(lines, root, text):
    sen_index = []
    treatment_index = []
    test_index = []
    cou_of_index = []
    sentence_split = []
    treatments = []
    tests = []
    cou_of = []
    for line in lines:
        if line[:11] == 'NamedEntity' and line[31:40] == 'treatment':
            treatment_index.append([int(line.split("\t")[1]), int(line.split("\t")[2]), line.split("\t")[7][3:].strip()])
        if line[:11] == 'NamedEntity' and line[31:35] == 'test':
            test_index.append([int(line.split("\t")[1]), int(line.split("\t")[2]), line.strip().split("\t")[7][3:].strip()])
        if line[:8] == 'Sentence':
            sen_index.append([int(line.split("\t")[1]), int(line.split("\t")[2])])
    
    entity_dict = {}
    for child in root:
        if 'ClampNameEntityUIMA' in child.tag and child.attrib['semanticTag'] in set(["COU", "problem"]):
            entity_dict[child.attrib['{http://www.omg.org/XMI}id']] = {"begin-end": (int(child.attrib['begin']), int(child.attrib['end'])), "Semantic": child.attrib['semanticTag']}
            
        if 'ClampRelationUIMA' in child.tag and child.attrib['semanticTag'] == "COU_Of":
            cou_of_index.append((entity_dict[child.attrib['entTo']]["begin-end"][0],
                                entity_dict[child.attrib['entTo']]["begin-end"][1], 
                                entity_dict[child.attrib['entFrom']]["begin-end"][0],
                                entity_dict[child.attrib['entFrom']]["begin-end"][1]))
    for index in sen_index:
        sentence_split.append(text[index[0]:index[1]])
        treatment = []
        for t_index in treatment_index:
            if t_index[0] >= index[0] and t_index[1] <= index[1]:
                treatment.append(t_index[2])
        treatments.append(treatment)
                                
        test = []
        for te_index in test_index:
            if te_index[0] >= index[0] and te_index[1] <= index[1]:
                test.append(te_index[2])
        tests.append(test)
        
        cou = []
        for cou_index in cou_of_index:
            if cou_index[0] >= index[0] \
            and cou_index[1] <= index[1] \
            and cou_index[2] >= index[0] \
            and cou_index[3] <= index[1]:
                cou.append(text[cou_index[0]:cou_index[1]] + '->' + text[cou_index[2]:cou_index[3]])
        cou_of.append(cou)
    return (sentence_split, treatments, tests, cou_of)

In [166]:
raw_text_path = "/Volumes/E/ClampCmd_1.5.1/input"
entity_annotation_path = "/Volumes/E/ClampCmd_1.5.1/output"
sentense_dic  = []
countforannotation = {'isDelays': 0, 'isCosts': 0, 'isAccess': 0, 'isErrors': 0, 'isTreatments': 0, 'isStaffAndTrust': 0}

for files in os.listdir(raw_text_path):
    if '.' not in files:
        print(files)
        for filename in os.listdir(raw_text_path + '/' + files):
            if filename[-4:] == ".txt":
                text_f = open(raw_text_path + '/' + files + '/' + filename, 'r')
                endIndex = re.search('-',filename).span()[0]
                group = filename[:endIndex]
                text = text_f.readline()
                text_f.close()
            
                anno_f = open(entity_annotation_path + '/' + filename[0] + "/" + filename, 'r')
                tree = ET.parse(entity_annotation_path + '/' + filename[0] + "/" + filename[:-4] + '.xmi')
                root = tree.getroot()
                sentences, treatments, test, cou_realtions = parseSentense(anno_f.readlines(), root, text)
                anno_f.close()
                count = 0
                for sen, treat, te, cou_realtion in zip(sentences, treatments, test, cou_realtions):
                    count += 1
                    trainOrtest = 'test'
                    train_class = []
                    if isDelays(sen):
                        countforannotation['isDelays'] += 1
                        train_class.append(1)
                        trainOrtest = 'train'
                    if isCosts(sen):
                        countforannotation['isCosts'] += 1
                        train_class.append(2)
                        trainOrtest = 'train'
                    if isAccess(sen):
                        countforannotation['isAccess'] += 1
                        train_class.append(3)
                        trainOrtest = 'train'
                    if isErrors(sen):
                        countforannotation['isErrors'] += 1
                        train_class.append(4)
                        trainOrtest = 'train'
                    if isTreatments(sen) or hasCourse(sen, cou_realtion):
                        countforannotation['isTreatments'] += 1
                        train_class.append(5)
                        trainOrtest = 'train'
                    if isStaffAndTrust(sen):
                        countforannotation['isStaffAndTrust'] += 1
                        train_class.append(6)
                        trainOrtest = 'train'
                    sentense_dic.append({'group': group, 'id': filename[:-4] + '-' +str(count), 'text': sen, 'treatment': treat, 'couse_of_problem': cou_realtion, 'test': te, 'trainOrtest': trainOrtest, 'aspect': train_class})


A
B
C
D
E
F
G
H
I
K
L
M
N
O
P
Q
R
S
T
U
V
W


In [172]:
whole_df = pd.DataFrame(sentense_dic)

In [173]:
whole_df.head()

Unnamed: 0,aspect,couse_of_problem,group,id,test,text,trainOrtest,treatment
0,[],[],Abdominal_Disorders,Abdominal_Disorders--12291-0-1,[],i be 34 have have a stoma since 28.,test,[]
1,[],[],Abdominal_Disorders,Abdominal_Disorders--12291-0-2,[],it have ruin my life and my life be over .,test,[]
2,[],[],Abdominal_Disorders,Abdominal_Disorders--12291-0-3,[],there be no help for us we be on our own compl...,test,[]
3,[],[],Abdominal_Disorders,Abdominal_Disorders--12291-1-1,[],it be the embarrasment or other people embarra...,test,[]
4,[],[],Abdominal_Disorders,Abdominal_Disorders--12291-1-2,[],etc .,test,[]


In [174]:
whole_df.to_csv('whole_sentence_level.csv')

In [None]:
read_csv()

In [167]:
print('Total sentences to be annotaed:', sum(countforannotation.values()))
countforannotation

Total sentences to be annotaed: 582358


{'isDelays': 153881,
 'isCosts': 63945,
 'isAccess': 29360,
 'isErrors': 52332,
 'isTreatments': 135823,
 'isStaffAndTrust': 147017}

In [169]:
isDelays_ = []
isCosts_ = []
isAccess_ = []
isErrors_ = []
isTreatments_ = []
isStaffAndTrust_ = []
for each in sentense_dic:
    if 1 in each['aspect']:
        isDelays_.append(each)
    if 2 in each['aspect']:
        isCosts_.append(each)
    if 3 in each['aspect']:
        isAccess_.append(each)
    if 4 in each['aspect']:
        isErrors_.append(each)
    if 5 in each['aspect']:
        isTreatments_.append(each)
    if 6 in each['aspect']:
        isStaffAndTrust_.append(each)
        
isDelays_df = pd.DataFrame(isDelays_)
isCosts_df = pd.DataFrame(isCosts_)
isAccess_df = pd.DataFrame(isAccess_)  
isErrors_df = pd.DataFrame(isErrors_)  
isTreatments_df = pd.DataFrame(isTreatments_)  
isStaffAndTrust_df = pd.DataFrame(isStaffAndTrust_) 

isDelays_df['sub_aspect'] = ''
isDelays_df['sentiment'] = ''
isCosts_df['sub_aspect'] = ''
isCosts_df['sentiment'] = ''
isAccess_df['sub_aspect'] = ''
isAccess_df['sentiment'] = ''
isErrors_df['sub_aspect'] = ''
isErrors_df['sentiment'] = ''
isTreatments_df['sub_aspect'] = ''
isTreatments_df['sentiment'] = ''
isStaffAndTrust_df['sub_aspect'] = ''
isStaffAndTrust_df['sentiment'] = ''

isDelays_df_rest, isDelays_df_sample = train_test_split(isDelays_df, test_size=0.1, random_state=1)
isCosts_df_rest1, isCosts_df_sample = train_test_split(isCosts_df, test_size=0.1, random_state=1)
isAccess_df_rest, isAccess_df_sample = train_test_split(isAccess_df, test_size=0.1, random_state=1)
isErrors_df_rest, isErrors_df_sample = train_test_split(isErrors_df, test_size=0.1, random_state=1)
isTreatments_df_rest, isTreatments_df_sample = train_test_split(isTreatments_df, test_size=0.1, random_state=1)
isStaffAndTrust_df_rest, isStaffAndTrust_df_sample = train_test_split(isStaffAndTrust_df, test_size=0.1, random_state=1)

isDelays_df_sample.to_csv('isDelays_sample.csv', index=False)
isCosts_df_sample.to_csv('isCosts_sample.csv', index=False)
isAccess_df_sample.to_csv('isAccess_sample.csv', index=False)
isErrors_df_sample.to_csv('isErrors_sample.csv', index=False)
isTreatments_df_sample.to_csv('isTreatments_sample.csv', index=False)
isStaffAndTrust_df_sample.to_csv('isStaffAndTrust_sample.csv', index=False)

# isDelays_df_rest.to_csv('isDelays_rest.csv', index=False)
# isCosts_df_rest.to_csv('isCosts_rest.csv', index=False)
# isAccess_df_rest.to_csv('isAccess_rest.csv', index=False)
# isErrors_df_rest.to_csv('isErrors_rest.csv', index=False)
# isTreatments_df_rest.to_csv('isTreatments_rest.csv', index=False)
# isStaffAndTrust_df_rest.to_csv('isStaffAndTrust_rest.csv', index=False)

In [50]:
n = len(isDelays_df_sample)
batch = (int) (n / 10)
isDelays_df_sample_tenFold = pd.DataFrame()
for i in range(1, 11):
    batch_df = isDelays_df_sample[batch * (i - 1):batch * i]
    isDelays_df_sample_tenFold['group-' + str(i)] = list(batch_df['group'])
    isDelays_df_sample_tenFold['id-' + str(i)] = list(batch_df['id'])
    isDelays_df_sample_tenFold['text-' + str(i)] = list(batch_df['text'])
    isDelays_df_sample_tenFold['trainOrtest-' + str(i)] = list(batch_df['trainOrtest'])

In [80]:
len(isDelays_df_rest)

142617

In [None]:
sentence_df = pd.read_json(json.dumps(sentense_dic), orient='records')

In [712]:
sentense_dic

8389152