## Load Required Packages & Tools...

In [1]:
import nltk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statistics

In [3]:
!pip install mendelai-brat-parser

Collecting mendelai-brat-parser
  Downloading mendelai_brat_parser-0.0.11.tar.gz (4.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: mendelai-brat-parser
  Building wheel for mendelai-brat-parser (setup.py) ... [?25l[?25hdone
  Created wheel for mendelai-brat-parser: filename=mendelai_brat_parser-0.0.11-py3-none-any.whl size=4920 sha256=ebe3aabbe9365e1df9e88c60d0c0e30491a358fb93a39867d16a26e51158613f
  Stored in directory: /root/.cache/pip/wheels/66/6d/54/75653ad4624c60f22e21173848881fdf2af79baafab35e13a9
Successfully built mendelai-brat-parser
Installing collected packages: mendelai-brat-parser
Successfully installed mendelai-brat-parser-0.0.11


In [4]:
from brat_parser import get_entities_relations_attributes_groups

## Read in Training & Test Datasets...

In [5]:
# Mount Google Drive (Colab only) so the project folder is reachable from the runtime.
from google.colab import drive
drive.mount('/content/drive')

# Project root on Drive; reused further down when the datasets are written back out.
wdir = '/content/drive/MyDrive/BMI_6330-Natural_Language_Processing'

# This notebook saves the CSVs with DataFrame.to_csv() including the row index, which
# comes back as an extra "Unnamed: 0" column on re-read. errors="ignore" keeps the cell
# runnable when a file was saved without that index column.
train = pd.read_csv(wdir+"/TRAIN.csv").drop(columns="Unnamed: 0", errors="ignore")
test = pd.read_csv(wdir+"/TEST.csv").drop(columns="Unnamed: 0", errors="ignore")

Mounted at /content/drive


In [6]:
# Report how many posts are in each split.
print(f"Number of Posts in Training Set: {len(train)} + Number of Posts in Test Set: {len(test)}")

Number of Posts in Training Set: 1600 + Number of Posts in Test Set: 401


In [7]:
# Preview the first five rows of the training frame as loaded from disk.
train.head(n=5)

Unnamed: 0,text,Non_Medical_Use,Withdrawl,Tolerance,Entities
0,this might seem like a silly post - but i've o...,0,0,0,"{'T1': Entity(id='T1', type='ADHD', span=((499..."
1,i've been off of it for over 3 years (personal...,1,0,0,"{'T1': Entity(id='T1', type='Adderall', span=(..."
2,i’m so confused! please has anyone felt this? ...,0,1,0,{}
3,"i’m on 5mg, but will take 10mg when needed in ...",0,0,0,"{'T1': Entity(id='T1', type='Anxiety', span=((..."
4,"hey all,\nso i recently got diagnosed with adh...",0,0,0,"{'T1': Entity(id='T1', type='ADHD', span=((41,..."


In [8]:
# Preview the first five rows of the test frame as loaded from disk.
test.head(n=5)

Unnamed: 0,text,Non_Medical_Use,Withdrawl,Tolerance,Entities
0,i have one pill left. \ni should have gone las...,0,0,0,{}
1,my health insurance doesn't really do anything...,0,0,0,{}
2,\ni’ve been sick for a week so the next time i...,0,0,1,"{'T1': Entity(id='T1', type='ADHD', span=((71,..."
3,so i’m going to the doctors monday because i’v...,0,0,0,"{'T1': Entity(id='T1', type='Adderall', span=(..."
4,hi guys thanks for taking the time to read thi...,1,0,0,"{'T1': Entity(id='T1', type='Adderall', span=(..."


## Add Classification Data to Training and Test Sets...

In [43]:
# Label lists, one string per class: each holds the ", "-separated post numbers that
# are positive for that class. They are passed to get_indices(), which converts them
# into row positions for the train and test frames.
# NOTE(review): the numbers appear to use the combined 0-2000 post numbering (before the
# train/test split) — confirm against the annotation source.
# "withdrawl" (sic) is kept because the DataFrame column "Withdrawl" uses the same spelling.
abuse = '1014, 1018, 1034, 1056, 1082, 1098, 1102, 1103, 1109, 1119, 1123, 1137, 1140, 1156, 1168, 1179, 1186, 1187, 1190, 1197, 1205, 1206, 1210, 1213, 1214, 1224, 1228, 1229, 1241, 1254, 1301, 1304, 1307, 1310, 1312, 1313, 1317, 132, 1320, 1331, 1334, 1338, 1353, 1367, 1410, 1429, 143, 1442, 1449, 1452, 1461, 1471, 1484, 1502, 1509, 152, 1523, 1528, 1533, 1534, 155, 1554, 1573, 1590, 1602, 1610, 1619, 1632, 1643, 1644, 1645, 1675, 1682, 1688, 1695, 1697, 1698, 1700, 1701, 1707, 1708, 1709, 1712, 1713, 1714, 1750, 1777, 1784, 1797, 1803, 1832, 1839, 1863, 1873, 1877, 1878, 1886, 190, 1901, 1906, 1916, 1926, 1934, 1936, 1941, 1942, 1960, 1965, 1967, 1970, 990, 989, 984, 972, 969, 966, 963, 949, 93, 925, 920, 916, 911, 906, 898, 895, 893, 891, 865, 84, 839, 826, 814, 813, 805, 798, 785, 77, 767, 766, 720, 717, 674, 657, 630, 586, 569, 552, 533, 498, 466, 463, 417, 408, 407, 402, 4, 36, 345, 326, 307, 298, 254, 247, 21'
withdrawl = '1019, 1049, 106, 1081, 1124, 1140, 1151, 1167, 1171, 1188, 1202, 1214, 1230, 1231, 125, 1278, 1283, 1286, 1302, 1365, 1376, 1385, 1386, 1391, 1398, 1408, 1410, 1419, 1451, 1501, 1502, 1507, 1510, 1512, 1519, 1533, 1537, 1538, 1548, 1554, 156, 1569, 157, 1619, 166, 1660, 1699, 1720, 173, 1751, 1758, 1801, 1805, 1809, 183, 1839, 1855, 1873, 1898, 19, 1904, 1911, 1922, 1933, 1941, 195, 1965, 1975, 1977, 1991, 1992, 1993, 984, 983, 975, 97, 967, 955, 939, 934, 932, 93, 893, 878, 860, 833, 826, 818, 814, 813, 81, 8, 776, 756, 750, 738, 68, 657, 643, 641, 635, 632, 61, 536, 535, 529, 509, 424, 423, 406, 403, 391, 39, 37, 317, 316, 267, 253, 248, 242, 241, 233, 215'
tolerance = '102, 1018, 1040, 1043, 106, 109, 1092, 1113, 1152, 1172, 1182, 1186, 119, 1217, 122, 1258, 1260, 1262, 1263, 1323, 1338, 1342, 1352, 1365, 1376, 1384, 1423, 1449, 1514, 1538, 1539, 1543, 1572, 1627, 1631, 165, 1668, 169, 1810, 1819, 1874, 1881, 1898, 1973, 1975, 1994, 2, 997, 995, 975, 963, 817, 813, 808, 807, 798, 761, 708, 695, 66, 657, 62, 617, 555, 526, 508, 50, 459, 442, 263, 247, 227, 217, 210'

In [41]:
# Post numbering used for the split: posts 0-400 (401 posts) are the test set,
# posts 401-2000 (1600 posts) are the training set.
test_indices = [*range(401)]
train_indices = [*range(401, 2001)]

def get_indices(data_str, n_test=401):
  """Split a comma-separated string of post numbers into train/test row positions.

  Post numbers in ``0 .. n_test - 1`` belong to the test set and are returned
  unchanged. Every other number is treated as a training post and is shifted
  down by ``n_test`` so that it indexes the training frame from 0.

  Args:
    data_str: Comma-separated post numbers, e.g. ``"4, 1014, 21"``. Whitespace
      around each number is ignored; an empty or blank string yields two empty
      lists.
    n_test: Number of posts in the test set. Defaults to 401, the size of the
      split used in this notebook.

  Returns:
    A tuple ``(train_idx, test_idx)`` of two ascending lists of row positions.

  Raises:
    ValueError: If a non-empty token cannot be parsed as an integer.
  """
  # Split on "," and let int() absorb surrounding spaces; empty tokens are skipped
  # so "" and trailing commas parse cleanly.
  data = [int(tok) for tok in data_str.split(',') if tok.strip()]

  # Range membership is O(1) and keeps the function independent of notebook globals.
  test_range = range(n_test)

  test_idx = sorted(num for num in data if num in test_range)
  # Shift training post numbers to 0-based positions (401-2000 -> 0-1599 by default).
  train_idx = sorted(num - n_test for num in data if num not in test_range)

  return (train_idx, test_idx)

In [44]:
# Each result is the (train_positions, test_positions) pair returned by get_indices
# for one label string.
abuse_indices, withdrawl_indices, tolerance_indices = (
  get_indices(label_str) for label_str in (abuse, withdrawl, tolerance)
)

In [56]:
def edit_cols(col, indices):
  """Mark the positions listed in ``indices`` with 1.

  Args:
    col: Mutable sequence of labels (e.g. a list of zeros). It is modified in
      place.
    indices: Iterable of integer positions to set to 1. Positions outside
      ``0 .. len(col) - 1`` (including negative ones) are ignored.

  Returns:
    The same ``col`` object, after modification.
  """
  # A set gives O(1) membership instead of rescanning the index list for every row.
  wanted = set(indices)
  for i in range(len(col)):
    if i in wanted:
      col[i] = 1
  return col

In [53]:
# Start every label column as all zeros, one entry per post; each name gets its own list.
test_abuse, test_withdrawl, test_tolerance = ([0] * len(test) for _ in range(3))
train_abuse, train_withdrawl, train_tolerance = ([0] * len(train) for _ in range(3))


In [60]:
# Each *_indices value is a (train_positions, test_positions) pair:
# element 0 feeds the training labels, element 1 the test labels.

# Non-medical use
train_abuse = edit_cols(train_abuse, abuse_indices[0])
test_abuse = edit_cols(test_abuse, abuse_indices[1])

# Withdrawal
train_withdrawl = edit_cols(train_withdrawl, withdrawl_indices[0])
test_withdrawl = edit_cols(test_withdrawl, withdrawl_indices[1])

# Tolerance
train_tolerance = edit_cols(train_tolerance, tolerance_indices[0])
test_tolerance = edit_cols(test_tolerance, tolerance_indices[1])

In [61]:
# Attach the three binary label columns to each frame.
train = train.assign(
  Non_Medical_Use=train_abuse,
  Withdrawl=train_withdrawl,
  Tolerance=train_tolerance,
)

test = test.assign(
  Non_Medical_Use=test_abuse,
  Withdrawl=test_withdrawl,
  Tolerance=test_tolerance,
)

In [62]:
# Preview the first five training rows with the label columns attached.
train.head(n=5)

Unnamed: 0,text,Non_Medical_Use,Withdrawl,Tolerance
0,this might seem like a silly post - but i've o...,0,0,0
1,i've been off of it for over 3 years (personal...,1,0,0
2,i’m so confused! please has anyone felt this? ...,0,1,0
3,"i’m on 5mg, but will take 10mg when needed in ...",0,0,0
4,"hey all,\nso i recently got diagnosed with adh...",0,0,0


In [63]:
# Preview the first five test rows with the label columns attached.
test.head(n=5)

Unnamed: 0,text,Non_Medical_Use,Withdrawl,Tolerance
0,i have one pill left. \ni should have gone las...,0,0,0
1,my health insurance doesn't really do anything...,0,0,0
2,\ni’ve been sick for a week so the next time i...,0,0,1
3,so i’m going to the doctors monday because i’v...,0,0,0
4,hi guys thanks for taking the time to read thi...,1,0,0


In [64]:
# Write the labelled frames back over the files they were loaded from.
# The row index is written as well, which is why the load cell drops "Unnamed: 0".
train.to_csv(f"{wdir}/TRAIN.csv")
test.to_csv(f"{wdir}/TEST.csv")

## Load Annotation Data from Brat & Save to Datasets...

In [72]:
# Directories holding the brat .ann files for each split.
# NOTE(review): the test folder is named FINAL_TEST_400 but the loader reads 401 files
# (post numbers 0-400) — confirm the folder contents.
train_loc = (
  '/content/drive/MyDrive/BMI_6330-Natural_Language_Processing'
  '/brat_analysis/FINAL_TRAIN_1600'
)
test_loc = (
  '/content/drive/MyDrive/BMI_6330-Natural_Language_Processing'
  '/heuristic_analysis/FINAL_TEST_400'
)

In [79]:
# Read one brat annotation file per training post (post numbers 401-2000) and keep
# only its entities; list order matches the post order.
train_indices = [*range(401, 2001)]
train_entities = []

for idx in train_indices:
  ann_path = f"{train_loc}/trainTN_{idx}.ann"
  entities, relations, attributes, groups = get_entities_relations_attributes_groups(ann_path)
  train_entities.append(entities)

In [89]:
# Read one brat annotation file per test post (post numbers 0-400) and keep only its
# entities; list order matches the post order.
# NOTE(review): test files are read with the same 'trainTN_' prefix as the training
# files — confirm that this matches the file names on disk.
test_indices = [*range(401)]
test_entities = []

for idx in test_indices:
  ann_path = f"{test_loc}/trainTN_{idx}.ann"
  entities, relations, attributes, groups = get_entities_relations_attributes_groups(ann_path)
  test_entities.append(entities)

In [87]:
# Store each post's parsed entity mapping in an "Entities" column (one value per row).
train = train.assign(Entities=train_entities)
test = test.assign(Entities=test_entities)

In [88]:
# Preview the first five training rows with the Entities column attached.
train.head(n=5)

Unnamed: 0,text,Non_Medical_Use,Withdrawl,Tolerance,Entities
0,this might seem like a silly post - but i've o...,0,0,0,"{'T1': Entity(id='T1', type='ADHD', span=((499..."
1,i've been off of it for over 3 years (personal...,1,0,0,"{'T1': Entity(id='T1', type='Adderall', span=(..."
2,i’m so confused! please has anyone felt this? ...,0,1,0,{}
3,"i’m on 5mg, but will take 10mg when needed in ...",0,0,0,"{'T1': Entity(id='T1', type='Anxiety', span=((..."
4,"hey all,\nso i recently got diagnosed with adh...",0,0,0,"{'T1': Entity(id='T1', type='ADHD', span=((41,..."


In [90]:
# Preview the first five test rows with the Entities column attached.
test.head(n=5)

Unnamed: 0,text,Non_Medical_Use,Withdrawl,Tolerance,Entities
0,i have one pill left. \ni should have gone las...,0,0,0,{}
1,my health insurance doesn't really do anything...,0,0,0,{}
2,\ni’ve been sick for a week so the next time i...,0,0,1,"{'T1': Entity(id='T1', type='ADHD', span=((71,..."
3,so i’m going to the doctors monday because i’v...,0,0,0,"{'T1': Entity(id='T1', type='Adderall', span=(..."
4,hi guys thanks for taking the time to read thi...,1,0,0,"{'T1': Entity(id='T1', type='Adderall', span=(..."


In [91]:
# Write the frames, now including the Entities column, back over the source CSVs.
# NOTE(review): CSV stores the Entities values as text, so they presumably come back
# as strings rather than Entity objects on re-read — confirm before relying on them.
train.to_csv(f"{wdir}/TRAIN.csv")
test.to_csv(f"{wdir}/TEST.csv")

## Analyze Data