In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
df_cc = pd.read_excel('cc61_df3_vn.xlsx')
df_xray = pd.read_excel('Xray (2017-2018).xlsx')

In [4]:
list(df_cc.columns)

['vn',
 'vstdate',
 'vsttime',
 'sex',
 'age_y',
 'cc',
 'bps',
 'bpd',
 'pulse',
 'bw',
 'height',
 'temperature',
 'department',
 'PDX (1)',
 'Comorbidity (2)',
 'Complication (3)',
 'Other (4)',
 'External Cause (5)',
 'clean_symtom(noun)',
 'clean_symptom2(synonym)']

In [None]:
# dropping null value columns to avoid errors
df_cc.dropna(subset=['clean_symptom2(synonym)', 'bps', 'bpd', 'pulse', 'bw', 'height', 'temperature'],inplace = True)
df_cc.head()

In [None]:
df_cc.drop(df_cc[df_cc.bw == 0].index, inplace=True)
df_cc.drop(df_cc[df_cc.height == 0].index, inplace=True)
df_cc.drop(df_cc[df_cc.temperature == 0].index, inplace=True)

In [None]:
list(df_cc.columns)

In [None]:
df_xray['xray_items'] = df_xray['xray_list'].str.split(",", n= -1)
df_xray.head()

In [None]:
df_cc['symptoms'] = df_cc['clean_symptom2(synonym)'].str.split("/", n= -1)
df_cc.head()

In [None]:
df_cc.rename(columns={'PDX (1)': 'PDX'}, inplace=True)

In [None]:
df_cc_new = df_cc[['vn', 'sex', 'age_y', 'bps', 'bpd', 'pulse', 'bw', 'height', 'temperature', 'PDX', 'symptoms']]
df_cc_new.head()

In [None]:
df_xray_new = df_xray[['vn', 'xray_items']]
df_xray_new.head()

In [35]:
df = pd.merge(df_cc_new, df_xray_new, how='inner', on='vn')
df.head()

Unnamed: 0,vn,sex,age_y,bps,bpd,pulse,bw,height,temperature,PDX,symptoms,xray_items
0,610926001048,หญิง,52.0,170.0,110.0,64.0,86.0,154.0,36.6,K30,"[ปวด, แน่น, ปวดท้อง, แน่นใต้ลิ้นปี่]",[Acute Abdomen]
1,601013203924,หญิง,51.0,150.0,94.0,74.0,86.0,156.0,36.4,K30,[เหนื่อย],[Chest PA Upright]
2,610330191455,หญิง,70.0,157.0,33.0,61.0,50.0,152.0,36.1,R1049,"[ปวด, ปวดเอว]","[LS-spine AP, lat, Acute Abdomen]"
3,610330191455,หญิง,70.0,157.0,33.0,61.0,50.0,152.0,36.1,R1049,"[ปวด, ปวดเอว]","[LS-spine AP, lat, Acute Abdomen]"
4,610627072254,หญิง,61.0,124.0,74.0,64.0,76.0,155.0,36.0,M70,"[ปวด, ปวดเข่า, เจ็บ, -]","[Knee Lt AP, lat standing, Knee Rt AP, lat sta..."


In [36]:
df.shape

(21611, 12)

In [37]:
df['symptoms'] = df.symptoms.apply("/".join)
df['xray_items'] = df.xray_items.apply("/".join)

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [39]:
# TFIDF - symptoms
def slash_token(msg):
    return msg.split('/')

In [40]:
type(df['symptoms'][1])

str

In [41]:
vectorizer = TfidfVectorizer(tokenizer=slash_token)
vectorizer.fit(df['symptoms'])
vector = vectorizer.transform(df['symptoms'])



In [42]:
vector.shape

(21611, 255)

In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21611 entries, 0 to 21610
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   vn           21611 non-null  int64  
 1   sex          21611 non-null  object 
 2   age_y        21611 non-null  float64
 3   bps          21611 non-null  float64
 4   bpd          21611 non-null  float64
 5   pulse        21611 non-null  float64
 6   bw           21611 non-null  float64
 7   height       21611 non-null  float64
 8   temperature  21611 non-null  float64
 9   PDX          21586 non-null  object 
 10  symptoms     21611 non-null  object 
 11  xray_items   21611 non-null  object 
dtypes: float64(7), int64(1), object(4)
memory usage: 2.8+ MB


In [44]:
import scipy as sp

In [45]:
x = sp.sparse.hstack((vector,df[['age_y']].values),format='csr')

In [46]:
x.shape

(21611, 256)

In [55]:
df[['age_y']].values

array([[52.],
       [51.],
       [70.],
       ...,
       [ 5.],
       [14.],
       [ 7.]])

In [43]:
vector.shape

(21611, 255)

In [148]:
import numpy as np

In [152]:
symptoms_num = np.squeeze(np.asarray(vector))

In [138]:
symptoms_num = symptoms_num.transpose()

In [153]:
symptoms_num.shape

()

In [154]:
df.insert(11, 'symptoms_num', symptoms_num)

In [155]:
df.head()

Unnamed: 0,vn,sex,age_y,bps,bpd,pulse,bw,height,temperature,PDX,symptoms,symptoms_num,xray_items
0,610926001048,หญิง,52.0,170.0,110.0,64.0,86.0,154.0,36.6,K30,ปวด/แน่น/ปวดท้อง/แน่นใต้ลิ้นปี่,"(0, 229)\t0.727066710268303\n (0, 225)\t0.4...",Acute Abdomen
1,601013203924,หญิง,51.0,150.0,94.0,74.0,86.0,156.0,36.4,K30,เหนื่อย,"(0, 229)\t0.727066710268303\n (0, 225)\t0.4...",Chest PA Upright
2,610330191455,หญิง,70.0,157.0,33.0,61.0,50.0,152.0,36.1,R1049,ปวด/ปวดเอว,"(0, 229)\t0.727066710268303\n (0, 225)\t0.4...",LS-spine AP/lat/Acute Abdomen
3,610330191455,หญิง,70.0,157.0,33.0,61.0,50.0,152.0,36.1,R1049,ปวด/ปวดเอว,"(0, 229)\t0.727066710268303\n (0, 225)\t0.4...",LS-spine AP/lat/Acute Abdomen
4,610627072254,หญิง,61.0,124.0,74.0,64.0,76.0,155.0,36.0,M70,ปวด/ปวดเข่า/เจ็บ/-,"(0, 229)\t0.727066710268303\n (0, 225)\t0.4...",Knee Lt AP/lat standing/Knee Rt AP/lat standing


In [156]:
# vectorizer = TfidfVectorizer(tokenizer=slash_token)
vectorizer.fit(df['xray_items'])
vector_xray = vectorizer.transform(df['xray_items'])



In [157]:
xray_num = np.squeeze(np.asarray(vector_xray))

In [158]:
df.insert(13, 'xray_num', xray_num)

In [159]:
df.head()

Unnamed: 0,vn,sex,age_y,bps,bpd,pulse,bw,height,temperature,PDX,symptoms,symptoms_num,xray_items,xray_num
0,610926001048,หญิง,52.0,170.0,110.0,64.0,86.0,154.0,36.6,K30,ปวด/แน่น/ปวดท้อง/แน่นใต้ลิ้นปี่,"(0, 229)\t0.727066710268303\n (0, 225)\t0.4...",Acute Abdomen,"(0, 5)\t1.0\n (1, 24)\t1.0\n (2, 65)\t0.61..."
1,601013203924,หญิง,51.0,150.0,94.0,74.0,86.0,156.0,36.4,K30,เหนื่อย,"(0, 229)\t0.727066710268303\n (0, 225)\t0.4...",Chest PA Upright,"(0, 5)\t1.0\n (1, 24)\t1.0\n (2, 65)\t0.61..."
2,610330191455,หญิง,70.0,157.0,33.0,61.0,50.0,152.0,36.1,R1049,ปวด/ปวดเอว,"(0, 229)\t0.727066710268303\n (0, 225)\t0.4...",LS-spine AP/lat/Acute Abdomen,"(0, 5)\t1.0\n (1, 24)\t1.0\n (2, 65)\t0.61..."
3,610330191455,หญิง,70.0,157.0,33.0,61.0,50.0,152.0,36.1,R1049,ปวด/ปวดเอว,"(0, 229)\t0.727066710268303\n (0, 225)\t0.4...",LS-spine AP/lat/Acute Abdomen,"(0, 5)\t1.0\n (1, 24)\t1.0\n (2, 65)\t0.61..."
4,610627072254,หญิง,61.0,124.0,74.0,64.0,76.0,155.0,36.0,M70,ปวด/ปวดเข่า/เจ็บ/-,"(0, 229)\t0.727066710268303\n (0, 225)\t0.4...",Knee Lt AP/lat standing/Knee Rt AP/lat standing,"(0, 5)\t1.0\n (1, 24)\t1.0\n (2, 65)\t0.61..."


In [160]:
df.drop(['vn', 'sex', 'symptoms', 'xray_items'], axis = 1, inplace=True) 

In [161]:
# Split data into train and test
y = df.PDX
x = df.drop('PDX',axis=1)
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)
x_train.head()

Unnamed: 0,age_y,bps,bpd,pulse,bw,height,temperature,symptoms_num,xray_num
11807,55.0,142.0,80.0,76.0,80.0,165.0,36.2,"(0, 229)\t0.727066710268303\n (0, 225)\t0.4...","(0, 5)\t1.0\n (1, 24)\t1.0\n (2, 65)\t0.61..."
19624,0.0,0.0,0.0,0.0,7.6,71.0,37.6,"(0, 229)\t0.727066710268303\n (0, 225)\t0.4...","(0, 5)\t1.0\n (1, 24)\t1.0\n (2, 65)\t0.61..."
4587,94.0,110.0,60.0,90.0,50.0,148.0,37.5,"(0, 229)\t0.727066710268303\n (0, 225)\t0.4...","(0, 5)\t1.0\n (1, 24)\t1.0\n (2, 65)\t0.61..."
9113,60.0,150.0,90.0,76.0,46.4,157.0,37.0,"(0, 229)\t0.727066710268303\n (0, 225)\t0.4...","(0, 5)\t1.0\n (1, 24)\t1.0\n (2, 65)\t0.61..."
7957,85.0,130.0,70.0,126.0,56.0,163.0,36.8,"(0, 229)\t0.727066710268303\n (0, 225)\t0.4...","(0, 5)\t1.0\n (1, 24)\t1.0\n (2, 65)\t0.61..."


In [162]:
x_train.shape

(17288, 9)

In [163]:
type(symptoms_num)

numpy.ndarray

In [164]:
x_test.shape

(4323, 9)

In [172]:
from sklearn.svm import SVC

In [173]:
svclassifier = SVC()

In [174]:
svclassifier.fit(x_train, y_train)

ValueError: setting an array element with a sequence.

In [119]:
x_train.values

array([[14.0, 126.0, 74.0, ..., 37.3,
        <21611x255 sparse matrix of type '<class 'numpy.float64'>'
	with 67732 stored elements in Compressed Sparse Row format>,
        <21611x160 sparse matrix of type '<class 'numpy.float64'>'
	with 42227 stored elements in Compressed Sparse Row format>],
       [64.0, 129.0, 78.0, ..., 36.5,
        <21611x255 sparse matrix of type '<class 'numpy.float64'>'
	with 67732 stored elements in Compressed Sparse Row format>,
        <21611x160 sparse matrix of type '<class 'numpy.float64'>'
	with 42227 stored elements in Compressed Sparse Row format>],
       [25.0, 112.0, 80.0, ..., 36.8,
        <21611x255 sparse matrix of type '<class 'numpy.float64'>'
	with 67732 stored elements in Compressed Sparse Row format>,
        <21611x160 sparse matrix of type '<class 'numpy.float64'>'
	with 42227 stored elements in Compressed Sparse Row format>],
       ...,
       [59.0, 120.0, 70.0, ..., 36.5,
        <21611x255 sparse matrix of type '<class 'numpy.flo

In [170]:
from sklearn_pandas import DataFrameMapper

In [177]:
x_train_new = mapper.fit_transform(x_train)

TypeError: _build_feature() takes from 2 to 3 positional arguments but 5 were given

In [176]:
mapper = DataFrameMapper(['age_y', 'bps', 'bpd', 'pulse', 'bw', 'height', 'temperature', 'symptoms_num', 'xray_num'], None)