# MIMIC III Feature Extraction

## Data Loading

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import *
import pandas as pd

In [2]:
import pickle
ICD9CODES = pickle.load(open("./data/ICD9CODES.p", "r"))

df_hadm_top10 = pd.read_csv("./data/DATA_HADM_TOP50.csv", escapechar='\\')
df_hadm_top10_2 = spark.read.csv("./data/DATA_HADM_TOP50.csv", header=True, inferSchema=True)
print df_hadm_top10.head()
df_hadm_top10_2.show()

df_hadm_top10cat = pd.read_csv("./data/DATA_HADM_TOP10CAT.csv", escapechar='\\')
df_hadm_top10cat_2 = spark.read.csv("./data/DATA_HADM_TOP10CAT.csv", header=True, inferSchema=True)
print df_hadm_top10cat.head()
df_hadm_top10cat_2.show()

       id  V4582  41071  32723  311  5119  4240  41401  42789  V4581  \
0  117760      0      0      0    0     0     0      0      0      0   
1  129030      0      0      0    0     0     0      0      0      0   
2  172040      0      1      0    0     0     0      1      0      0   
3  156170      0      0      0    0     0     0      0      1      1   
4  199180      0      0      0    0     0     0      1      0      0   

                         ...                          V290  3051  4019  7742  \
0                        ...                             0     0     0     0   
1                        ...                             0     0     1     0   
2                        ...                             0     0     0     0   
3                        ...                             0     0     0     0   
4                        ...                             0     0     0     0   

   412  42731  5859  51881  78552  \
0    0      0     0      1      0   
1    0      

Name: org.apache.toree.interpreter.broker.BrokerException
Message: Traceback (most recent call last):
  File "/tmp/kernel-PySpark-72cb7619-783d-4a4b-8195-d13f2d4387eb/pyspark_runner.py", line 189, in <module>
    eval(compiled_code)
  File "<string>", line 7, in <module>
  File "/home/docker-user/anaconda2/envs/cse6250/lib/python2.7/site-packages/pandas/io/parsers.py", line 646, in parser_f
    return _read(filepath_or_buffer, kwds)
  File "/home/docker-user/anaconda2/envs/cse6250/lib/python2.7/site-packages/pandas/io/parsers.py", line 389, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "/home/docker-user/anaconda2/envs/cse6250/lib/python2.7/site-packages/pandas/io/parsers.py", line 730, in __init__
    self._make_engine(self.engine)
  File "/home/docker-user/anaconda2/envs/cse6250/lib/python2.7/site-packages/pandas/io/parsers.py", line 923, in _make_engine
    self._engine = CParserWrapper(self.f, **self.options)
  File "/home/docker-user/anaconda2/envs/cse625

## Feature Extraction

### TF-IDF (using pyspark)

In [3]:
from pyspark.ml.feature import StopWordsRemover
STOPWORDS_v0 = StopWordsRemover.loadDefaultStopWords("english") + ICD9CODES
STOPWORDS_v0 = [str(i) for i in STOPWORDS_v0]

# print "TFIDF v0 stop words"
# print STOPWORDS_v0

In [4]:
from pyspark.ml.feature import HashingTF, IDF, RegexTokenizer, StopWordsRemover

def create_TFIDF_v0(sentenceData, inputCol="text", outputCol="features", minDocFreq=3, numFeatures=20):    
    tokenizer = RegexTokenizer(pattern="[.:\s]+", inputCol=inputCol, outputCol="z_words")
    wordsData = tokenizer.transform(sentenceData)
    
    remover = StopWordsRemover(inputCol="z_words", outputCol="z_filtered", stopWords=STOPWORDS_v0)
    wordsDataFiltered = remover.transform(wordsData)
    
    hashingTF = HashingTF(inputCol="z_filtered", outputCol="z_rawFeatures", numFeatures=numFeatures)
    featurizedData = hashingTF.transform(wordsDataFiltered)
    # alternatively, CountVectorizer can also be used to get term frequency vectors

    idf = IDF(inputCol="z_rawFeatures", outputCol=outputCol, minDocFreq=minDocFreq)
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    return rescaledData.drop("z_words", "z_filtered", "z_rawFeatures", inputCol)

### TF-IDF (using sklearn)

In [5]:
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

STOPWORDS_v1 = list(ENGLISH_STOP_WORDS) + ICD9CODES

# print "TFIDF v1 stop words"
# print STOPWORDS_v1

In [6]:
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from pyspark.mllib.util import Vectors
from pyspark.mllib.linalg import VectorUDT
from pyspark.sql.functions import UserDefinedFunction

def preprocessor_v1(text):
    text = re.sub('\[\*\*[^\]]*\*\*\]', '', text)
    text = re.sub('<[^>]*>', '', text)
    text = re.sub('[\W]+', ' ', text.lower()) 
    return text

def create_TFIDF_v1(df, inputCol="text", outputCol="features",
                    minDocFreq=3, maxDocFreq=1.0, numFeatures=20):
    df['z_cleaned'] = df[inputCol].apply(preprocessor_v1)

    # Now we create the sparse matrix of tfidf values
    tfidf = TfidfVectorizer(input='content',ngram_range=(1, 1),
                            stop_words=STOPWORDS_v1, 
                            min_df=minDocFreq,
                            max_df=maxDocFreq,
                            max_features=numFeatures)
    # I select to remove stopwords and minimun doc frequency =10 to delete very unusual words
    # that only show up in less than 10 notes (out of 59k notes available) 

    dtm = tfidf.fit_transform([c for c in df['z_cleaned']]).tocsr()
    dtm.sort_indices()
    df[outputCol] = list(dtm)
   
    del df['z_cleaned']
    del df[inputCol]

    return df

### WORD2VEC
Note: This code only converts the data using a pre-trained word2vec model

In [17]:
from nltk.corpus import stopwords

STOPWORDS_WORD2VEC = stopwords.words('english') + ICD9CODES

# print "WORD2VEC stop words"
# print STOPWORDS_WORD2VEC

In [18]:
import numpy as np
import re

# Run this cell if you are using Glove type format
def loadGloveModel(gloveFile):
    print("Loading Glove Model")
    f = open(gloveFile,'r')
    model = {}
    for line in f:
        splitLine = line.split()
        word = splitLine[0]
        embedding = [float(val) for val in splitLine[1:]]
        model[word] = embedding
    print("Done.",len(model)," words loaded!")
    return model

model_w2v = loadGloveModel("./data/model_word2vec.txt")

def preprocessor_word2vec(text):
    text = re.sub('\[\*\*[^\]]*\*\*\]', '', text)
    text = re.sub('<[^>]*>', '', text)
    text = re.sub('[\W]+', ' ', text.lower()) 
    text = re.sub(" \d+", " ", text)
    #text = gensim.parsing.preprocessing.remove_stopwords(text)
    return text

keys_updated = [word for word in model_w2v.keys() if word not in STOPWORDS_WORD2VEC]
index2word_set=set(keys_updated)

def makeFeatureVec(words, model, num_features):
    # Function to average all of the word vectors in a given
    # paragraph
    #
    # Pre-initialize an empty numpy array (for speed)
    featureVec = np.zeros((num_features,),dtype="float32")
    #
    nwords = 0.
    # 
    # Index2word is a list that contains the names of the words in 
    # the model's vocabulary. Convert it to a set, for speed 
    #index2word_set = set(model.wv.index2word) #activate if using gensim

    # activate if uploaded text version
    #index2word_set=set(keys_updated)
    
    
    #
    # Loop over each word in the review and, if it is in the model's
    # vocaublary, add its feature vector to the total
    for word in words:
        if word in index2word_set: 
            nwords = nwords + 1.
            featureVec = np.add(featureVec,model[word])
    # 
    # Divide the result by the number of words to get the average
    featureVec = np.divide(featureVec,nwords)
    return featureVec

def getAvgFeatureVecs(reviews, model, num_features):
    # Given a set of reviews (each one a list of words), calculate 
    # the average feature vector for each one and return a 2D numpy array 
    # 
    # Initialize a counter
    counter = 0
    # 
    # Preallocate a 2D numpy array, for speed
    reviewFeatureVecs = np.zeros((len(reviews),num_features),dtype="float32")
    # 
    # Loop through the reviews
    for review in reviews:
       #
       # Print a status message every 10000th review
       if counter%10000 == 0:
           print "Review %d of %d" % (counter, len(reviews))
       # 
       # Call the function (defined above) that makes average feature vectors
       reviewFeatureVecs[counter] = makeFeatureVec(review, model,num_features)
       #
       # Increment the counter
       counter = counter + 1
    return reviewFeatureVecs

def create_WORD2VEC(df, inputCol="text", outputCol="features",
                    numFeatures=20):
    df['z_cleaned'] = df[inputCol].apply(preprocessor_word2vec)
    
    # Create tokens
    token_review=[]
    for i in range(df['z_cleaned'].shape[0]):
        review = df['z_cleaned'][i]
        token_review.append([i for i in review.split()])
        
    final_w2v = getAvgFeatureVecs(token_review, model_w2v, num_features=numFeatures)
    df[outputCol] = list(final_w2v)
    
    del df['z_cleaned']
    del df[inputCol]
    
    return df

Loading Glove Model
('Done.', 33837, ' words loaded!')


### Helper Functions

In [9]:
import random
import pandas as pd
from pyspark.mllib.util import Vectors
from pyspark.mllib.linalg import VectorUDT
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import StringType

def separate(seed, N):    
    idx=list(range(N))
    random.seed(seed)
    random.shuffle(idx)
    idx_train= idx[0:int(N*0.50)]
    idx_val= idx[int(N*0.50):int(N*0.75)]
    idx_test= idx[int(N*0.75):N]

    return idx_train, idx_val, idx_test

def output_csv(df, path, col='features', dense=False):
    if type(df) != pd.DataFrame:       
        udf = UserDefinedFunction(lambda x: Vectors.stringify(x), StringType())
        df2 = df.withColumn(col, udf(df[col]))
        df2.write.csv(path, header=True)
    else:
        N = df[col].iloc[0].shape[-1]
        if dense:
            def to_string(x):
                return "({0},[{1}],[{2}])".format(N, 
                                                  ",".join([str(i) for i in xrange(N)]),
                                                  ",".join([str(i) for i in x.tolist()]))
        else:            
            def to_string(x):
                return "({0},[{1}],[{2}])".format(N, 
                                      ",".join([str(i) for i in x.indices.tolist()]),
                                      ",".join([str(i) for i in x.data.tolist()]))
        df2 = df.copy()
        df2[col] = df[col].apply(to_string)
        df2.to_csv(path, index=False)
        
def read_csv(path):
    df = spark.read.csv(path, header=True, inferSchema=True)
    
    udf = UserDefinedFunction(lambda x: Vectors.parse(x), VectorUDT())
    new_df = df.withColumn('features', udf(df.features))
    
    return new_df

### Actual Data Preprocessing (HADM TOP 10)
prepare separation indices (train, validation, test)

In [10]:
idx_train, idx_val, idx_test = separate(1234, df_hadm_top10.shape[0])

TFIDF v1

In [11]:
from time import time
t0 = time()

df_id2tfidfv1 = create_TFIDF_v1(df_hadm_top10.copy(), minDocFreq=10, 
                                maxDocFreq=0.8, numFeatures=40000)

print df_id2tfidfv1.head()
output_csv(df_id2tfidfv1.loc[idx_train], "./data/DATA_TFIDFV1_HADM_TOP50_train.csv", dense=False)
output_csv(df_id2tfidfv1.loc[idx_val], "./data/DATA_TFIDFV1_HADM_TOP50_val.csv", dense=False)
output_csv(df_id2tfidfv1.loc[idx_test], "./data/DATA_TFIDFV1_HADM_TOP50_test.csv", dense=False)

elapsed=time() - t0
print("Run Time: ", elapsed, "seconds.")

       id  V4582  41071  32723  311  5119  4240  41401  42789  V4581  \
0  117760      0      0      0    0     0     0      0      0      0   
1  129030      0      0      0    0     0     0      0      0      0   
2  172040      0      1      0    0     0     0      1      0      0   
3  156170      0      0      0    0     0     0      0      1      1   
4  199180      0      0      0    0     0     0      1      0      0   

                         ...                          V290  3051  4019  7742  \
0                        ...                             0     0     0     0   
1                        ...                             0     0     1     0   
2                        ...                             0     0     0     0   
3                        ...                             0     0     0     0   
4                        ...                             0     0     0     0   

   412  42731  5859  51881  78552  \
0    0      0     0      1      0   
1    0      

WORD2VEC v0

In [19]:
from time import time
t0 = time()

df_id2word2vec = create_WORD2VEC(df_hadm_top10.copy(), numFeatures=100)

print df_id2word2vec.head()
output_csv(df_id2word2vec.loc[idx_train], "./data/DATA_WORD2VEC_HADM_TOP50_train.csv", dense=True)
output_csv(df_id2word2vec.loc[idx_val], "./data/DATA_WORD2VEC_HADM_TOP50_val.csv", dense=True)
output_csv(df_id2word2vec.loc[idx_test], "./data/DATA_WORD2VEC_HADM_TOP50_test.csv", dense=True)

elapsed=time() - t0
print("Run Time: ", elapsed, "seconds.")

Review 0 of 52726
Review 10000 of 52726
Review 20000 of 52726
Review 30000 of 52726
Review 40000 of 52726
Review 50000 of 52726
       id  V4582  41071  32723  311  5119  4240  41401  42789  V4581  \
0  117760      0      0      0    0     0     0      0      0      0   
1  129030      0      0      0    0     0     0      0      0      0   
2  172040      0      1      0    0     0     0      1      0      0   
3  156170      0      0      0    0     0     0      0      1      1   
4  199180      0      0      0    0     0     0      1      0      0   

                         ...                          V290  3051  4019  7742  \
0                        ...                             0     0     0     0   
1                        ...                             0     0     1     0   
2                        ...                             0     0     0     0   
3                        ...                             0     0     0     0   
4                        ...           

TFIDF v0

In [13]:
train_id_set = zip(df_id2tfidfv1.loc[idx_train]['id'].tolist())
spark.createDataFrame(train_id_set, ['id2']).registerTempTable("train_id_set")

val_id_set = zip(df_id2tfidfv1.loc[idx_val]['id'].tolist())
spark.createDataFrame(val_id_set, ['id2']).registerTempTable("val_id_set")

test_id_set = zip(df_id2tfidfv1.loc[idx_test]['id'].tolist())
spark.createDataFrame(test_id_set, ['id2']).registerTempTable("test_id_set")

In [14]:
from time import time
t0 = time()

df_id2tfidfv0 = create_TFIDF_v0(df_hadm_top10_2, numFeatures=40000)
df_id2tfidfv0.registerTempTable("df_id2tfidfv0")
df_id2tfidfv0.cache()

print df_id2tfidfv0.dtypes
df_id2tfidfv0.show()

df = spark.sql("""SELECT * FROM df_id2tfidfv0 JOIN train_id_set 
ON df_id2tfidfv0.id = train_id_set.id2""").drop('id2')
output_csv(df, "./data/DATA_TFIDFV0_HADM_TOP50_train")

df = spark.sql("""SELECT * FROM df_id2tfidfv0 JOIN val_id_set 
ON df_id2tfidfv0.id = val_id_set.id2""").drop('id2')
output_csv(df, "./data/DATA_TFIDFV0_HADM_TOP50_val")

df = spark.sql("""SELECT * FROM df_id2tfidfv0 JOIN test_id_set 
ON df_id2tfidfv0.id = test_id_set.id2""").drop('id2')
output_csv(df, "./data/DATA_TFIDFV0_HADM_TOP50_test")

elapsed=time() - t0
print("Run Time: ", elapsed, "seconds.")

[('id', 'int'), ('V4582', 'int'), ('41071', 'int'), ('32723', 'int'), ('311', 'int'), ('5119', 'int'), ('4240', 'int'), ('41401', 'int'), ('42789', 'int'), ('V4581', 'int'), ('53081', 'int'), ('496', 'int'), ('5070', 'int'), ('V053', 'int'), ('40391', 'int'), ('40390', 'int'), ('4280', 'int'), ('V1582', 'int'), ('99592', 'int'), ('4241', 'int'), ('49390', 'int'), ('4168', 'int'), ('5845', 'int'), ('9971', 'int'), ('5849', 'int'), ('2859', 'int'), ('2724', 'int'), ('25000', 'int'), ('2875', 'int'), ('2720', 'int'), ('2851', 'int'), ('2762', 'int'), ('2761', 'int'), ('2449', 'int'), ('2767', 'int'), ('5180', 'int'), ('0389', 'int'), ('V5867', 'int'), ('5990', 'int'), ('2760', 'int'), ('V5861', 'int'), ('486', 'int'), ('V290', 'int'), ('3051', 'int'), ('4019', 'int'), ('7742', 'int'), ('412', 'int'), ('42731', 'int'), ('5859', 'int'), ('51881', 'int'), ('78552', 'int'), ('features', 'vector')]
+------+-----+-----+-----+---+----+----+-----+-----+-----+-----+---+----+----+-----+-----+----+-

### Actual Data Preprocessing (HADM TOP 10 Category)
prepare separation indices (train, validation, test)

In [15]:
idx_train, idx_val, idx_test = separate(1234, df_hadm_top10cat.shape[0])

TFIDF v1

In [16]:
from time import time
t0 = time()

df_id2tfidfv1 = create_TFIDF_v1(df_hadm_top10cat.copy(), minDocFreq=10, 
                                maxDocFreq=0.8, numFeatures=40000)

print df_id2tfidfv1.head()
output_csv(df_id2tfidfv1.loc[idx_train], "./data/DATA_TFIDFV1_HADM_TOP10CAT_train.csv", dense=False)
output_csv(df_id2tfidfv1.loc[idx_val], "./data/DATA_TFIDFV1_HADM_TOP10CAT_val.csv", dense=False)
output_csv(df_id2tfidfv1.loc[idx_test], "./data/DATA_TFIDFV1_HADM_TOP10CAT_test.csv", dense=False)

elapsed=time() - t0
print("Run Time: ", elapsed, "seconds.")

       id  584  276  518  414  428  272  401  250  285  427  \
0  117760    0    0    1    0    0    0    0    0    0    0   
1  129030    0    0    0    0    0    1    1    0    1    0   
2  172040    1    1    0    1    0    0    0    0    0    0   
3  156170    1    0    0    0    1    0    0    1    1    1   
4  199180    0    0    0    1    1    0    0    1    0    0   

                                            features  
0    (0, 36)\t0.055604349814\n  (0, 52)\t0.059016...  
1    (0, 0)\t0.022399917968\n  (0, 1)\t0.02067483...  
2    (0, 1)\t0.0147394205176\n  (0, 17)\t0.020101...  
3    (0, 0)\t0.0172794691052\n  (0, 11)\t0.032649...  
4    (0, 127)\t0.0311873506267\n  (0, 128)\t0.044...  
('Run Time: ', 119.59498310089111, 'seconds.')


WORD2VEC v0

In [17]:
from time import time
t0 = time()

df_id2word2vec = create_WORD2VEC(df_hadm_top10cat.copy(), numFeatures=100)

print df_id2word2vec.head()
output_csv(df_id2word2vec.loc[idx_train], "./data/DATA_WORD2VEC_HADM_TOP10CAT_train.csv", dense=True)
output_csv(df_id2word2vec.loc[idx_val], "./data/DATA_WORD2VEC_HADM_TOP10CAT_val.csv", dense=True)
output_csv(df_id2word2vec.loc[idx_test], "./data/DATA_WORD2VEC_HADM_TOP10CAT_test.csv", dense=True)

elapsed=time() - t0
print("Run Time: ", elapsed, "seconds.")

Review 0 of 52726
Review 10000 of 52726
Review 20000 of 52726
Review 30000 of 52726
Review 40000 of 52726Review 50000 of 52726
       id  584  276  518  414  428  272  401  250  285  427  \
0  117760    0    0    1    0    0    0    0    0    0    0   
1  129030    0    0    0    0    0    1    1    0    1    0   
2  172040    1    1    0    1    0    0    0    0    0    0   
3  156170    1    0    0    0    1    0    0    1    1    1   
4  199180    0    0    0    1    1    0    0    1    0    0   

                                            features  
0  [0.357326, 0.424179, -0.694198, -0.0219117, 0....  
1  [0.124697, -0.741666, -0.0136118, 0.190457, -0...  
2  [-0.328929, -0.804951, -0.392188, -0.173777, 0...  
3  [-0.349238, -0.241747, -0.0182305, 0.292135, -...  
4  [0.149872, -0.0304227, -0.374879, -0.3131, 0.4...  
('Run Time: ', 408.28796219825745, 'seconds.')


TFIDF v0

In [18]:
train_id_set = zip(df_id2tfidfv1.loc[idx_train]['id'].tolist())
spark.createDataFrame(train_id_set, ['id2']).registerTempTable("train_id_set")

val_id_set = zip(df_id2tfidfv1.loc[idx_val]['id'].tolist())
spark.createDataFrame(val_id_set, ['id2']).registerTempTable("val_id_set")

test_id_set = zip(df_id2tfidfv1.loc[idx_test]['id'].tolist())
spark.createDataFrame(test_id_set, ['id2']).registerTempTable("test_id_set")

In [19]:
from time import time
t0 = time()

df_id2tfidfv0 = create_TFIDF_v0(df_hadm_top10cat_2, numFeatures=40000)
df_id2tfidfv0.registerTempTable("df_id2tfidfv0")
df_id2tfidfv0.cache()

print df_id2tfidfv0.dtypes
df_id2tfidfv0.show()

df = spark.sql("""SELECT * FROM df_id2tfidfv0 JOIN train_id_set 
ON df_id2tfidfv0.id = train_id_set.id2""").drop('id2')
output_csv(df, "./data/DATA_TFIDFV0_HADM_TOP10CAT_train")

df = spark.sql("""SELECT * FROM df_id2tfidfv0 JOIN val_id_set 
ON df_id2tfidfv0.id = val_id_set.id2""").drop('id2')
output_csv(df, "./data/DATA_TFIDFV0_HADM_TOP10CAT_val")

df = spark.sql("""SELECT * FROM df_id2tfidfv0 JOIN test_id_set 
ON df_id2tfidfv0.id = test_id_set.id2""").drop('id2')
output_csv(df, "./data/DATA_TFIDFV0_HADM_TOP10CAT_test")

elapsed=time() - t0
print("Run Time: ", elapsed, "seconds.")

[('id', 'int'), ('584', 'int'), ('276', 'int'), ('518', 'int'), ('414', 'int'), ('428', 'int'), ('272', 'int'), ('401', 'int'), ('250', 'int'), ('285', 'int'), ('427', 'int'), ('features', 'vector')]
+------+---+---+---+---+---+---+---+---+---+---+--------------------+
|    id|584|276|518|414|428|272|401|250|285|427|            features|
+------+---+---+---+---+---+---+---+---+---+---+--------------------+
|117760|  0|  0|  1|  0|  0|  0|  0|  0|  0|  0|(40000,[69,372,69...|
|129030|  0|  0|  0|  0|  0|  1|  1|  0|  1|  0|(40000,[13,32,83,...|
|172040|  1|  1|  0|  1|  0|  0|  0|  0|  0|  0|(40000,[10,69,152...|
|156170|  1|  0|  0|  0|  1|  0|  0|  1|  1|  1|(40000,[3,78,138,...|
|199180|  0|  0|  0|  1|  1|  0|  0|  1|  0|  0|(40000,[48,62,80,...|
|167440|  0|  0|  1|  1|  0|  1|  0|  1|  0|  0|(40000,[207,264,2...|
|194580|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|(40000,[89,187,27...|
|178710|  1|  1|  0|  0|  0|  0|  0|  0|  0|  0|(40000,[574,794,1...|
|162840|  0|  0|  0|  1|  1|  

### Test (basic only)

[Test] Load csv file
count should be the same with the sql query

In [20]:
tests = ["./data/DATA_TFIDFV0_HADM_TOP10",
        "./data/DATA_TFIDFV1_HADM_TOP10",
        "./data/DATA_WORD2VEC_HADM_TOP10",
        "./data/DATA_TFIDFV0_HADM_TOP10CAT",
        "./data/DATA_TFIDFV1_HADM_TOP10CAT",
        "./data/DATA_WORD2VEC_HADM_TOP10CAT"]

for append in ["_train.csv", "_val.csv", "_test.csv"]:
    for folder in tests:
        fname = folder+append
        testdf = read_csv(fname)
        print fname
        print testdf.count()
        testdf.show()

./data/DATA_TFIDFV0_HADM_TOP10_train.csv
26363
+------+----+----+-----+----+-----+-----+-----+-----+----+----+--------------------+
|    id|4019|2724|25000|4280|41401|53081|51881|42731|5849|5990|            features|
+------+----+----+-----+----+-----+-----+-----+-----+----+----+--------------------+
|100852|   0|   0|    0|   0|    0|    0|    0|    0|   0|   0|(40000,[159,205,2...|
|101011|   0|   0|    0|   0|    0|    0|    1|    0|   0|   0|(40000,[695,794,1...|
|101496|   1|   0|    0|   1|    0|    0|    0|    1|   0|   0|(40000,[115,794,8...|
|101552|   0|   0|    0|   0|    0|    0|    0|    0|   0|   0|(40000,[157,273,2...|
|102561|   1|   0|    1|   0|    1|    0|    0|    0|   0|   0|(40000,[197,218,5...|
|102613|   0|   0|    0|   0|    0|    0|    0|    0|   1|   0|(40000,[10,115,20...|
|103496|   0|   0|    0|   0|    0|    0|    0|    0|   1|   0|(40000,[794,1439,...|
|103854|   0|   0|    0|   0|    0|    0|    0|    0|   1|   0|(40000,[78,152,18...|
|104084|   0|   0|

In [21]:
#sc.stop()
print "Done!"

Done!
