# Machine Learning Baseline

## Initialization

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql.types import *

# SparkSession is not provided by `from pyspark import SparkContext, SparkConf`
# nor by the star-import of `pyspark.sql.types`, and nothing else visible in
# this notebook brings it into scope, so import it explicitly here.
from pyspark.sql import SparkSession

# Local, single-process Spark application named "preprocess".
conf = SparkConf().setAppName("preprocess").setMaster("local")
# Reuse an already running context if there is one instead of failing.
sc = SparkContext.getOrCreate(conf)
spark = SparkSession.builder.master("local").appName("preprocess").getOrCreate()

### Load and Split Data

In [2]:
from pyspark.mllib.util import Vectors, MLUtils
from pyspark.mllib.linalg import VectorUDT
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import DataType, StringType

def output_csv(df, path):
    """Write ``df`` to ``path`` as CSV with a header row.

    Before writing, the ``features`` column is replaced by the string that
    ``Vectors.stringify`` produces for each of its values.

    :param df: DataFrame that has a ``features`` column.
    :param path: destination passed to ``DataFrameWriter.csv``.
    """
    stringify_vector = UserDefinedFunction(lambda vec: Vectors.stringify(vec), StringType())
    stringified_df = df.withColumn('features', stringify_vector(df.features))

    stringified_df.write.csv(path, header=True)
    
def read_csv(path):
    """Read a headered CSV from ``path`` and rebuild its ``features`` vectors.

    The schema is inferred by Spark; the ``features`` column is then parsed
    with ``Vectors.parse`` and the resulting vector columns are converted with
    ``MLUtils.convertVectorColumnsToML``.

    :param path: location passed to ``spark.read.csv``.
    :return: the converted DataFrame.
    """
    raw_df = spark.read.csv(path, header=True, inferSchema=True)

    parse_vector = UserDefinedFunction(lambda text: Vectors.parse(text), VectorUDT())
    parsed_df = raw_df.withColumn('features', parse_vector(raw_df.features))

    # The parsed column uses the pyspark.mllib VectorUDT; convert it to the
    # pyspark.ml vector type, see
    # https://spark.apache.org/docs/latest/ml-migration-guides.html
    return MLUtils.convertVectorColumnsToML(parsed_df)

### Evaluator

In [3]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import StringType, IntegerType
import pyspark.sql.functions as F

def _bits_to_double(cols):
    """Interpret ``cols`` as binary digits and return the encoded number.

    Each value is truncated with ``int`` and rendered as text; the digits are
    joined in order (first value is the most significant digit), parsed as a
    base-2 integer and returned as a float.
    """
    bit_string = "".join(str(int(value)) for value in cols)
    return float(int(bit_string, 2))


# Spark UDF that collapses an array column into one double via _bits_to_double.
concat_udf = F.udf(_bits_to_double, DoubleType())

def evaluate(df, labelCols):
    """Average per-label metrics over all label columns.

    For every name ``c`` in ``labelCols`` the column ``c + "_pred"`` is
    compared with ``c`` through ``MulticlassMetrics``. Accuracy and the
    precision / recall / F-measure of label ``1.0`` are collected per label
    and then averaged with equal weight per label column.

    :param df: DataFrame holding each label column and its ``*_pred`` column.
        It is cached (``df.cache()``) and left cached on return.
    :param labelCols: names of the label columns.
    :return: dict with keys ``accuracy``, ``precision``, ``recall`` and
        ``fmeasure`` mapping to the averaged values.
    :raises ZeroDivisionError: if ``labelCols`` is empty.
    """
    predCols = [name + "_pred" for name in labelCols]
    df.cache()

    per_label = {name: [] for name in ['accuracy', 'precision', 'recall', 'fmeasure']}
    for label_col, pred_col in zip(labelCols, predCols):
        # Bind the column names as lambda defaults so the mapper never depends
        # on loop variables that are rebound by later iterations.
        pred_and_labels = df.select(pred_col, label_col).rdd.map(
            lambda row, p=pred_col, l=label_col: (float(row[p]), float(row[l])))
        metrics = MulticlassMetrics(pred_and_labels)

        per_label['accuracy'].append(metrics.accuracy)
        per_label['precision'].append(metrics.precision(1.0))
        per_label['recall'].append(metrics.recall(1.0))
        per_label['fmeasure'].append(metrics.fMeasure(label=1.0))

    # .items() and zip() behave the same on Python 2 and Python 3, unlike the
    # previous iteritems()/xrange() which exist on Python 2 only.
    return {name: (sum(values) / len(values)) for name, values in per_label.items()}

def evaluate_em(df, labelCols, metrics=["f1", "weightedPrecision", "weightedRecall", "accuracy"]):
    """Score all labels jointly by collapsing them into a single class id.

    The label columns, and separately their ``*_pred`` counterparts, are
    packed into arrays and passed through ``concat_udf``, giving one ``_label``
    and one ``_pred`` value per row. Each requested metric is then computed by
    ``MulticlassClassificationEvaluator`` on those two columns.

    :param df: DataFrame holding each label column and its ``*_pred`` column.
    :param labelCols: names of the label columns.
    :param metrics: evaluator metric names to compute.
    :return: dict mapping each metric name to its value.
    """
    predCols = [name + "_pred" for name in labelCols]
    encoded_df = (df.withColumn("_label", concat_udf(F.array(labelCols)))
                    .withColumn("_pred", concat_udf(F.array(predCols))))
    evaluator = MulticlassClassificationEvaluator()

    def score(metric_name):
        # Per-call param map: selects the metric and the two encoded columns.
        return evaluator.evaluate(encoded_df, {evaluator.metricName: metric_name,
                                               evaluator.predictionCol: "_pred",
                                               evaluator.labelCol: "_label"})

    return {metric_name: score(metric_name) for metric_name in metrics}

## Logistic Regression

Define our custom Logistic Regression class

In [4]:
from pyspark.ml.classification import LogisticRegression

class CustomLogisticRegression:
    """Multi-label wrapper that trains one binomial LogisticRegression per label.

    After ``fit`` the instance exposes ``featuresCol``, ``labelCols`` (the
    label column names, in DataFrame order) and ``models`` (one fitted model
    per label, in the same order).
    """

    def __init__(self):
        pass

    def fit(self, df, maxIter=100, regParam=0.0, featuresCol="features", ignoreCols=["id"]):
        """Fit one model per label column of ``df``.

        Every column other than ``featuresCol`` and the names in
        ``ignoreCols`` is treated as a label. For a label ``c`` the model
        writes ``c_pred``, ``c_prob`` and ``c_rpred`` columns.

        :param df: training DataFrame.
        :param maxIter: forwarded to ``LogisticRegression``.
        :param regParam: forwarded to ``LogisticRegression``.
        :param featuresCol: name of the feature-vector column.
        :param ignoreCols: column names that are neither features nor labels;
            names that are not present in ``df`` are simply skipped.
        """
        self.featuresCol = featuresCol
        # Exclude the *configured* feature column (not a hard-coded
        # "features") and tolerate ignoreCols entries missing from df.
        excluded = set(ignoreCols or ())
        excluded.add(featuresCol)
        self.labelCols = [c for c in df.columns if c not in excluded]
        self.models = []

        for c in self.labelCols:
            lr = LogisticRegression(featuresCol=featuresCol,
                                    labelCol=c,
                                    predictionCol=c+"_pred",
                                    probabilityCol=c+"_prob",
                                    rawPredictionCol=c+"_rpred",
                                    maxIter=maxIter,
                                    regParam=regParam,
                                    family="binomial")
            self.models.append(lr.fit(df))

    def predict(self, df):
        """Apply every fitted model to ``df`` in turn and return the result.

        Must be called after ``fit``; each model's ``transform`` is chained on
        the output of the previous one.
        """
        df_out = df
        for model in self.models:
            df_out = model.transform(df_out)

        return df_out
        
        

## Random Forest

Define our custom Random Forest class

In [5]:
from pyspark.ml.classification import RandomForestClassifier

class CustomRandomForestClassifier:
    """Multi-label wrapper that trains one RandomForestClassifier per label.

    After ``fit`` the instance exposes ``featuresCol``, ``labelCols`` (the
    label column names, in DataFrame order) and ``models`` (one fitted model
    per label, in the same order).
    """

    def __init__(self):
        pass

    def fit(self, df, maxDepth=5, maxBins=32, numTrees=20, regParam=0.0, featuresCol="features",
            ignoreCols=["id"], seed=None):
        """Fit one gini-impurity random forest per label column of ``df``.

        Every column other than ``featuresCol`` and the names in
        ``ignoreCols`` is treated as a label. For a label ``c`` the model
        writes ``c_pred``, ``c_prob`` and ``c_rpred`` columns.

        :param df: training DataFrame.
        :param maxDepth: forwarded to ``RandomForestClassifier``.
        :param maxBins: forwarded to ``RandomForestClassifier``.
        :param numTrees: forwarded to ``RandomForestClassifier``.
        :param regParam: accepted for signature parity with
            ``CustomLogisticRegression.fit`` but not used by this method.
        :param featuresCol: name of the feature-vector column.
        :param ignoreCols: column names that are neither features nor labels;
            names that are not present in ``df`` are simply skipped.
        :param seed: forwarded to ``RandomForestClassifier``; the default
            ``None`` keeps the previous behaviour, pass an int to make runs
            repeatable.
        """
        self.featuresCol = featuresCol
        # Exclude the *configured* feature column (not a hard-coded
        # "features") and tolerate ignoreCols entries missing from df.
        excluded = set(ignoreCols or ())
        excluded.add(featuresCol)
        self.labelCols = [c for c in df.columns if c not in excluded]
        self.models = []

        for c in self.labelCols:
            forest = RandomForestClassifier(featuresCol=featuresCol,
                                            labelCol=c,
                                            predictionCol=c+"_pred",
                                            probabilityCol=c+"_prob",
                                            rawPredictionCol=c+"_rpred",
                                            maxDepth=maxDepth,
                                            maxBins=maxBins,
                                            impurity="gini",
                                            numTrees=numTrees,
                                            seed=seed)
            self.models.append(forest.fit(df))

    def predict(self, df):
        """Apply every fitted model to ``df`` in turn and return the result.

        Must be called after ``fit``; each model's ``transform`` is chained on
        the output of the previous one.
        """
        df_out = df
        for model in self.models:
            df_out = model.transform(df_out)

        return df_out

### Run Experiments

In [6]:
def print_latex(inum, m1, m2, m3, m4):
    """Format one LaTeX table row of experiment results and return it.

    Despite the name, nothing is printed; the caller prints the returned
    string. Column order is: ``inum``, then precision / recall / fmeasure /
    accuracy from ``m1``, accuracy from ``m3``, the same four metrics from
    ``m2`` and finally accuracy from ``m4``. All metrics use four decimals
    and the row ends with ``\\\\ \\hline``.

    :param inum: value for the first column (formatted with ``str.format``).
    :param m1: dict with ``precision``, ``recall``, ``fmeasure``, ``accuracy``.
    :param m2: dict with the same four keys as ``m1``.
    :param m3: dict with an ``accuracy`` key.
    :param m4: dict with an ``accuracy`` key.
    :return: the formatted row as a string.
    """
    per_label_fmt = "{precision:.4f} & {recall:.4f} & {fmeasure:.4f} & {accuracy:.4f}"
    accuracy_fmt = "{accuracy:.4f}"

    r1 = per_label_fmt.format(**m1)
    r2 = per_label_fmt.format(**m2)
    r3 = accuracy_fmt.format(**m3)
    r4 = accuracy_fmt.format(**m4)
    # "\\hline" instead of "\hline": same resulting text, but without relying
    # on an invalid escape sequence that newer Python versions warn about.
    return "{0} & {1} & {2} & {3} & {4} \\\\ \\hline".format(inum, r1, r3, r2, r4)
    
def run_experiment(input_name):
    """Run the logistic-regression baseline on one dataset and print a table.

    Reads ``<input_name>_train.csv`` and ``<input_name>_test.csv``, then for
    each ``maxIter`` in ``[5, 10, 25, 50, 75, 100]`` fits a
    ``CustomLogisticRegression`` on the training split and prints one LaTeX
    row with the train and test results of ``evaluate`` and ``evaluate_em``.

    :param input_name: path prefix of the dataset (without the
        ``_train.csv`` / ``_test.csv`` suffix).
    """
    df_train = read_csv("{0}_train.csv".format(input_name))
    df_test = read_csv("{0}_test.csv".format(input_name))
    # The validation split is deliberately not loaded: it was never used and
    # reading it costs an extra schema-inference pass. To train on train+val,
    # read "{0}_val.csv" here and union it into df_train.

    df_train.cache()
    df_test.cache()

    try:
        # Parenthesised single-argument prints give the same output on
        # Python 2 and Python 3.
        print(input_name)
        print("Train, Test: {0} {1}".format(df_train.count(), df_test.count()))
        print("iter & train prec & recall & f1 & accuracy & em & test prec & recall & f1 & accuracy & em")
        for maxIter in [5, 10, 25, 50, 75, 100]:
            clr = CustomLogisticRegression()
            clr.fit(df_train, maxIter=maxIter)
            df_pred_train = clr.predict(df_train)
            df_pred_test = clr.predict(df_test)

            r1 = evaluate(df_pred_train, clr.labelCols)
            r2 = evaluate(df_pred_test, clr.labelCols)
            r3 = evaluate_em(df_pred_train, clr.labelCols, metrics=["accuracy"])
            r4 = evaluate_em(df_pred_test, clr.labelCols, metrics=["accuracy"])

            print(print_latex(maxIter, r1, r2, r3, r4))

            # evaluate() caches the frames it is given; all metrics are plain
            # Python values by now, so release them before the next setting.
            df_pred_train.unpersist()
            df_pred_test.unpersist()
    finally:
        # Do not keep this dataset cached while the next one is processed.
        df_train.unpersist()
        df_test.unpersist()



In [7]:
def run_experiment2(input_name, depths=[5, 10, 20, 30]):
    """Run the random-forest baseline on one dataset and print a table.

    Reads ``<input_name>_train.csv`` and ``<input_name>_test.csv``, then for
    each value in ``depths`` fits a ``CustomRandomForestClassifier`` with that
    ``maxDepth`` on the training split and prints one LaTeX row with the
    train and test results of ``evaluate`` and ``evaluate_em``.

    :param input_name: path prefix of the dataset (without the
        ``_train.csv`` / ``_test.csv`` suffix).
    :param depths: tree depths to try; the first column of each printed row
        is the depth, although the printed header labels it ``iter``.
    """
    df_train = read_csv("{0}_train.csv".format(input_name))
    df_test = read_csv("{0}_test.csv".format(input_name))
    # The validation split is deliberately not loaded: it was never used and
    # reading it costs an extra schema-inference pass. To train on train+val,
    # read "{0}_val.csv" here and union it into df_train.

    df_train.cache()
    df_test.cache()

    try:
        # Parenthesised single-argument prints give the same output on
        # Python 2 and Python 3.
        print(input_name)
        print("Train, Test: {0} {1}".format(df_train.count(), df_test.count()))
        print("iter & train prec & recall & f1 & accuracy & em & test prec & recall & f1 & accuracy & em")
        for maxDepth in depths:
            clr = CustomRandomForestClassifier()
            clr.fit(df_train, maxDepth=maxDepth)
            df_pred_train = clr.predict(df_train)
            df_pred_test = clr.predict(df_test)

            r1 = evaluate(df_pred_train, clr.labelCols)
            r2 = evaluate(df_pred_test, clr.labelCols)
            r3 = evaluate_em(df_pred_train, clr.labelCols, metrics=["accuracy"])
            r4 = evaluate_em(df_pred_test, clr.labelCols, metrics=["accuracy"])

            print(print_latex(maxDepth, r1, r2, r3, r4))

            # evaluate() caches the frames it is given; all metrics are plain
            # Python values by now, so release them before the next setting.
            df_pred_train.unpersist()
            df_pred_test.unpersist()
    finally:
        # Do not keep this dataset cached while the next one is processed.
        df_train.unpersist()
        df_test.unpersist()



In [8]:
# Logistic-regression baseline on each TOP10 feature set, in the listed order.
for dataset_prefix in ("./data/DATA_TFIDFV0_HADM_TOP10",
                       "./data/DATA_TFIDFV1_HADM_TOP10",
                       "./data/DATA_WORD2VECV0_HADM_TOP10",
                       "./data/DATA_WORD2VECV1_HADM_TOP10",
                       "./data/DATA_WORD2VECV2_HADM_TOP10"):
    run_experiment(dataset_prefix)

./data/DATA_TFIDFV0_HADM_TOP10
Train, Test: 26363 13182
iter & train prec & recall & f1 & accuracy & em & test prec & recall & f1 & accuracy & em
5 & 0.9013 & 0.5084 & 0.6389 & 0.8937 & 0.4161 & 0.6599 & 0.2864 & 0.3834 & 0.8376 & 0.2643 \\ \hline
10 & 0.9805 & 0.9332 & 0.9561 & 0.9823 & 0.8749 & 0.5850 & 0.4269 & 0.4897 & 0.8370 & 0.2499 \\ \hline
25 & 1.0000 & 1.0000 & 1.0000 & 1.0000 & 1.0000 & 0.5290 & 0.4445 & 0.4810 & 0.8232 & 0.2243 \\ \hline
50 & 1.0000 & 1.0000 & 1.0000 & 1.0000 & 1.0000 & 0.5262 & 0.4474 & 0.4817 & 0.8222 & 0.2236 \\ \hline
75 & 1.0000 & 1.0000 & 1.0000 & 1.0000 & 1.0000 & 0.5262 & 0.4474 & 0.4817 & 0.8222 & 0.2236 \\ \hline
100 & 1.0000 & 1.0000 & 1.0000 & 1.0000 & 1.0000 & 0.5262 & 0.4474 & 0.4817 & 0.8222 & 0.2236 \\ \hline
./data/DATA_TFIDFV1_HADM_TOP10
Train, Test: 26363 13182
iter & train prec & recall & f1 & accuracy & em & test prec & recall & f1 & accuracy & em
5 & 0.8163 & 0.5689 & 0.6682 & 0.8958 & 0.3965 & 0.6642 & 0.3978 & 0.4946 & 0.8505 & 0.283

In [9]:
# Random-forest baseline on each TOP10 feature set, in the listed order.
for dataset_prefix in ("./data/DATA_TFIDFV0_HADM_TOP10",
                       "./data/DATA_TFIDFV1_HADM_TOP10",
                       "./data/DATA_WORD2VECV0_HADM_TOP10",
                       "./data/DATA_WORD2VECV1_HADM_TOP10",
                       "./data/DATA_WORD2VECV2_HADM_TOP10"):
    run_experiment2(dataset_prefix)

./data/DATA_TFIDFV0_HADM_TOP10
Train, Test: 26363 13182
iter & train prec & recall & f1 & accuracy & em & test prec & recall & f1 & accuracy & em
5 & 0.5737 & 0.0184 & 0.0345 & 0.8079 & 0.2309 & 0.4613 & 0.0171 & 0.0320 & 0.8064 & 0.2316 \\ \hline
10 & 0.9790 & 0.1264 & 0.2047 & 0.8322 & 0.2573 & 0.6425 & 0.0799 & 0.1305 & 0.8189 & 0.2420 \\ \hline
20 & 0.9934 & 0.3869 & 0.5235 & 0.8893 & 0.4036 & 0.7557 & 0.1529 & 0.2265 & 0.8306 & 0.2552 \\ \hline
30 & 0.9975 & 0.5755 & 0.7051 & 0.9277 & 0.5664 & 0.7529 & 0.1881 & 0.2676 & 0.8354 & 0.2596 \\ \hline
./data/DATA_TFIDFV1_HADM_TOP10
Train, Test: 26363 13182
iter & train prec & recall & f1 & accuracy & em & test prec & recall & f1 & accuracy & em
5 & 0.6688 & 0.0537 & 0.0886 & 0.8160 & 0.2377 & 0.5070 & 0.0489 & 0.0804 & 0.8136 & 0.2370 \\ \hline
10 & 0.9677 & 0.1729 & 0.2648 & 0.8418 & 0.2710 & 0.7117 & 0.1176 & 0.1789 & 0.8265 & 0.2519 \\ \hline
20 & 0.9961 & 0.4899 & 0.6279 & 0.9102 & 0.4745 & 0.7806 & 0.1923 & 0.2735 & 0.8379 & 0.2673

In [10]:
# Logistic-regression baseline on each TOP50 feature set, in the listed order.
for dataset_prefix in ("./data/DATA_TFIDFV0_HADM_TOP50",
                       "./data/DATA_TFIDFV1_HADM_TOP50",
                       "./data/DATA_WORD2VECV0_HADM_TOP50",
                       "./data/DATA_WORD2VECV1_HADM_TOP50",
                       "./data/DATA_WORD2VECV2_HADM_TOP50"):
    run_experiment(dataset_prefix)

./data/DATA_TFIDFV0_HADM_TOP50
Train, Test: 26363 13182
iter & train prec & recall & f1 & accuracy & em & test prec & recall & f1 & accuracy & em
5 & 0.9510 & 0.6080 & 0.7337 & 0.9594 & 0.2774 & 0.5207 & 0.2005 & 0.2709 & 0.9211 & 0.0908 \\ \hline
10 & 0.9956 & 0.9704 & 0.9827 & 0.9953 & 0.8630 & 0.4372 & 0.2711 & 0.3275 & 0.9161 & 0.0823 \\ \hline
25 & 1.0000 & 1.0000 & 1.0000 & 1.0000 & 1.0000 & 0.3991 & 0.2725 & 0.3176 & 0.9105 & 0.0728 \\ \hline
50 & 1.0000 & 1.0000 & 1.0000 & 1.0000 & 1.0000 & 0.3944 & 0.2742 & 0.3176 & 0.9099 & 0.0715 \\ \hline
75 & 1.0000 & 1.0000 & 1.0000 & 1.0000 & 1.0000 & 0.3944 & 0.2742 & 0.3176 & 0.9099 & 0.0715 \\ \hline
100 & 1.0000 & 1.0000 & 1.0000 & 1.0000 & 1.0000 & 0.3944 & 0.2742 & 0.3176 & 0.9099 & 0.0715 \\ \hline
./data/DATA_TFIDFV1_HADM_TOP50
Train, Test: 26363 13182
iter & train prec & recall & f1 & accuracy & em & test prec & recall & f1 & accuracy & em
5 & 0.8667 & 0.6447 & 0.7353 & 0.9580 & 0.2087 & 0.5327 & 0.2766 & 0.3529 & 0.9234 & 0.091

In [None]:
# Random-forest baseline on each TOP50 feature set, restricted to depths 5/10/20.
for dataset_prefix in ("./data/DATA_TFIDFV0_HADM_TOP50",
                       "./data/DATA_TFIDFV1_HADM_TOP50",
                       "./data/DATA_WORD2VECV0_HADM_TOP50",
                       "./data/DATA_WORD2VECV1_HADM_TOP50",
                       "./data/DATA_WORD2VECV2_HADM_TOP50"):
    run_experiment2(dataset_prefix, depths=[5, 10, 20])

./data/DATA_TFIDFV0_HADM_TOP50
Train, Test: 26363 13182
iter & train prec & recall & f1 & accuracy & em & test prec & recall & f1 & accuracy & em
5 & 0.1811 & 0.0392 & 0.0476 & 0.9155 & 0.0790 & 0.1350 & 0.0362 & 0.0438 & 0.9146 & 0.0765 \\ \hline


In [8]:
# Completion marker; the parenthesised form prints the same text on Python 2 and 3.
print("Done!")

Done!
