# Machine Learning Baseline

## Initialization

In [1]:
from pyspark import SparkContext, SparkConf
# SparkSession lives in pyspark.sql; it is NOT exported by pyspark.sql.types,
# so without this import the `spark = ...` line below raises NameError on a
# fresh kernel.
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Single local-mode Spark application shared by every cell of this notebook.
# `sc` is the low-level context, `spark` the DataFrame entry point used by
# read_csv() further down.
conf = SparkConf().setAppName("preprocess").setMaster("local")
sc = SparkContext.getOrCreate(conf)
spark = SparkSession.builder.master("local").appName("preprocess").getOrCreate()

### Load and Split Data

In [2]:
from pyspark.mllib.util import Vectors, MLUtils
from pyspark.mllib.linalg import VectorUDT
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import DataType, StringType

def output_csv(df, path):
    """Write `df` to `path` as CSV (with a header row).

    CSV cannot hold a vector column, so the `features` column is first
    replaced by its `Vectors.stringify` text form; `read_csv` reverses this
    with `Vectors.parse`.

    :param df: DataFrame that has a `features` vector column.
    :param path: destination passed straight to `DataFrameWriter.csv`.
    """
    vector_to_text = UserDefinedFunction(lambda vec: Vectors.stringify(vec), StringType())
    df.withColumn('features', vector_to_text(df.features)).write.csv(path, header=True)
    
def read_csv(path):
    """Load a CSV written by `output_csv` and restore its `features` vectors.

    Uses the notebook-level `spark` session. The textual `features` column is
    parsed back into an mllib vector, then the vector columns are converted to
    the `pyspark.ml` vector type, see
    https://spark.apache.org/docs/latest/ml-migration-guides.html

    :param path: CSV location (header row expected, schema is inferred).
    :return: DataFrame with `features` as an ML vector column.
    """
    raw = spark.read.csv(path, header=True, inferSchema=True)

    text_to_vector = UserDefinedFunction(lambda text: Vectors.parse(text), VectorUDT())
    with_mllib_vectors = raw.withColumn('features', text_to_vector(raw.features))

    return MLUtils.convertVectorColumnsToML(with_mllib_vectors)

### Evaluator

In [3]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import StringType, IntegerType
import pyspark.sql.functions as F

# Collapses an array of 0/1 label values into one number: the values are
# truncated to int, concatenated into a binary string (first column is the
# most significant bit) and parsed base-2. Two rows get the same code exactly
# when every label agrees, which is what evaluate_em() relies on.
concat_udf = F.udf(
    lambda cols: float(int("".join(str(int(bit)) for bit in cols), 2)),
    DoubleType(),
)

def evaluate(df, labelCols):
    """Average per-label binary metrics over all label columns.

    For every label column `c`, the pair (`c + "_pred"`, `c`) is scored with
    `MulticlassMetrics`; accuracy plus precision / recall / F-measure of the
    positive class (1.0) are collected, then each metric is averaged across
    the label columns.

    Side effect: `df` is cached (and left cached) because it is scanned once
    per label column.

    :param df: DataFrame holding each label column and its `<label>_pred`
        prediction column.
    :param labelCols: non-empty list of label column names.
    :return: dict with keys 'accuracy', 'precision', 'recall', 'fmeasure'.
    :raises ValueError: if `labelCols` is empty (the averages are undefined).
    """
    if not labelCols:
        raise ValueError("labelCols must contain at least one label column")

    df.cache()

    per_label = {name: [] for name in ['accuracy', 'precision', 'recall', 'fmeasure']}
    for label_col in labelCols:
        pred_col = label_col + "_pred"
        # The RDD is evaluated lazily, so the column names are bound as default
        # arguments instead of being looked up through a loop variable that
        # keeps changing.
        pred_and_labels = df.select(pred_col, label_col).rdd \
            .map(lambda row, p=pred_col, l=label_col: (float(row[p]), float(row[l])))
        metrics = MulticlassMetrics(pred_and_labels)

        per_label['accuracy'].append(metrics.accuracy)
        per_label['precision'].append(metrics.precision(1.0))
        per_label['recall'].append(metrics.recall(1.0))
        per_label['fmeasure'].append(metrics.fMeasure(label=1.0))

    # .items() (not .iteritems()) so this runs on both Python 2 and Python 3.
    return {name: (sum(values) / len(values)) for (name, values) in per_label.items()}

def evaluate_em(df, labelCols, metrics=["f1", "weightedPrecision", "weightedRecall", "accuracy"]):
    """Score predictions on the whole label vector at once.

    All label columns are folded into a single `_label` code and all
    `<label>_pred` columns into a single `_pred` code (via `concat_udf`), and
    the two codes are compared with `MulticlassClassificationEvaluator`. With
    metric "accuracy" a row only counts as correct when every label matches.

    :param df: DataFrame holding each label column and its `<label>_pred` column.
    :param labelCols: list of label column names.
    :param metrics: evaluator metric names to compute.
    :return: dict mapping each requested metric name to its value.
    """
    evaluator = MulticlassClassificationEvaluator()
    pred_cols = [name + "_pred" for name in labelCols]

    coded = (df
             .withColumn("_label", concat_udf(F.array(labelCols)))
             .withColumn("_pred", concat_udf(F.array(pred_cols))))

    return {
        metric: evaluator.evaluate(coded, {evaluator.metricName: metric,
                                           evaluator.predictionCol: "_pred",
                                           evaluator.labelCol: "_label"})
        for metric in metrics
    }

## Logistic Regression

Define our custom Logistic Regression class

In [4]:
from pyspark.ml.classification import LogisticRegression

class CustomLogisticRegression:
    """Multi-label classifier built from one binary LogisticRegression per label.

    Every column of the training DataFrame that is neither the features column
    nor listed in `ignoreCols` is treated as a label. For a label column `c`
    the fitted model writes `c + "_pred"`, `c + "_prob"` and `c + "_rpred"`.
    """

    def __init__(self):
        # Defined up front so predict() on an unfitted instance returns its
        # input unchanged instead of raising AttributeError.
        self.featuresCol = None
        self.labelCols = []
        self.models = []

    def fit(self, df, maxIter=100, regParam=0.0, featuresCol="features", ignoreCols=("id",)):
        """Fit one binomial logistic regression per label column of `df`.

        :param df: training DataFrame (features column + label columns).
        :param maxIter: forwarded to `LogisticRegression`.
        :param regParam: forwarded to `LogisticRegression`.
        :param featuresCol: name of the feature-vector column.
        :param ignoreCols: columns that are neither features nor labels;
            names not present in `df` are simply skipped.
        """
        ignored = set(ignoreCols or ())
        self.featuresCol = featuresCol
        # Exclude the *actual* features column (not a hard-coded "features"),
        # and filter rather than list.remove() so a missing ignore column does
        # not raise ValueError.
        self.labelCols = [c for c in df.columns
                          if c != featuresCol and c not in ignored]
        self.models = []

        for c in self.labelCols:
            lr = LogisticRegression(featuresCol=featuresCol,
                                    labelCol=c,
                                    predictionCol=c+"_pred",
                                    probabilityCol=c+"_prob",
                                    rawPredictionCol=c+"_rpred",
                                    maxIter=maxIter,
                                    regParam=regParam,
                                    family="binomial")
            self.models.append(lr.fit(df))

    def predict(self, df):
        """Apply every fitted per-label model to `df` in label order.

        :return: `df` after each model's `transform`, i.e. with the
            prediction columns of all labels appended.
        """
        df_out = df
        for model in self.models:
            df_out = model.transform(df_out)

        return df_out
        
        

## Random Forest

Define our custom Random Forest class

In [5]:
from pyspark.ml.classification import RandomForestClassifier

class CustomRandomForestClassifier:
    """Multi-label classifier built from one RandomForestClassifier per label.

    Every column of the training DataFrame that is neither the features column
    nor listed in `ignoreCols` is treated as a label. For a label column `c`
    the fitted model writes `c + "_pred"`, `c + "_prob"` and `c + "_rpred"`.
    """

    def __init__(self):
        # Defined up front so predict() on an unfitted instance returns its
        # input unchanged instead of raising AttributeError.
        self.featuresCol = None
        self.labelCols = []
        self.models = []

    def fit(self, df, maxDepth=5, maxBins=32, numTrees=20, regParam=0.0, featuresCol="features",
            ignoreCols=("id",), seed=None):
        """Fit one gini-impurity random forest per label column of `df`.

        :param df: training DataFrame (features column + label columns).
        :param maxDepth: forwarded to `RandomForestClassifier`.
        :param maxBins: forwarded to `RandomForestClassifier`.
        :param numTrees: forwarded to `RandomForestClassifier`.
        :param regParam: unused; kept only so existing callers that pass it
            keep working (it is not forwarded to the forest).
        :param featuresCol: name of the feature-vector column.
        :param ignoreCols: columns that are neither features nor labels;
            names not present in `df` are simply skipped.
        :param seed: forwarded to `RandomForestClassifier`; pass an int for
            repeatable runs. The default `None` matches the previous behavior.
        """
        ignored = set(ignoreCols or ())
        self.featuresCol = featuresCol
        # Exclude the *actual* features column (not a hard-coded "features"),
        # and filter rather than list.remove() so a missing ignore column does
        # not raise ValueError.
        self.labelCols = [c for c in df.columns
                          if c != featuresCol and c not in ignored]
        self.models = []

        for c in self.labelCols:
            rf = RandomForestClassifier(featuresCol=featuresCol,
                                        labelCol=c,
                                        predictionCol=c+"_pred",
                                        probabilityCol=c+"_prob",
                                        rawPredictionCol=c+"_rpred",
                                        maxDepth=maxDepth,
                                        maxBins=maxBins,
                                        impurity="gini",
                                        numTrees=numTrees,
                                        seed=seed)
            self.models.append(rf.fit(df))

    def predict(self, df):
        """Apply every fitted per-label model to `df` in label order.

        :return: `df` after each model's `transform`, i.e. with the
            prediction columns of all labels appended.
        """
        df_out = df
        for model in self.models:
            df_out = model.transform(df_out)

        return df_out

### Run Experiments

In [6]:
def print_latex(inum, m1, m2, m3, m4):
    """Format one LaTeX table row of experiment results.

    Despite the name, nothing is printed: the row is returned.

    Column order: `inum`, then precision / recall / fmeasure / accuracy from
    `m1`, accuracy from `m3`, precision / recall / fmeasure / accuracy from
    `m2`, accuracy from `m4`. All numbers use 4 decimal places and the row ends
    with a LaTeX line break plus `\\hline`.

    :param inum: value shown in the first column (the swept hyper-parameter).
    :param m1: dict with 'precision', 'recall', 'fmeasure', 'accuracy'.
    :param m2: dict with 'precision', 'recall', 'fmeasure', 'accuracy'.
    :param m3: dict with 'accuracy'.
    :param m4: dict with 'accuracy'.
    :return: the formatted row as a string.
    :raises KeyError: if a dict lacks one of the keys listed above.
    """
    per_label_fmt = "{precision:.4f} & {recall:.4f} & {fmeasure:.4f} & {accuracy:.4f}"
    accuracy_fmt = "{accuracy:.4f}"

    r1 = per_label_fmt.format(**m1)
    r2 = per_label_fmt.format(**m2)
    r3 = accuracy_fmt.format(**m3)
    r4 = accuracy_fmt.format(**m4)
    # "\\hline" instead of "\hline": same resulting text, but "\h" is an
    # invalid escape sequence that newer Python versions warn about.
    return "{0} & {1} & {2} & {3} & {4} \\\\ \\hline".format(inum, r1, r3, r2, r4)
    
def run_experiment(input_name, maxIters=(5, 10, 25, 50, 75, 100)):
    """Sweep logistic-regression `maxIter` on one dataset and print LaTeX rows.

    Reads `<input_name>_train.csv`, `<input_name>_val.csv` and
    `<input_name>_test.csv`, merges validation into train, then for every
    value in `maxIters` fits a `CustomLogisticRegression` and prints one table
    row with train/test per-label metrics and exact-match accuracy.

    :param input_name: path prefix of the dataset files.
    :param maxIters: iteration budgets to try; the default is the grid this
        notebook has always used.
    """
    df_train = read_csv("{0}_train.csv".format(input_name))
    df_val = read_csv("{0}_val.csv".format(input_name))
    df_test = read_csv("{0}_test.csv".format(input_name))

    df_train = df_train.union(df_val)

    df_train.cache()
    df_test.cache()

    try:
        # Single-argument print(...) calls produce the same text under
        # Python 2 and Python 3.
        print(input_name)
        print("Train, Test: {0} {1}".format(df_train.count(), df_test.count()))
        print("iter & train prec & recall & f1 & accuracy & em & test prec & recall & f1 & accuracy & em")
        for maxIter in maxIters:
            clr = CustomLogisticRegression()
            clr.fit(df_train, maxIter=maxIter)
            df_pred_train = clr.predict(df_train)
            df_pred_test = clr.predict(df_test)

            r1 = evaluate(df_pred_train, clr.labelCols)
            r2 = evaluate(df_pred_test, clr.labelCols)
            r3 = evaluate_em(df_pred_train, clr.labelCols, metrics=["accuracy"])
            r4 = evaluate_em(df_pred_test, clr.labelCols, metrics=["accuracy"])

            print(print_latex(maxIter, r1, r2, r3, r4))

            # evaluate() caches the prediction frames; release them so cached
            # data does not pile up across iterations.
            df_pred_train.unpersist()
            df_pred_test.unpersist()
    finally:
        # Release the input frames too, otherwise every call leaves another
        # train/test pair cached.
        df_train.unpersist()
        df_test.unpersist()



In [7]:
def run_experiment2(input_name, maxDepths=(5, 10, 20, 30)):
    """Sweep random-forest `maxDepth` on one dataset and print LaTeX rows.

    Reads `<input_name>_train.csv`, `<input_name>_val.csv` and
    `<input_name>_test.csv`, merges validation into train, then for every
    value in `maxDepths` fits a `CustomRandomForestClassifier` and prints one
    table row with train/test per-label metrics and exact-match accuracy.

    :param input_name: path prefix of the dataset files.
    :param maxDepths: tree depths to try; the default is the grid this
        notebook has always used.
    """
    df_train = read_csv("{0}_train.csv".format(input_name))
    df_val = read_csv("{0}_val.csv".format(input_name))
    df_test = read_csv("{0}_test.csv".format(input_name))

    df_train = df_train.union(df_val)

    df_train.cache()
    df_test.cache()

    try:
        # Single-argument print(...) calls produce the same text under
        # Python 2 and Python 3.
        print(input_name)
        print("Train, Test: {0} {1}".format(df_train.count(), df_test.count()))
        # First column is the tree depth here (the header used to say "iter",
        # copied from the logistic-regression experiment).
        print("depth & train prec & recall & f1 & accuracy & em & test prec & recall & f1 & accuracy & em")
        for maxDepth in maxDepths:
            crf = CustomRandomForestClassifier()
            crf.fit(df_train, maxDepth=maxDepth)
            df_pred_train = crf.predict(df_train)
            df_pred_test = crf.predict(df_test)

            r1 = evaluate(df_pred_train, crf.labelCols)
            r2 = evaluate(df_pred_test, crf.labelCols)
            r3 = evaluate_em(df_pred_train, crf.labelCols, metrics=["accuracy"])
            r4 = evaluate_em(df_pred_test, crf.labelCols, metrics=["accuracy"])

            print(print_latex(maxDepth, r1, r2, r3, r4))

            # evaluate() caches the prediction frames; release them so cached
            # data does not pile up across iterations.
            df_pred_train.unpersist()
            df_pred_test.unpersist()
    finally:
        # Release the input frames too, otherwise every call leaves another
        # train/test pair cached.
        df_train.unpersist()
        df_test.unpersist()



In [8]:
# Logistic-regression sweep on each feature representation, in this order.
for dataset_prefix in ("./data/DATA_TFIDFV0_HADM_TOP10",
                       "./data/DATA_TFIDFV1_HADM_TOP10",
                       "./data/DATA_WORD2VEC_HADM_TOP10"):
    run_experiment(dataset_prefix)

./data/DATA_TFIDFV0_HADM_TOP10
Train, Test: 39544 13182
iter & train prec & recall & f1 & accuracy & em & test prec & recall & f1 & accuracy & em
5 & 0.8678 & 0.4640 & 0.5905 & 0.8848 & 0.3809 & 0.6756 & 0.3032 & 0.4036 & 0.8417 & 0.2704 \\ \hline
10 & 0.9602 & 0.8912 & 0.9241 & 0.9701 & 0.7916 & 0.5964 & 0.4561 & 0.5138 & 0.8411 & 0.2558 \\ \hline
25 & 0.9998 & 0.9999 & 0.9999 & 0.9999 & 0.9991 & 0.5208 & 0.4554 & 0.4844 & 0.8206 & 0.2189 \\ \hline
50 & 1.0000 & 1.0000 & 1.0000 & 1.0000 & 1.0000 & 0.5165 & 0.4516 & 0.4804 & 0.8190 & 0.2176 \\ \hline
75 & 1.0000 & 1.0000 & 1.0000 & 1.0000 & 1.0000 & 0.5165 & 0.4515 & 0.4804 & 0.8190 & 0.2176 \\ \hline
100 & 1.0000 & 1.0000 & 1.0000 & 1.0000 & 1.0000 & 0.5165 & 0.4515 & 0.4804 & 0.8190 & 0.2176 \\ \hline
./data/DATA_TFIDFV1_HADM_TOP10
Train, Test: 39544 13182
iter & train prec & recall & f1 & accuracy & em & test prec & recall & f1 & accuracy & em
5 & 0.8060 & 0.5491 & 0.6508 & 0.8915 & 0.3833 & 0.6689 & 0.4006 & 0.4977 & 0.8513 & 0.284

In [9]:
# Random-forest sweep on each feature representation, in this order.
for dataset_prefix in ("./data/DATA_TFIDFV0_HADM_TOP10",
                       "./data/DATA_TFIDFV1_HADM_TOP10",
                       "./data/DATA_WORD2VEC_HADM_TOP10"):
    run_experiment2(dataset_prefix)

./data/DATA_TFIDFV0_HADM_TOP10
Train, Test: 39544 13182
iter & train prec & recall & f1 & accuracy & em & test prec & recall & f1 & accuracy & em
5 & 0.4722 & 0.0172 & 0.0324 & 0.8076 & 0.2329 & 0.3556 & 0.0166 & 0.0310 & 0.8062 & 0.2313 \\ \hline
10 & 0.8717 & 0.1211 & 0.1952 & 0.8311 & 0.2566 & 0.6316 & 0.0842 & 0.1357 & 0.8201 & 0.2440 \\ \hline
20 & 0.9924 & 0.3686 & 0.5038 & 0.8860 & 0.3940 & 0.7632 & 0.1571 & 0.2306 & 0.8316 & 0.2581 \\ \hline
30 & 0.9974 & 0.5644 & 0.6957 & 0.9258 & 0.5583 & 0.7462 & 0.1931 & 0.2725 & 0.8366 & 0.2654 \\ \hline
./data/DATA_TFIDFV1_HADM_TOP10
Train, Test: 39544 13182
iter & train prec & recall & f1 & accuracy & em & test prec & recall & f1 & accuracy & em
5 & 0.4706 & 0.0327 & 0.0568 & 0.8112 & 0.2356 & 0.4658 & 0.0321 & 0.0557 & 0.8099 & 0.2350 \\ \hline
10 & 0.9710 & 0.1340 & 0.2075 & 0.8341 & 0.2613 & 0.6449 & 0.0999 & 0.1532 & 0.8236 & 0.2469 \\ \hline
20 & 0.9939 & 0.4277 & 0.5658 & 0.8974 & 0.4252 & 0.7992 & 0.1806 & 0.2578 & 0.8369 & 0.2635

In [8]:
# Parenthesised single-argument form prints the same text on Python 2 and 3.
print("Done!")

Done!
