# Machine Learning Baseline

## Initialization

In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import *

# Local single-JVM Spark setup shared by every cell below.
# `SparkSession` must be imported explicitly: it is not exported by
# `pyspark.sql.types`, so without the import above this cell only works in a
# kernel that happens to pre-populate the name.
conf = SparkConf().setAppName("preprocess").setMaster("local")
sc = SparkContext.getOrCreate(conf)
spark = SparkSession.builder.master("local").appName("preprocess").getOrCreate()

### Load and Split Data

In [2]:
from pyspark.mllib.util import Vectors, MLUtils
from pyspark.mllib.linalg import VectorUDT
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import DataType, StringType

def output_csv(df, path):
    """Write `df` to `path` as CSV (with a header row).

    The `features` column is replaced by its `Vectors.stringify` text form
    before writing; every other column is written unchanged.
    """
    to_text = UserDefinedFunction(lambda vec: Vectors.stringify(vec), StringType())
    stringified = df.withColumn('features', to_text(df.features))
    stringified.write.csv(path, header=True)
    
def read_csv(path):
    """Load a CSV (with header, inferred schema) and rebuild its vector column.

    The `features` column is parsed with `Vectors.parse` and the resulting
    vector column is then converted with `MLUtils.convertVectorColumnsToML`.
    Relies on the module-level `spark` session.
    """
    raw = spark.read.csv(path, header=True, inferSchema=True)

    to_vector = UserDefinedFunction(lambda text: Vectors.parse(text), VectorUDT())
    parsed = raw.withColumn('features', to_vector(raw.features))

    # Vector-type migration, see:
    # https://spark.apache.org/docs/latest/ml-migration-guides.html
    return MLUtils.convertVectorColumnsToML(parsed)

### Evaluator

In [3]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.sql.types import StringType, IntegerType
import pyspark.sql.functions as F

# Packs an array of values into one number: each value is cast to int, the
# digits are concatenated into a string, that string is parsed as a base-2
# integer, and the result is returned as a float (DoubleType column).
concat_udf = F.udf(
    lambda flags: float(int("".join([str(int(flag)) for flag in flags]), 2)),
    DoubleType())

def evaluate(df, labelCols, metrics=["f1", "weightedPrecision", "weightedRecall", "accuracy"]):
    """Average each metric over all label columns.

    For every label column `c` the prediction is read from column `c + "_pred"`
    and scored with `MulticlassClassificationEvaluator`; the per-label scores
    are then averaged with equal weight.

    Args:
        df: DataFrame holding both the label columns and their `_pred` columns.
        labelCols: non-empty list of label column names.
        metrics: metric names passed to the evaluator's `metricName` param.

    Returns:
        dict mapping each metric name to its mean score across `labelCols`.

    Raises:
        ValueError: if `labelCols` is empty (there is nothing to average).
    """
    # Fail early with a clear message instead of a ZeroDivisionError below.
    if not labelCols:
        raise ValueError("labelCols must contain at least one label column")

    evaluator = MulticlassClassificationEvaluator()

    output = {}
    for m in metrics:
        # Iterate over the labels directly; no index loop (and no Python 2-only
        # `xrange`) is needed.
        scores = [
            evaluator.evaluate(df, {evaluator.metricName: m,
                                    evaluator.predictionCol: label + "_pred",
                                    evaluator.labelCol: label})
            for label in labelCols
        ]
        output[m] = sum(scores) / len(scores)

    return output

def evaluate_em(df, labelCols, metrics=["f1", "weightedPrecision", "weightedRecall", "accuracy"]):
    """Score all labels jointly by packing them into a single column.

    The label columns are combined through `concat_udf` into `_label`, and the
    matching `<label>_pred` columns into `_pred`; each metric is then computed
    once on that pair of packed columns.

    Returns:
        dict mapping each metric name to its score on the packed columns.
    """
    pred_cols = [name + "_pred" for name in labelCols]
    packed = df.withColumn("_label", concat_udf(F.array(labelCols)))
    packed = packed.withColumn("_pred", concat_udf(F.array(pred_cols)))

    scorer = MulticlassClassificationEvaluator()
    scores = {}
    for metric in metrics:
        params = {scorer.metricName: metric,
                  scorer.predictionCol: "_pred",
                  scorer.labelCol: "_label"}
        scores[metric] = scorer.evaluate(packed, params)

    return scores
    

## Logistic Regression

Define our custom Logistic Regression class

In [4]:
from pyspark.ml.classification import LogisticRegression

class CustomLogisticRegression:
    """Multi-label wrapper that trains one binomial LogisticRegression per label.

    After `fit`, the instance exposes:
      - featuresCol: name of the feature-vector column used for training
      - labelCols:   label column names, in DataFrame column order
      - models:      fitted models, aligned index-for-index with `labelCols`
    """

    def __init__(self):
        pass

    def fit(self, df, maxIter=100, regParam=0.0, featuresCol="features", ignoreCols=["id"]):
        """Fit one model per label column of `df`.

        Every column except `featuresCol` and those in `ignoreCols` is treated
        as a label. Ignored columns that are absent from `df` are skipped.

        Args:
            df: training DataFrame.
            maxIter: forwarded to LogisticRegression.
            regParam: forwarded to LogisticRegression.
            featuresCol: name of the feature-vector column.
            ignoreCols: column names that are neither features nor labels.
        """
        self.featuresCol = featuresCol

        # Exclude the column actually named by `featuresCol` (not a hard-coded
        # "features"), and tolerate ignore columns that are not present.
        excluded = set(ignoreCols or ())
        excluded.add(featuresCol)
        self.labelCols = [c for c in df.columns if c not in excluded]

        self.models = []
        for c in self.labelCols:
            # Per-label output column names keep the models from colliding
            # when they are applied one after another in `predict`.
            lr = LogisticRegression(featuresCol=featuresCol,
                                    labelCol=c,
                                    predictionCol=c+"_pred",
                                    probabilityCol=c+"_prob",
                                    rawPredictionCol=c+"_rpred",
                                    maxIter=maxIter,
                                    regParam=regParam,
                                    family="binomial")
            self.models.append(lr.fit(df))

    def predict(self, df):
        """Apply every fitted model to `df` in turn and return the result.

        Each model's `transform` is applied to the output of the previous one.
        """
        df_out = df
        for m in self.models:
            df_out = m.transform(df_out)

        return df_out
        
        

## Random Forest

Define our custom Random Forest class

In [5]:
from pyspark.ml.classification import RandomForestClassifier

class CustomRandomForestClassifier:
    """Multi-label wrapper that trains one RandomForestClassifier per label.

    After `fit`, the instance exposes:
      - featuresCol: name of the feature-vector column used for training
      - labelCols:   label column names, in DataFrame column order
      - models:      fitted models, aligned index-for-index with `labelCols`
    """

    def __init__(self):
        pass

    def fit(self, df, maxDepth=5, maxBins=32, numTrees=20, regParam=0.0,
            featuresCol="features", ignoreCols=["id"], seed=None):
        """Fit one random forest per label column of `df`.

        Every column except `featuresCol` and those in `ignoreCols` is treated
        as a label. Ignored columns that are absent from `df` are skipped.

        Args:
            df: training DataFrame.
            maxDepth, maxBins, numTrees: forwarded to RandomForestClassifier.
            regParam: unused; kept only so existing callers keep working.
            featuresCol: name of the feature-vector column.
            ignoreCols: column names that are neither features nor labels.
            seed: forwarded to RandomForestClassifier; pass an int for
                repeatable runs. Defaults to None, as before.
        """
        self.featuresCol = featuresCol

        # Exclude the column actually named by `featuresCol` (not a hard-coded
        # "features"), and tolerate ignore columns that are not present.
        excluded = set(ignoreCols or ())
        excluded.add(featuresCol)
        self.labelCols = [c for c in df.columns if c not in excluded]

        self.models = []
        for c in self.labelCols:
            # Per-label output column names keep the models from colliding
            # when they are applied one after another in `predict`.
            rf = RandomForestClassifier(featuresCol=featuresCol,
                                        labelCol=c,
                                        predictionCol=c+"_pred",
                                        probabilityCol=c+"_prob",
                                        rawPredictionCol=c+"_rpred",
                                        maxDepth=maxDepth,
                                        maxBins=maxBins,
                                        impurity="gini",
                                        numTrees=numTrees,
                                        seed=seed)
            self.models.append(rf.fit(df))

    def predict(self, df):
        """Apply every fitted model to `df` in turn and return the result.

        Each model's `transform` is applied to the output of the previous one.
        """
        df_out = df
        for m in self.models:
            df_out = m.transform(df_out)

        return df_out

### Run Experiments

In [6]:
def print_latex(inum, m1, m2, m3, m4):
    """Format one LaTeX table row; the string is returned, not printed.

    Column order of the row:
        inum, then precision/recall/f1/accuracy from `m1`, accuracy from `m3`,
        then precision/recall/f1/accuracy from `m2`, accuracy from `m4`.

    Args:
        inum: value shown in the first column.
        m1, m2: dicts with keys weightedPrecision, weightedRecall, f1, accuracy.
        m3, m4: dicts with key accuracy.

    Returns:
        The row, terminated by a LaTeX line break and a horizontal rule.
    """
    per_label = "{weightedPrecision:.4f} & {weightedRecall:.4f} & {f1:.4f} & {accuracy:.4f}"
    accuracy_only = "{accuracy:.4f}"

    r1 = per_label.format(**m1)
    r2 = per_label.format(**m2)
    r3 = accuracy_only.format(**m3)
    r4 = accuracy_only.format(**m4)
    # "\\hline" rather than "\hline": "\h" is an invalid escape sequence that
    # newer Python versions warn about; the resulting string is unchanged.
    return "{0} & {1} & {2} & {3} & {4} \\\\ \\hline".format(inum, r1, r3, r2, r4)
    
def run_experiment(input_name, maxIters=(5, 10, 25, 50, 75, 100)):
    """Run the logistic-regression sweep for one dataset and print LaTeX rows.

    Reads `<input_name>_train.csv`, `<input_name>_val.csv` and
    `<input_name>_test.csv`, trains on train + val, and for each iteration
    budget prints one table row with per-label and packed-label scores on the
    training and test data.

    Args:
        input_name: path prefix of the three CSV inputs.
        maxIters: iteration budgets to sweep; defaults to the original list.
    """
    df_train = read_csv("{0}_train.csv".format(input_name))
    df_val = read_csv("{0}_val.csv".format(input_name))
    df_test = read_csv("{0}_test.csv".format(input_name))

    # The validation split is folded into training; nothing is tuned on it here.
    df_train = df_train.union(df_val)

    df_train.cache()
    df_test.cache()

    try:
        # Single-argument print(...) produces the same text on Python 2 and 3.
        print(input_name)
        print("Train, Test: {0} {1}".format(df_train.count(), df_test.count()))
        print("iter & train prec & recall & f1 & accuracy & em & test prec & recall & f1 & accuracy & em")
        for maxIter in maxIters:
            clr = CustomLogisticRegression()
            clr.fit(df_train, maxIter=maxIter)
            df_pred_train = clr.predict(df_train)
            df_pred_test = clr.predict(df_test)

            r1 = evaluate(df_pred_train, clr.labelCols)
            r2 = evaluate(df_pred_test, clr.labelCols)
            r3 = evaluate_em(df_pred_train, clr.labelCols, metrics=["accuracy"])
            r4 = evaluate_em(df_pred_test, clr.labelCols, metrics=["accuracy"])

            print(print_latex(maxIter, r1, r2, r3, r4))

        # Random forest sweep, currently disabled:
        # for maxDepth in [5, 10, 20, 30]:
        #     clr = CustomRandomForestClassifier()
        #     clr.fit(df_train, maxDepth=maxDepth)
        #     df_pred_train = clr.predict(df_train)
        #     df_pred_test = clr.predict(df_test)
        #
        #     print("maxDepth: {0}".format(maxDepth))
        #     print(evaluate(df_pred_train, clr.labelCols))
        #     print(evaluate(df_pred_test, clr.labelCols))
    finally:
        # Release the cached data so consecutive experiments do not pile up
        # in executor memory, even if a run fails part-way.
        df_train.unpersist()
        df_test.unpersist()



In [7]:
# Run the same sweep on each feature representation, in this order.
for dataset_prefix in ("./data/DATA_TFIDFV0_HADM_TOP10",
                       "./data/DATA_TFIDFV1_HADM_TOP10",
                       "./data/DATA_WORD2VEC_HADM_TOP10"):
    run_experiment(dataset_prefix)

./data/DATA_TFIDFV0_HADM_TOP10
Train, Test: 39544 13182
iter & train prec & recall & f1 & accuracy & test prec & recall & f1 & accuracy 
5 & 0.8833 & 0.8848 & 0.8702 & 0.8848 & 0.3809 & 0.8244 & 0.8417 & 0.8163 & 0.8417 & 0.2704 \\ \hline
10 & 0.9699 & 0.9701 & 0.9697 & 0.9701 & 0.7916 & 0.8293 & 0.8411 & 0.8328 & 0.8411 & 0.2558 \\ \hline
25 & 0.9999 & 0.9999 & 0.9999 & 0.9999 & 0.9991 & 0.8121 & 0.8206 & 0.8156 & 0.8206 & 0.2189 \\ \hline
50 & 1.0000 & 1.0000 & 1.0000 & 1.0000 & 1.0000 & 0.8104 & 0.8190 & 0.8140 & 0.8190 & 0.2176 \\ \hline
75 & 1.0000 & 1.0000 & 1.0000 & 1.0000 & 1.0000 & 0.8104 & 0.8190 & 0.8140 & 0.8190 & 0.2176 \\ \hline
100 & 1.0000 & 1.0000 & 1.0000 & 1.0000 & 1.0000 & 0.8104 & 0.8190 & 0.8140 & 0.8190 & 0.2176 \\ \hline
./data/DATA_TFIDFV1_HADM_TOP10
Train, Test: 39544 13182
iter & train prec & recall & f1 & accuracy & test prec & recall & f1 & accuracy 
5 & 0.8862 & 0.8915 & 0.8833 & 0.8915 & 0.3833 & 0.8371 & 0.8513 & 0.8365 & 0.8513 & 0.2847 \\ \hline
10 & 0

In [8]:
# Completion marker; the call form prints the same text on Python 2 and 3.
print("Done!")

Done!
