# Machine Learning Baseline

## Initialization

In [1]:
from pyspark import SparkContext, SparkConf
# SparkSession is used below; import it explicitly so the cell does not depend
# on the kernel having pre-injected the name.
from pyspark.sql import SparkSession
# The wildcard import is kept on purpose: a later cell uses DoubleType without
# importing it itself.
from pyspark.sql.types import *

# Local (single-JVM) Spark context and session shared by every cell below.
conf = SparkConf().setAppName("preprocess").setMaster("local")
sc = SparkContext.getOrCreate(conf)
spark = SparkSession.builder.master("local").appName("preprocess").getOrCreate()

### Load and Split Data

In [2]:
from pyspark.mllib.util import Vectors, MLUtils
from pyspark.mllib.linalg import VectorUDT
from pyspark.sql.functions import UserDefinedFunction
from pyspark.sql.types import DataType, StringType

def output_csv(df, path):
    """Write ``df`` to ``path`` as CSV with a header row.

    The ``features`` column is first replaced by its ``Vectors.stringify``
    text form; all other columns are written unchanged.

    Args:
        df: DataFrame with a ``features`` column.
        path: Output location passed to ``DataFrameWriter.csv``.
    """
    to_text = UserDefinedFunction(lambda vec: Vectors.stringify(vec), StringType())
    text_features_df = df.withColumn('features', to_text(df.features))
    text_features_df.write.csv(path, header=True)
    
def read_csv(path):
    """Load a CSV written by ``output_csv`` and restore its vector column.

    The file is read through the module-level ``spark`` session with a header
    row and schema inference. The ``features`` text is parsed back with
    ``Vectors.parse`` and the resulting vector column is converted with
    ``MLUtils.convertVectorColumnsToML``.

    Args:
        path: Location passed to ``spark.read.csv``.

    Returns:
        The converted DataFrame.
    """
    raw_df = spark.read.csv(path, header=True, inferSchema=True)
    parse_vector = UserDefinedFunction(lambda text: Vectors.parse(text), VectorUDT())
    parsed_df = raw_df.withColumn('features', parse_vector(raw_df.features))
    # https://spark.apache.org/docs/latest/ml-migration-guides.html
    return MLUtils.convertVectorColumnsToML(parsed_df)

### Evaluator

In [3]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.sql.types import StringType, IntegerType
import pyspark.sql.functions as F

def _bits_to_float(cols):
    """Join the integer form of each value into a base-2 string and return it as a float."""
    bit_string = "".join(str(int(value)) for value in cols)
    return float(int(bit_string, 2))


# Column-level UDF: collapses an array of 0/1-like values into one double.
concat_udf = F.udf(_bits_to_float, DoubleType())

def evaluate(df, labelCols):
    """Average per-label metrics over all label columns.

    For every name in ``labelCols`` the column ``<name>_pred`` is paired with
    ``<name>`` and scored with ``MulticlassMetrics``. Accuracy, plus precision,
    recall and F-measure for label ``1.0``, are collected per label and then
    averaged with equal weight per label.

    Args:
        df: DataFrame holding each label column and its ``_pred`` counterpart.
            It is cached here and not unpersisted; the caller owns the cleanup.
        labelCols: Names of the label columns to score.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall`` and ``fmeasure``
        mapped to the mean over ``labelCols``.

    Raises:
        ZeroDivisionError: if ``labelCols`` is empty.
    """
    def pair_extractor(pred_col, label_col):
        # Bind the column names per label instead of closing over a loop
        # variable, so the mapper never depends on when it is serialized.
        return lambda row: (float(row[pred_col]), float(row[label_col]))

    df.cache()

    per_label = {name: [] for name in ['accuracy', 'precision', 'recall', 'fmeasure']}
    # Plain iteration replaces xrange(), which does not exist on Python 3.
    for label_col in labelCols:
        pred_col = label_col + "_pred"
        pred_and_labels = df.select(pred_col, label_col).rdd \
                            .map(pair_extractor(pred_col, label_col))
        metrics = MulticlassMetrics(pred_and_labels)

        per_label['accuracy'].append(metrics.accuracy)
        per_label['precision'].append(metrics.precision(1.0))
        per_label['recall'].append(metrics.recall(1.0))
        per_label['fmeasure'].append(metrics.fMeasure(label=1.0))

    # .items() works on both Python 2 and 3 (.iteritems() is Python-2-only).
    return {name: (sum(values) / len(values)) for (name, values) in per_label.items()}

def evaluate_em(df, labelCols, metrics=["f1", "weightedPrecision", "weightedRecall", "accuracy"]):
    """Score predictions on the combined label value rather than per label.

    The label columns are collapsed into ``_label`` and their ``_pred``
    counterparts into ``_pred`` using ``concat_udf``. Each requested metric is
    then computed on that pair with ``MulticlassClassificationEvaluator``.

    Args:
        df: DataFrame holding each label column and its ``_pred`` counterpart.
        labelCols: Names of the label columns.
        metrics: Metric names handed to the evaluator one at a time.

    Returns:
        dict mapping each metric name to the evaluator's result.
    """
    pred_cols = [name + "_pred" for name in labelCols]
    combined_df = (df.withColumn("_label", concat_udf(F.array(labelCols)))
                     .withColumn("_pred", concat_udf(F.array(pred_cols))))

    evaluator = MulticlassClassificationEvaluator()
    return {
        metric: evaluator.evaluate(combined_df, {evaluator.metricName: metric,
                                                 evaluator.predictionCol: "_pred",
                                                 evaluator.labelCol: "_label"})
        for metric in metrics
    }

## Logistic Regression

Define a custom Logistic Regression wrapper that fits one binary classifier per label column.

In [4]:
from pyspark.ml.classification import LogisticRegression

class CustomLogisticRegression:
    """Multi-label wrapper that trains one binomial LogisticRegression per label column.

    After ``fit`` the instance exposes ``featuresCol``, ``labelCols`` (label
    column names, in DataFrame order) and ``models`` (one fitted model per
    label, same order).
    """

    def __init__(self):
        pass

    def fit(self, df, maxIter=100, regParam=0.0, featuresCol="features", ignoreCols=["id"]):
        """Fit one model per label column of ``df``.

        Every column that is neither ``featuresCol`` nor listed in
        ``ignoreCols`` is treated as a label. The model for label ``c`` writes
        ``c_pred``, ``c_prob`` and ``c_rpred``.

        Args:
            df: Training DataFrame.
            maxIter: Forwarded to ``LogisticRegression``.
            regParam: Forwarded to ``LogisticRegression``.
            featuresCol: Name of the feature-vector column.
            ignoreCols: Non-label columns to skip; names absent from ``df``
                are ignored.
        """
        self.featuresCol = featuresCol
        # Exclude the column actually used as features (not the literal
        # "features"), and tolerate ignore columns that are not present.
        excluded = set(ignoreCols)
        excluded.add(featuresCol)
        self.labelCols = [name for name in df.columns if name not in excluded]
        self.models = []

        for label in self.labelCols:
            lr = LogisticRegression(featuresCol=featuresCol,
                                    labelCol=label,
                                    predictionCol=label + "_pred",
                                    probabilityCol=label + "_prob",
                                    rawPredictionCol=label + "_rpred",
                                    maxIter=maxIter,
                                    regParam=regParam,
                                    family="binomial")
            self.models.append(lr.fit(df))

    def predict(self, df):
        """Apply every fitted model in turn and return the accumulated DataFrame.

        Each model's ``transform`` output is fed to the next model, so the
        result carries the output columns of all labels.
        """
        df_out = df
        for model in self.models:
            df_out = model.transform(df_out)

        return df_out
        
        

## Random Forest

Define our custom Random Forest class

In [5]:
from pyspark.ml.classification import RandomForestClassifier

class CustomRandomForestClassifier:
    """Multi-label wrapper that trains one RandomForestClassifier per label column.

    After ``fit`` the instance exposes ``featuresCol``, ``labelCols`` (label
    column names, in DataFrame order) and ``models`` (one fitted model per
    label, same order).
    """

    def __init__(self):
        pass

    def fit(self, df, maxDepth=5, maxBins=32, numTrees=20, regParam=0.0, featuresCol="features",
            ignoreCols=["id"], seed=None):
        """Fit one gini-impurity random forest per label column of ``df``.

        Every column that is neither ``featuresCol`` nor listed in
        ``ignoreCols`` is treated as a label. The model for label ``c`` writes
        ``c_pred``, ``c_prob`` and ``c_rpred``.

        Args:
            df: Training DataFrame.
            maxDepth: Forwarded to ``RandomForestClassifier``.
            maxBins: Forwarded to ``RandomForestClassifier``.
            numTrees: Forwarded to ``RandomForestClassifier``.
            regParam: Accepted but not used; it is never passed to the
                estimator.
            featuresCol: Name of the feature-vector column.
            ignoreCols: Non-label columns to skip; names absent from ``df``
                are ignored.
            seed: Forwarded to ``RandomForestClassifier``. The default
                ``None`` is what was previously hard-coded; pass an int for
                repeatable forests.
        """
        self.featuresCol = featuresCol
        # Exclude the column actually used as features (not the literal
        # "features"), and tolerate ignore columns that are not present.
        excluded = set(ignoreCols)
        excluded.add(featuresCol)
        self.labelCols = [name for name in df.columns if name not in excluded]
        self.models = []

        for label in self.labelCols:
            rf = RandomForestClassifier(featuresCol=featuresCol,
                                        labelCol=label,
                                        predictionCol=label + "_pred",
                                        probabilityCol=label + "_prob",
                                        rawPredictionCol=label + "_rpred",
                                        maxDepth=maxDepth,
                                        maxBins=maxBins,
                                        impurity="gini",
                                        numTrees=numTrees,
                                        seed=seed)
            self.models.append(rf.fit(df))

    def predict(self, df):
        """Apply every fitted model in turn and return the accumulated DataFrame.

        Each model's ``transform`` output is fed to the next model, so the
        result carries the output columns of all labels.
        """
        df_out = df
        for model in self.models:
            df_out = model.transform(df_out)

        return df_out

### Run Experiments

In [6]:
def print_latex(inum, m1, m2, m3, m4):
    """Format one LaTeX table row from four metric dictionaries.

    Cell order: ``inum``, then precision/recall/fmeasure/accuracy from ``m1``,
    accuracy from ``m3``, precision/recall/fmeasure/accuracy from ``m2``,
    accuracy from ``m4``, followed by the LaTeX row terminator and an hline
    rule. All metric values are rendered with four decimals.

    Args:
        inum: Value for the first cell (e.g. the hyper-parameter being swept).
        m1: dict with ``precision``, ``recall``, ``fmeasure``, ``accuracy``.
        m2: dict with ``precision``, ``recall``, ``fmeasure``, ``accuracy``.
        m3: dict with ``accuracy``.
        m4: dict with ``accuracy``.

    Returns:
        The formatted row as a string (nothing is printed).
    """
    metric_cells = "{precision:.4f} & {recall:.4f} & {fmeasure:.4f} & {accuracy:.4f}"
    accuracy_cell = "{accuracy:.4f}"

    r1 = metric_cells.format(**m1)
    r2 = metric_cells.format(**m2)
    r3 = accuracy_cell.format(**m3)
    r4 = accuracy_cell.format(**m4)
    # "\\hline" spells the backslash explicitly; the previous "\hline" relied on
    # an invalid escape sequence. The resulting text is unchanged.
    return "{0} & {1} & {2} & {3} & {4} \\\\ \\hline".format(inum, r1, r3, r2, r4)
    
def run_experiment(input_name, maxIters=(5, 10, 25, 50, 75, 100)):
    """Sweep logistic-regression ``maxIter`` on one dataset and print LaTeX rows.

    Reads ``<input_name>_train.csv`` and ``<input_name>_test.csv``. For each
    value in ``maxIters`` it fits ``CustomLogisticRegression`` on the training
    data, predicts on train and test, and prints one ``print_latex`` row built
    from ``evaluate`` and ``evaluate_em`` (accuracy only).

    Args:
        input_name: Path prefix of the dataset files.
        maxIters: Iteration counts to try; the default is the grid that used
            to be hard-coded.
    """
    df_train = read_csv("{0}_train.csv".format(input_name))
    df_test = read_csv("{0}_test.csv".format(input_name))
    # The validation file is not loaded because nothing used it. To train on
    # train+val instead, add:
    #   df_train = df_train.union(read_csv("{0}_val.csv".format(input_name)))

    df_train.cache()
    df_test.cache()
    try:
        # Single-argument print(...) produces the same output on Python 2 and 3.
        print(input_name)
        print("Train, Test: {0} {1}".format(df_train.count(), df_test.count()))
        print("iter & train prec & recall & f1 & accuracy & em & test prec & recall & f1 & accuracy & em")
        for maxIter in maxIters:
            clr = CustomLogisticRegression()
            clr.fit(df_train, maxIter=maxIter)
            df_pred_train = clr.predict(df_train)
            df_pred_test = clr.predict(df_test)
            try:
                r1 = evaluate(df_pred_train, clr.labelCols)
                r2 = evaluate(df_pred_test, clr.labelCols)
                r3 = evaluate_em(df_pred_train, clr.labelCols, metrics=["accuracy"])
                r4 = evaluate_em(df_pred_test, clr.labelCols, metrics=["accuracy"])
            finally:
                # evaluate() caches the frames it scores; release them so the
                # cache does not grow with every sweep value.
                df_pred_train.unpersist()
                df_pred_test.unpersist()

            print(print_latex(maxIter, r1, r2, r3, r4))
    finally:
        # Free the input caches so back-to-back experiments do not pile up.
        df_train.unpersist()
        df_test.unpersist()



In [7]:
def run_experiment2(input_name, depths=[5, 10, 20, 30]):
    """Sweep random-forest ``maxDepth`` on one dataset and print LaTeX rows.

    Reads ``<input_name>_train.csv`` and ``<input_name>_test.csv``. For each
    value in ``depths`` it fits ``CustomRandomForestClassifier`` on the
    training data, predicts on train and test, and prints one ``print_latex``
    row built from ``evaluate`` and ``evaluate_em`` (accuracy only).

    Args:
        input_name: Path prefix of the dataset files.
        depths: Tree depths to try.
    """
    df_train = read_csv("{0}_train.csv".format(input_name))
    df_test = read_csv("{0}_test.csv".format(input_name))
    # The validation file is not loaded because nothing used it. To train on
    # train+val instead, add:
    #   df_train = df_train.union(read_csv("{0}_val.csv".format(input_name)))

    df_train.cache()
    df_test.cache()
    try:
        # Single-argument print(...) produces the same output on Python 2 and 3.
        print(input_name)
        print("Train, Test: {0} {1}".format(df_train.count(), df_test.count()))
        # The first column holds maxDepth here, so it is labelled "depth"
        # (it previously said "iter").
        print("depth & train prec & recall & f1 & accuracy & em & test prec & recall & f1 & accuracy & em")
        for maxDepth in depths:
            clr = CustomRandomForestClassifier()
            clr.fit(df_train, maxDepth=maxDepth)
            df_pred_train = clr.predict(df_train)
            df_pred_test = clr.predict(df_test)
            try:
                r1 = evaluate(df_pred_train, clr.labelCols)
                r2 = evaluate(df_pred_test, clr.labelCols)
                r3 = evaluate_em(df_pred_train, clr.labelCols, metrics=["accuracy"])
                r4 = evaluate_em(df_pred_test, clr.labelCols, metrics=["accuracy"])
            finally:
                # evaluate() caches the frames it scores; release them so the
                # cache does not grow with every sweep value.
                df_pred_train.unpersist()
                df_pred_test.unpersist()

            print(print_latex(maxDepth, r1, r2, r3, r4))
    finally:
        # Free the input caches so back-to-back experiments do not pile up.
        df_train.unpersist()
        df_test.unpersist()



In [8]:
# Logistic-regression sweep over every TOP10 feature representation, in order.
for feature_set in ("TFIDFV0", "TFIDFV1", "WORD2VECV0", "WORD2VECV1", "WORD2VECV2"):
    run_experiment("./data/DATA_{0}_HADM_TOP10".format(feature_set))

./data/DATA_TFIDFV0_HADM_TOP10
Train, Test: 26363 13182
iter & train prec & recall & f1 & accuracy & em & test prec & recall & f1 & accuracy & em


Name: org.apache.toree.interpreter.broker.BrokerException
Message: Traceback (most recent call last):
  File "/tmp/kernel-PySpark-f816a37e-bcdc-4186-8e76-863d312765fe/pyspark_runner.py", line 189, in <module>
    eval(compiled_code)
  File "<string>", line 1, in <module>
  File "<string>", line 18, in run_experiment
  File "<string>", line 21, in fit
  File "/home/docker-user/cse6250-final-project/spark-2.1.0-bin-hadoop2.7/python/pyspark/ml/base.py", line 64, in fit
    return self._fit(dataset)
  File "/home/docker-user/cse6250-final-project/spark-2.1.0-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 236, in _fit
    java_model = self._fit_java(dataset)
  File "/home/docker-user/cse6250-final-project/spark-2.1.0-bin-hadoop2.7/python/pyspark/ml/wrapper.py", line 233, in _fit_java
    return self._java_obj.fit(dataset._jdf)
  File "/home/docker-user/cse6250-final-project/spark-2.1.0-bin-hadoop2.7/python/lib/py4j-0.10.4-src.zip/py4j/java_gateway.py", line 1131, in __call__
    answer =

In [9]:
# Random-forest sweep over every TOP10 feature representation, in order.
for feature_set in ("TFIDFV0", "TFIDFV1", "WORD2VECV0", "WORD2VECV1", "WORD2VECV2"):
    run_experiment2("./data/DATA_{0}_HADM_TOP10".format(feature_set))

Name: org.apache.toree.interpreter.broker.BrokerException
Message: null was reset!
StackTrace: org.apache.toree.interpreter.broker.BrokerState$$anonfun$reset$1.apply(BrokerState.scala:191)
org.apache.toree.interpreter.broker.BrokerState$$anonfun$reset$1.apply(BrokerState.scala:189)
scala.collection.Iterator$class.foreach(Iterator.scala:893)
scala.collection.AbstractIterator.foreach(Iterator.scala:1336)
scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
scala.collection.AbstractIterable.foreach(Iterable.scala:54)
org.apache.toree.interpreter.broker.BrokerState.reset(BrokerState.scala:189)
org.apache.toree.kernel.interpreter.pyspark.PySparkService$$anonfun$pySparkProcess$2.apply(PySparkService.scala:63)
org.apache.toree.kernel.interpreter.pyspark.PySparkService$$anonfun$pySparkProcess$2.apply(PySparkService.scala:61)
org.apache.toree.interpreter.broker.BrokerProcessHandler.onProcessComplete(BrokerProcessHandler.scala:67)
org.apache.commons.exec.DefaultExecutor$1.run(Defau

In [10]:
# Logistic-regression sweep over every TOP50 feature representation, in order.
for feature_set in ("TFIDFV0", "TFIDFV1", "WORD2VECV0", "WORD2VECV1", "WORD2VECV2"):
    run_experiment("./data/DATA_{0}_HADM_TOP50".format(feature_set))

Name: org.apache.toree.interpreter.broker.BrokerException
Message: Traceback (most recent call last):
  File "/tmp/kernel-PySpark-f816a37e-bcdc-4186-8e76-863d312765fe/pyspark_runner.py", line 189, in <module>
    eval(compiled_code)
  File "<string>", line 1, in <module>
NameError: name 'run_experiment' is not defined

StackTrace: org.apache.toree.interpreter.broker.BrokerState$$anonfun$markFailure$1.apply(BrokerState.scala:163)
org.apache.toree.interpreter.broker.BrokerState$$anonfun$markFailure$1.apply(BrokerState.scala:163)
scala.Option.foreach(Option.scala:257)
org.apache.toree.interpreter.broker.BrokerState.markFailure(BrokerState.scala:162)
sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
java.lang.reflect.Method.invoke(Method.java:498)
py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
py4j.reflec

In [None]:
# Random-forest sweep over three TOP50 feature representations, in order.
# NOTE(review): "WORD2VEC" has no V0/V1/V2 suffix, unlike every other dataset
# name in this notebook - confirm that DATA_WORD2VEC_HADM_TOP50_*.csv exists.
for feature_set in ("WORD2VEC", "TFIDFV0", "TFIDFV1"):
    run_experiment2("./data/DATA_{0}_HADM_TOP50".format(feature_set))

In [12]:
# Completion marker; the call form prints the same text on Python 2 and 3.
print("Done!")

Done!
