In [11]:
import pyspark
import argparse
from pyspark.ml.regression import GBTRegressionModel
from pyspark.sql import SparkSession
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml import PipelineModel, Pipeline
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import col
from pyspark.ml.feature import VectorAssembler                    
from pyspark.ml.regression import GBTRegressor
import pyspark.sql.functions as func
from pyspark.mllib.evaluation import MulticlassMetrics
from pyspark.rdd import reduce


In [12]:
# Build the application-wide Spark session via getOrCreate().
spark = (
    SparkSession.builder
    .appName('WineApp_Prediction')
    .getOrCreate()
)

In [13]:
def read_csv(file_path):
    """Read a semicolon-separated CSV file with a header row.

    Args:
        file_path: Path handed to the Spark CSV reader.

    Returns:
        The DataFrame produced by the reader.
    """
    # NOTE(review): .csv() likely selects the built-in CSV source on its own,
    # which would make the explicit format() call redundant -- confirm before
    # removing it.
    reader = spark.read.format("com.databricks.spark.csv")
    return reader.csv(file_path, header=True, sep=";")

In [14]:
def load_model(path):
    """Load a saved ``PipelineModel`` from ``path``.

    Args:
        path: Location the pipeline model was saved to.

    Returns:
        The loaded ``PipelineModel``.
    """
    # Original author's note: used for the random forest model.
    pipeline_model = PipelineModel.load(path)
    return pipeline_model

In [16]:
def preprocess(df):
    """Cast all columns to double, strip quotes from their names and scale the features.

    Every column except the last is treated as a feature and replaced by a
    ``<name>_scaled`` column holding its min-max scaled value rounded to three
    decimals. The last column is kept (cast to double) under its
    quote-stripped name.

    Args:
        df: Spark DataFrame whose columns can all be cast to double.

    Returns:
        A ``(df, total_columns)`` tuple: the transformed DataFrame and the list
        of quote-stripped column names in their original order.
    """
    raw_columns = df.columns

    # The cleaned names are the only names that exist once the columns have
    # been renamed, so they must drive both the scaling loop and the returned
    # list; using the raw names breaks as soon as a header contains a quote.
    total_columns = [name.replace('"', '') for name in raw_columns]
    print(total_columns)

    # Cast and rename in a single projection.
    df = df.select(*(
        col(raw_name).cast("double").alias(clean_name)
        for raw_name, clean_name in zip(raw_columns, total_columns)
    ))

    # The scaler outputs a vector column; unwrap its first element and round it.
    unlist = udf(lambda x: round(float(list(x)[0]), 3), DoubleType())

    # NOTE(review): each scaler is fitted on the DataFrame passed in, so the
    # min/max come from this data -- confirm that matches how the model was
    # trained.
    for column_name in total_columns[:-1]:
        vector_column = column_name + '_vect'
        scaled_column = column_name + '_scaled'
        pipeline = Pipeline(stages=[
            VectorAssembler(inputCols=[column_name], outputCol=vector_column),
            MinMaxScaler(inputCol=vector_column, outputCol=scaled_column),
        ])
        df = (
            pipeline.fit(df)
            .transform(df)
            .withColumn(scaled_column, unlist(scaled_column))
            .drop(vector_column)
            .drop(column_name)
        )
    return df, total_columns

In [6]:
def get_predictions(model, df):
    """Apply ``model`` to ``df`` and return the result.

    Args:
        model: Object exposing a ``transform(df)`` method.
        df: Input passed to ``model.transform``.

    Returns:
        Whatever ``model.transform(df)`` returns.
    """
    predictions = model.transform(df)
    return predictions

In [7]:
def run(test_file, model_path="models/rf/"):
    """Score a CSV test file with a saved pipeline model.

    Args:
        test_file: Path of the CSV file to read.
        model_path: Location the saved model is loaded from. Defaults to
            ``"models/rf/"``, the path that used to be hard-coded here.

    Returns:
        A ``(df, total_columns)`` tuple: the DataFrame returned by the model's
        ``transform`` and the column list returned by ``preprocess``.
    """
    raw_df = read_csv(test_file)
    features_df, total_columns = preprocess(raw_df)
    model = load_model(model_path)
    predictions_df = get_predictions(model, features_df)
    return predictions_df, total_columns

In [8]:
def print_f1(df, total_columns):
    """Print weighted precision, recall and F1 score for a predictions DataFrame.

    The metrics are the label-frequency-weighted averages reported by
    ``MulticlassMetrics``.

    Args:
        df: DataFrame containing an ``indexedLabel`` column and a
            ``prediction`` column.
        total_columns: Not used by the computation; kept so existing callers
            keep working.
    """
    # MulticlassMetrics expects (prediction, label) pairs, in that order.
    # Every row is a real observation (a DataFrame carries no header row), so
    # no rows are filtered out before the metrics are computed.
    prediction_and_labels = df.select("prediction", "indexedLabel").rdd.map(
        lambda row: (float(row[0]), float(row[1])))
    metrics = MulticlassMetrics(prediction_and_labels)

    # Overall statistics
    # The no-argument precision()/recall()/fMeasure() calls were deprecated in
    # Spark 2.0 and removed in 3.0; the weighted variants exist in both lines.
    precision = metrics.weightedPrecision
    recall = metrics.weightedRecall
    f1Score = metrics.weightedFMeasure()
    print("Summary Stats")
    print("Precision = %s" % precision)
    print("Recall = %s" % recall)
    print("F1 Score = %s" % f1Score)

In [10]:
# Command-line driver: score the CSV given by --test_file and print the metrics.
# NOTE(review): parse_args() reads the process arguments; the output recorded
# for this cell shows it exiting with status 2 when run inside the notebook
# kernel, because --test_file is not supplied there.
import argparse

parser = argparse.ArgumentParser(description='Wine Quality prediction')
parser.add_argument(
    '--test_file',
    required=True,
    help='please provide test file path you can provide s3 path or local file path',
)
args = parser.parse_args()

# Run the prediction pipeline, then report the summary statistics.
df, total_columns = run(args.test_file)
print_f1(df, total_columns)

usage: ipykernel_launcher.py [-h] --test_file TEST_FILE
ipykernel_launcher.py: error: the following arguments are required: --test_file


SystemExit: 2