In [1]:
#task1_2a
from pyspark.sql import SparkSession


def startSpark(name="spark-etl"):
    """Build (or reuse) the SparkSession used by the job

    Args:
        name: application name handed to the session builder
    Return:
        SparkSession object returned by getOrCreate()
    """
    builder = SparkSession.builder.appName(name)
    return builder.getOrCreate()


def extractCSV(filepath):
    """Read a comma-separated file and return all of its fields as one flat list

    Every line is split on commas and the fields of all lines are appended
    in file order. Surrounding whitespace is stripped from each field and
    empty fields (blank lines, trailing or doubled commas) are skipped, so
    they are never counted as a product downstream.

    Args:
        filepath: path to downloaded csv file
    Return:
        list of products (str), duplicates preserved
    """
    products = []
    # Explicit encoding keeps the result independent of the platform locale.
    with open(filepath, 'r', encoding='utf-8') as csvfile:
        # Iterate the file lazily instead of materialising every line first.
        for line in csvfile:
            fields = (field.strip() for field in line.split(','))
            products.extend(field for field in fields if field)
    return products


def transformRDD(spark, csv):
    """Transform the product list into a Spark RDD of distinct products

    Args:
        spark: SparkSession whose sparkContext distributes the list
        csv: list of products
    Return:
        rddUnique: pyspark.rdd.RDD holding every product once, in a single
            partition
    """
    rdd = spark.sparkContext.parallelize(csv)
    # distinct() shuffles the data. Asking it for one output partition
    # pins the partition count of the result explicitly and lets the
    # de-duplication start in parallel, instead of squeezing everything
    # into one partition first with coalesce(1).
    rddUnique = rdd.distinct(numPartitions=1)
    return rddUnique

    
def loadCSV(rdd, outputPath):
    """Save the RDD as text under the given path

    Args:
        rdd: pyspark.rdd.RDD to write
        outputPath: destination path handed to saveAsTextFile
    Return:
        None
    """
    rdd.saveAsTextFile(outputPath)


def main():
    """Run the job end to end

    Starts Spark, reads the products from '../files/groceries.csv',
    transforms them and saves the result under '../out/out_1_2a.txt'.
    """
    session = startSpark()
    products = extractCSV('../files/groceries.csv')
    loadCSV(transformRDD(session, products), '../out/out_1_2a.txt')


# Only run the pipeline when executed as a script, not on import.
if __name__ == '__main__':
    main()

In [2]:
#task1_2b
from pyspark.sql import SparkSession


def startSpark(name="spark-etl"):
    """Build (or reuse) the SparkSession used by the job

    Args:
        name: application name handed to the session builder
    Return:
        SparkSession object returned by getOrCreate()
    """
    builder = SparkSession.builder.appName(name)
    return builder.getOrCreate()


def extractCSV(filepath):
    """Read a comma-separated file and return all of its fields as one flat list

    Every line is split on commas and the fields of all lines are appended
    in file order. Surrounding whitespace is stripped from each field and
    empty fields (blank lines, trailing or doubled commas) are skipped, so
    they do not inflate the number of distinct products.

    Args:
        filepath: path to downloaded csv file
    Return:
        list of products (str), duplicates preserved
    """
    products = []
    # Explicit encoding keeps the result independent of the platform locale.
    with open(filepath, 'r', encoding='utf-8') as csvfile:
        # Iterate the file lazily instead of materialising every line first.
        for line in csvfile:
            fields = (field.strip() for field in line.split(','))
            products.extend(field for field in fields if field)
    return products


def transformRDD(spark, csv):
    """Count the distinct products and wrap that count in an RDD

    Args:
        spark: SparkSession whose sparkContext distributes the data
        csv: list of products
    Return:
        pyspark.rdd.RDD with a single element, the number of distinct
        products
    """
    sc = spark.sparkContext
    distinctCount = sc.parallelize(csv).distinct().count()
    # The count is put back into an RDD so it can be saved like any other
    # result of the pipeline.
    return sc.parallelize([distinctCount])

    
def loadCSV(rdd, outputPath):
    """Save the RDD as text under the given path, from a single partition

    Args:
        rdd: pyspark.rdd.RDD to write
        outputPath: destination path handed to saveAsTextFile
    Return:
        None
    """
    singlePartition = rdd.coalesce(1)
    singlePartition.saveAsTextFile(outputPath)


def main():
    """Run the job end to end

    Starts Spark, reads the products from '../files/groceries.csv',
    transforms them and saves the result under '../out/out_1_2b.txt'.
    """
    session = startSpark()
    products = extractCSV('../files/groceries.csv')
    loadCSV(transformRDD(session, products), '../out/out_1_2b.txt')


# Only run the pipeline when executed as a script, not on import.
if __name__ == '__main__':
    main()

In [3]:
#task1_3
from pyspark.sql import SparkSession


def startSpark(name="spark-etl"):
    """Build (or reuse) the SparkSession used by the job

    Args:
        name: application name handed to the session builder
    Return:
        SparkSession object returned by getOrCreate()
    """
    builder = SparkSession.builder.appName(name)
    return builder.getOrCreate()


def extractCSV(filepath):
    """Read a comma-separated file and return all of its fields as one flat list

    Every line is split on commas and the fields of all lines are appended
    in file order. Surrounding whitespace is stripped from each field and
    empty fields (blank lines, trailing or doubled commas) are skipped, so
    they never show up as a product with its own count.

    Args:
        filepath: path to downloaded csv file
    Return:
        list of products (str), duplicates preserved
    """
    products = []
    # Explicit encoding keeps the result independent of the platform locale.
    with open(filepath, 'r', encoding='utf-8') as csvfile:
        # Iterate the file lazily instead of materialising every line first.
        for line in csvfile:
            fields = (field.strip() for field in line.split(','))
            products.extend(field for field in fields if field)
    return products


def transformRDD(spark, csv):
    """Count how often each product occurs and return the counts as an RDD

    Args:
        spark: SparkSession whose sparkContext distributes the data
        csv: list of products
    Return:
        pyspark.rdd.RDD of (product, count) pairs
    """
    sc = spark.sparkContext
    # countByValue() brings the per-product counts back to the driver as a
    # dict; its items are then distributed again as an RDD.
    countsByProduct = sc.parallelize(csv).countByValue()
    return sc.parallelize(countsByProduct.items())

    
def loadCSV(rdd, outputPath):
    """Save the RDD as text under the given path, from a single partition

    Args:
        rdd: pyspark.rdd.RDD to write
        outputPath: destination path handed to saveAsTextFile
    Return:
        None
    """
    singlePartition = rdd.coalesce(1)
    singlePartition.saveAsTextFile(outputPath)


def main():
    """Run the job end to end

    Starts Spark, reads the products from '../files/groceries.csv',
    transforms them and saves the result under '../out/out_1_3.txt'.
    """
    session = startSpark()
    products = extractCSV('../files/groceries.csv')
    loadCSV(transformRDD(session, products), '../out/out_1_3.txt')


# Only run the pipeline when executed as a script, not on import.
if __name__ == '__main__':
    main()

In [8]:
#task2_2
from pyspark.sql import SparkSession
import pyspark.sql.functions as f


def startSpark(name="spark-etl"):
    """Build (or reuse) the SparkSession used by the job

    Args:
        name: application name handed to the session builder
    Return:
        SparkSession object returned by getOrCreate()
    """
    builder = SparkSession.builder.appName(name)
    return builder.getOrCreate()


def extractCSV(filepath):
    """Read a comma-separated file and return all of its fields as one flat list

    Every line is split on commas and the fields of all lines are appended
    in file order. Surrounding whitespace is stripped from each field and
    empty fields (blank lines, trailing or doubled commas) are skipped, so
    they never show up as a product with its own count.

    Args:
        filepath: path to downloaded csv file
    Return:
        list of products (str), duplicates preserved
    """
    products = []
    # Explicit encoding keeps the result independent of the platform locale.
    with open(filepath, 'r', encoding='utf-8') as csvfile:
        # Iterate the file lazily instead of materialising every line first.
        for line in csvfile:
            fields = (field.strip() for field in line.split(','))
            products.extend(field for field in fields if field)
    return products


def transformRDD(spark, csv):
    """Count how often each product occurs and return the counts as an RDD

    Args:
        spark: SparkSession whose sparkContext distributes the data
        csv: list of products
    Return:
        pyspark.rdd.RDD of (product, count) pairs
    """
    sc = spark.sparkContext
    # countByValue() brings the per-product counts back to the driver as a
    # dict; its items are then distributed again as an RDD.
    countsByProduct = sc.parallelize(csv).countByValue()
    return sc.parallelize(countsByProduct.items())

    
def loadCSV(rdd, outputPath):
    """Save the RDD as text under the given path, from a single partition

    Args:
        rdd: pyspark.rdd.RDD to write
        outputPath: destination path handed to saveAsTextFile
    Return:
        None
    """
    singlePartition = rdd.coalesce(1)
    singlePartition.saveAsTextFile(outputPath)


def main():
    """Run the job end to end

    Starts Spark, reads the products from '../files/groceries.csv',
    transforms them and saves the result under '../out/out_2_2.txt'.
    """
    session = startSpark()
    products = extractCSV('../files/groceries.csv')
    loadCSV(transformRDD(session, products), '../out/out_2_2.txt')


# Only run the pipeline when executed as a script, not on import.
if __name__ == '__main__':
    main()

In [5]:
#task2_3
from pyspark.sql import SparkSession
import pyspark.sql.functions as f


def startSpark(name="spark-etl"):
    """Build (or reuse) the SparkSession used by the job

    Args:
        name: application name handed to the session builder
    Return:
        SparkSession object returned by getOrCreate()
    """
    builder = SparkSession.builder.appName(name)
    return builder.getOrCreate()


def extractDF(spark, filepath):
    """Read a parquet file into a dataframe

    Args:
        spark: pyspark.sql.session.SparkSession
        filepath: path to downloaded parquet file
    Return:
        pyspark.sql.dataframe.DataFrame with the file's contents
    """
    return spark.read.parquet(filepath)


def transformDF(df):
    """Average bathrooms and bedrooms over expensive, top-rated rows

    Keeps the rows with price > 5000 and review_scores_value == 10 and
    aggregates them into the mean number of bathrooms and bedrooms.

    Args:
        df: raw dataframe; pyspark.sql.dataframe.DataFrame
    Return:
        pyspark.sql.dataframe.DataFrame with the columns avg_bathrooms and
        avg_bedrooms
    """
    isExpensive = df.price > 5000
    isTopRated = df.review_scores_value == 10
    matching = df.filter(isExpensive & isTopRated)
    # Name the aggregates directly instead of renaming Spark's generated
    # "avg(...)" columns afterwards.
    return matching.select(
        f.mean('bathrooms').alias("avg_bathrooms"),
        f.mean('bedrooms').alias("avg_bedrooms"),
    )

    
def loadDF(df, outputPath):
    """Write the dataframe as CSV under the given path, from a single partition

    Args:
        df: pyspark.sql.dataframe.DataFrame
        outputPath: path to save files
    Return:
        None
    """
    singlePartition = df.coalesce(1)
    singlePartition.write.csv(outputPath)


def main():
    """Run the job end to end

    Starts Spark, reads '../files/part-00000.parquet', computes the
    statistics and writes them under '../out/out_2_3.txt'.
    """
    session = startSpark()
    rawDF = extractDF(session, '../files/part-00000.parquet')
    loadDF(transformDF(rawDF), '../out/out_2_3.txt')


# Only run the pipeline when executed as a script, not on import.
if __name__ == '__main__':
    main()

In [6]:
#task2_4
from pyspark.sql import SparkSession
import pyspark.sql.functions as f


def startSpark(name="spark-etl"):
    """Build (or reuse) the SparkSession used by the job

    Args:
        name: application name handed to the session builder
    Return:
        SparkSession object returned by getOrCreate()
    """
    builder = SparkSession.builder.appName(name)
    return builder.getOrCreate()


def extractDF(spark, filepath):
    """Read a parquet file into a dataframe

    Args:
        spark: pyspark.sql.session.SparkSession
        filepath: path to downloaded parquet file
    Return:
        pyspark.sql.dataframe.DataFrame with the file's contents
    """
    return spark.read.parquet(filepath)


def transformDF(df):
    """Select rating and beds of the lowest-priced rows that hold the top rating

    The minimum price and the maximum review_scores_value are both taken
    over the whole dataframe; rows matching both values are returned.

    Args:
        df: raw dataframe; pyspark.sql.dataframe.DataFrame with the columns
            price, review_scores_value and beds
    Return:
        pyspark.sql.dataframe.DataFrame with the columns
        review_scores_value and beds (empty when no row has both the
        lowest price and the highest rating)
    """
    # One aggregation job yields both extremes. Spark's min/max ignore
    # nulls, whereas rdd.min()/rdd.max() compare Row objects in Python and
    # raise on None values.
    limits = df.agg(
        f.min("price").alias("minPrice"),
        f.max("review_scores_value").alias("maxRating"),
    ).first()
    minPrice = limits["minPrice"]
    maxRating = limits["maxRating"]

    isCheapest = df.price == minPrice
    isTopRated = df.review_scores_value == maxRating
    return df.filter(isCheapest & isTopRated).select("review_scores_value", 'beds')

    
def loadDF(df, outputPath):
    """Write the dataframe as CSV under the given path, from a single partition

    Args:
        df: pyspark.sql.dataframe.DataFrame
        outputPath: path to save files
    Return:
        None
    """
    singlePartition = df.coalesce(1)
    singlePartition.write.csv(outputPath)


def main():
    """Run the job end to end

    Starts Spark, reads '../files/part-00000.parquet', selects the
    requested rows and writes them under '../out/out_2_4.txt'.
    """
    session = startSpark()
    rawDF = extractDF(session, '../files/part-00000.parquet')
    loadDF(transformDF(rawDF), '../out/out_2_4.txt')


# Only run the pipeline when executed as a script, not on import.
if __name__ == '__main__':
    main()

In [7]:
#task3_2
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql.types import StringType
from pyspark.ml.classification import LogisticRegression, OneVsRest
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import VectorAssembler


def startSpark(name="spark-etl"):
    """Build (or reuse) the SparkSession used by the job

    Args:
        name: application name handed to the session builder
    Return:
        SparkSession object returned by getOrCreate()
    """
    builder = SparkSession.builder.appName(name)
    return builder.getOrCreate()


def extractDF(spark, filepath):
    """Read a CSV file into a dataframe

    No header or schema options are passed to the reader.

    Args:
        spark: pyspark.sql.session.SparkSession
        filepath: path to downloaded csv file
    Return:
        pyspark.sql.dataframe.DataFrame with the file's contents
    """
    return spark.read.csv(filepath)


def prepareModel(df):
    """Fit a logistic regression classifier on the iris dataframe

    Args:
        df: pyspark.sql.dataframe.DataFrame with the measurement columns
            _c0.._c3 and the species name in _c4
    Return:
        fitted logistic regression model (result of LogisticRegression.fit)
    """
    # Species names are replaced by numeric labels. The values 1/2/3 are
    # kept as they were so the predictions written by the job keep their
    # meaning.
    # NOTE(review): Spark ML classifiers usually expect labels starting
    # at 0 - confirm whether 0/1/2 would be preferable here.
    mapping = {'Iris-virginica' : "1", "Iris-setosa" : "2", "Iris-versicolor" : "3"}
    dfIris = df.withColumnRenamed('_c4', 'label')
    dfIris = dfIris.replace(to_replace=mapping, subset=['label'])
    dfIris = dfIris.withColumn("label", f.col("label").cast('int'))

    featureCols = ['_c0', '_c1', '_c2', '_c3']
    for colName in featureCols:
        # Cast to double, not int: an int cast truncates the decimal
        # measurements (5.1 -> 5, 0.2 -> 0) and throws away most of the
        # signal the model is trained on.
        dfIris = dfIris.withColumn(colName, f.col(colName).cast('double'))

    assembler = VectorAssembler(inputCols=featureCols, outputCol='features')
    finalisedData = assembler.transform(dfIris).select('features', 'label')
    lr = LogisticRegression(maxIter=10, tol=1E-6, fitIntercept=True, labelCol='label', featuresCol='features')
    return lr.fit(finalisedData)


def predictModel(spark, model):
    """Predict the class of two hard-coded iris samples

    Args:
        spark: pyspark.sql.session.SparkSession
        model: fitted model exposing transform()
    Return:
        pyspark.sql.dataframe.DataFrame with the columns features,
        rawPrediction, probability and prediction, all cast to string
    """
    featureNames = ["sepal_length", "sepal_width", "petal_length", "petal_width"]
    samples = [(5.1, 3.5, 1.4, 0.2),
               (6.2, 3.4, 5.4, 2.3)]
    predData = spark.createDataFrame(samples, featureNames)

    assembler = VectorAssembler(inputCols=featureNames, outputCol='features')
    predFeatures = assembler.transform(predData).select('features')
    predicts = model.transform(predFeatures)

    # Every output column is turned into a string - presumably so the
    # vector columns can be written as CSV afterwards.
    for colName in ("features", "rawPrediction", "probability", "prediction"):
        predicts = predicts.withColumn(colName, f.col(colName).cast(StringType()))
    return predicts


def loadDF(df, outPutPath):
    """Write the dataframe as CSV under the given path, from a single partition

    Args:
        df: pyspark.sql.dataframe.DataFrame
        outPutPath: path to save files
    Return:
        None
    """
    singlePartition = df.coalesce(1)
    singlePartition.write.csv(outPutPath)
    

def main():
    """Run the job end to end

    Starts Spark, reads '../files/iris.data', fits the model, predicts
    the sample rows and writes the predictions under '../out/out_3_2.txt'.
    """
    session = startSpark()
    irisDF = extractDF(session, '../files/iris.data')
    fitted = prepareModel(irisDF)
    loadDF(predictModel(session, fitted), '../out/out_3_2.txt')


# Only run the pipeline when executed as a script, not on import.
if __name__ == "__main__":
    main()