In [2]:
import pyspark
from pyspark.sql import SQLContext
from pyspark.sql import DataFrameNaFunctions
from pyspark.ml import Pipeline
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import Binarizer
from pyspark.ml.feature import VectorAssembler, StringIndexer, VectorIndexer

In [3]:
# Reuse the active SparkContext when one already exists. Calling SparkContext()
# a second time in the same kernel (e.g. when this cell is re-run) raises
# "ValueError: Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate()

sqlContext = SQLContext(sc)

# Load the daily weather CSV: the first line is treated as a header and
# column types are inferred from the data.
df = sqlContext.read.load('file:///home/hadoop/Downloads/big-data-4/daily_weather.csv', 
                          format='com.databricks.spark.csv', 
                          header='true',inferSchema='true')

# Display the column names as the cell output.
df.columns

23/04/10 17:28:21 WARN Utils: Your hostname, ubuntu resolves to a loopback address: 127.0.1.1; using 10.0.2.15 instead (on interface enp0s3)
23/04/10 17:28:21 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/04/10 17:28:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


                                                                                

['number',
 'air_pressure_9am',
 'air_temp_9am',
 'avg_wind_direction_9am',
 'avg_wind_speed_9am',
 'max_wind_direction_9am',
 'max_wind_speed_9am',
 'rain_accumulation_9am',
 'rain_duration_9am',
 'relative_humidity_9am',
 'relative_humidity_3pm']

These are the columns in the weather data. The 9am measurements listed below are the ones we will use as features for the decision tree classifier:

In [4]:
# Names of the input columns (all 9am measurements) that are fed to the classifier.
featureColumns = [
    "air_pressure_9am",
    "air_temp_9am",
    "avg_wind_direction_9am",
    "avg_wind_speed_9am",
    "max_wind_direction_9am",
    "max_wind_speed_9am",
    "rain_accumulation_9am",
    "rain_duration_9am",
]

In [5]:
# Remove the 'number' column; it is neither one of the feature columns nor the label source.
df = df.drop("number")

In [6]:
# Discard every row that contains at least one missing value.
df = df.dropna()

In [7]:
# (number of rows, number of columns) remaining after the clean-up steps.
(df.count(), len(df.columns))

                                                                                

(1064, 10)

**Create categorical variable.** We create a categorical variable to denote if the humidity is not low. If the value is less than 25%, then we want the categorical value to be 0, otherwise the categorical value should be 1. We create this categorical variable as a column in a DataFrame using Binarizer:

In [8]:
# Binarizer maps values strictly greater than the threshold to 1.0 and all
# other values to 0.0. A threshold just below 25 therefore labels a humidity
# of exactly 25% as 1 ("not low"), matching the "less than 25% -> 0" rule.
binarizer = Binarizer(
    threshold=24.99999,
    inputCol="relative_humidity_3pm",
    outputCol="label",
)

# Add the binary "label" column to a new DataFrame.
binarizedDF = binarizer.transform(df)

The `threshold` argument specifies the threshold value for the variable, `inputCol` is the input column to read, and `outputCol` is the name of the new categorical column. The second line applies the `Binarizer` and creates a new DataFrame with the categorical column. We can look at the first 20 rows of the new DataFrame:

In [9]:
# Show the source humidity next to its derived label (20 rows, the default for show()).
binarizedDF.select("relative_humidity_3pm", "label").show(n=20)

+---------------------+-----+
|relative_humidity_3pm|label|
+---------------------+-----+
|   36.160000000000494|  1.0|
|     19.4265967985621|  0.0|
|   14.460000000000045|  0.0|
|   12.742547353761848|  0.0|
|    76.74000000000046|  1.0|
|   33.930000000000256|  1.0|
|   21.385656725200974|  0.0|
|    74.92000000000041|  1.0|
|   24.030000000000427|  0.0|
|     68.0500000000012|  1.0|
|    32.13000000000024|  1.0|
|     79.0900000000002|  1.0|
|    58.43000000000119|  1.0|
|   27.990000000000173|  1.0|
|   24.369999999999948|  0.0|
|   14.801705962979918|  0.0|
|    20.75568332171184|  0.0|
|    45.87000000000005|  1.0|
|    7.740000000000088|  0.0|
|   14.649909361535952|  0.0|
+---------------------+-----+
only showing top 20 rows



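The first row's humidity value is greater than 25%, so its label is 1. The next three rows have humidity values below 25%, so their labels are 0. In general, every row with a humidity value of at least 25% is labeled 1 and every row below 25% is labeled 0.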

**Aggregate features.** Let's aggregate the features we will use to make predictions into a single column:

In [10]:
# Combine the individual feature columns into one vector column named "features".
assembler = VectorAssembler(
    inputCols=featureColumns,
    outputCol="features",
)

# New DataFrame that carries the assembled feature vector alongside the existing columns.
assembled = assembler.transform(binarizedDF)

The `inputCols` argument specifies our list of column names we defined earlier, and `outputCol` is the name of the new column. The second line creates a new DataFrame with the aggregated features in a column.

In [11]:
# Print only the first row to check that the "features" column was added.
assembled.show(n=1)

+-----------------+-----------------+----------------------+------------------+----------------------+------------------+---------------------+-----------------+---------------------+---------------------+-----+--------------------+
| air_pressure_9am|     air_temp_9am|avg_wind_direction_9am|avg_wind_speed_9am|max_wind_direction_9am|max_wind_speed_9am|rain_accumulation_9am|rain_duration_9am|relative_humidity_9am|relative_humidity_3pm|label|            features|
+-----------------+-----------------+----------------------+------------------+----------------------+------------------+---------------------+-----------------+---------------------+---------------------+-----+--------------------+
|918.0600000000087|74.82200000000041|                 271.1| 2.080354199999768|    295.39999999999986| 2.863283199999908|                  0.0|              0.0|    42.42000000000046|   36.160000000000494|  1.0|[918.060000000008...|
+-----------------+-----------------+----------------------+--------

In [12]:
# Preview the first two rows as a pandas DataFrame. Limiting in Spark *before*
# converting avoids collecting the entire DataFrame onto the driver just to
# display two rows.
assembled.limit(2).toPandas()

[Stage 7:>                                                          (0 + 1) / 1]                                                                                

Unnamed: 0,air_pressure_9am,air_temp_9am,avg_wind_direction_9am,avg_wind_speed_9am,max_wind_direction_9am,max_wind_speed_9am,rain_accumulation_9am,rain_duration_9am,relative_humidity_9am,relative_humidity_3pm,label,features
0,918.06,74.822,271.1,2.080354,295.4,2.863283,0.0,0.0,42.42,36.16,1.0,"[918.0600000000087, 74.82200000000041, 271.1, ..."
1,917.347688,71.403843,101.935179,2.443009,140.471548,3.533324,0.0,0.0,24.328697,19.426597,0.0,"[917.3476881177097, 71.40384263106537, 101.935..."


Split training and test data by calling `randomSplit()`:

In [13]:
# 80% / 20% train/test split; the seed is fixed so the split can be repeated.
trainingData, testData = assembled.randomSplit([0.8, 0.2], seed=13234)

In [14]:
# (rows in the training split, rows in the test split)
(trainingData.count(), testData.count())

                                                                                

(846, 218)

**Create the decision tree classifier.** We configure a `DecisionTreeClassifier`:

In [15]:
# Decision tree configuration: predict "label" from "features"; growth stops at
# depth 5 or when a node would hold fewer than 20 samples; splits use Gini impurity.
dt = DecisionTreeClassifier(
    labelCol="label",
    featuresCol="features",
    maxDepth=5,
    minInstancesPerNode=20,
    impurity="gini",
)

The `labelCol` argument is the column we are trying to predict, \
`featuresCol` specifies the aggregated features column, \
`maxDepth` is a stopping criterion for tree induction based on the maximum depth of the tree, \
`minInstancesPerNode` is a stopping criterion for tree induction based on the minimum number of samples in a node, and \
`impurity` is the impurity measure used to split nodes.

In [16]:
# Single-stage pipeline containing only the decision tree estimator.
pipeline = Pipeline().setStages([dt])

# Fitting the pipeline trains the tree on the training split.
model = pipeline.fit(trainingData)

                                                                                

In [17]:
# Apply the fitted model to the held-out test split; the result includes a "prediction" column.
predictions = model.transform(testData)

In [18]:
# Compare the predicted class with the true label for the first 20 test rows.
predictions.select("prediction", "label").show(n=20)

+----------+-----+
|prediction|label|
+----------+-----+
|       1.0|  1.0|
|       1.0|  1.0|
|       0.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       0.0|  0.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       0.0|  0.0|
|       1.0|  1.0|
|       1.0|  1.0|
|       0.0|  1.0|
|       0.0|  1.0|
|       0.0|  1.0|
|       1.0|  1.0|
+----------+-----+
only showing top 20 rows



[Stage 28:>                                                         (0 + 1) / 1]                                                                                

Save the predictions to a CSV file:

In [22]:
# Write the test-set predictions next to their true labels in CSV format.
# mode="overwrite" replaces output left by a previous run; with the default
# save mode the write fails when the target path already exists, so this cell
# could not be re-run.
# Note: Spark writes a directory of part files at this path, not a single file.
predictions.select("prediction", "label")\
            .write\
            .save(path="file:///home/hadoop/big-data-processing/04_Machine_learning_with_Big_Data/predictions.csv",
                  format="com.databricks.spark.csv",
                  mode="overwrite",
                  header='true')

                                                                                