In [155]:
# Stop a SparkContext left over from an earlier run so a new one can be
# created below. On a fresh kernel `sc` is not defined yet, so there is
# nothing to stop and the call is skipped instead of raising NameError.
if 'sc' in globals():
    sc.stop()

In [156]:
from pyspark import SparkContext, SparkConf
# Configure the application name and a local master with 4 worker threads,
# then start the SparkContext with that configuration.
config = SparkConf()
config.setAppName("PysparkSession")
config.setMaster("local[4]")
sc = SparkContext(conf=config)

In [157]:
# Build the SparkSession used for DataFrame work (getOrCreate reuses an
# existing session if one is already active).
from pyspark.sql import SparkSession
session_builder = SparkSession.builder.appName("PysparkSession")
spark = session_builder.getOrCreate()

In [158]:
# Display the SparkContext to confirm it was created.
sc

In [159]:
# Display the SparkSession to confirm it was created.
spark

#### Problem statement: We have been provided with an online e-commerce website dataset with various input features, such as duration, product-related duration, region, month, bounce rates, etc.
* Target : Revenue (yes/no)

In [160]:
from pyspark.sql.functions import *

#### 1. Read Dataset

In [3]:
# Load the CSV into a Spark DataFrame: the first row supplies the column
# names and the column types are inferred from the data.
# NOTE(review): this is an absolute local path; adjust it for your machine.
csv_path = "file:///home/hadoop/Downloads/Online Shoppers Intention.csv"
online_shopper = spark.read.csv(csv_path, header=True, inferSchema=True)
online_shopper.head()

KeyboardInterrupt: 

In [162]:
# Print the first rows of the raw dataset as a text table.
online_shopper.show()

+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+-----------+----------+----------+-----+----------------+-------+------+-----------+-----------------+-------+-------+
|Administrative|Administrative_Duration|Informational|Informational_Duration|ProductRelated|ProductRelated_Duration|BounceRates|  ExitRates|PageValues|SpecialDay|Month|OperatingSystems|Browser|Region|TrafficType|      VisitorType|Weekend|Revenue|
+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+-----------+----------+----------+-----+----------------+-------+------+-----------+-----------------+-------+-------+
|             0|                    0.0|            0|                   0.0|             1|                    0.0|        0.2|        0.2|       0.0|       0.0|  Feb|               1|      1|     1|          1|Returning_Visitor|  false|  false|
|           

#### 2. Show schema of DataFrame

In [163]:
# Print each column's name, inferred type and nullability.
online_shopper.printSchema()

root
 |-- Administrative: integer (nullable = true)
 |-- Administrative_Duration: double (nullable = true)
 |-- Informational: integer (nullable = true)
 |-- Informational_Duration: double (nullable = true)
 |-- ProductRelated: integer (nullable = true)
 |-- ProductRelated_Duration: double (nullable = true)
 |-- BounceRates: double (nullable = true)
 |-- ExitRates: double (nullable = true)
 |-- PageValues: double (nullable = true)
 |-- SpecialDay: double (nullable = true)
 |-- Month: string (nullable = true)
 |-- OperatingSystems: integer (nullable = true)
 |-- Browser: integer (nullable = true)
 |-- Region: integer (nullable = true)
 |-- TrafficType: integer (nullable = true)
 |-- VisitorType: string (nullable = true)
 |-- Weekend: boolean (nullable = true)
 |-- Revenue: boolean (nullable = true)



#### 3. Data wrangling
* Are there any missing values?

In [164]:
# Register the raw DataFrame as the temporary view "shopping" so it can be
# queried by name through Spark SQL.
online_shopper.createOrReplaceTempView("shopping")

In [165]:
# Count the null entries in every column: `when` yields a value only for null
# rows, and `count` tallies those values per column.
null_counts = [
    count(when(isnull(column_name), column_name)).alias(column_name)
    for column_name in online_shopper.columns
]
online_shopper.select(null_counts).show()

+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+---------+----------+----------+-----+----------------+-------+------+-----------+-----------+-------+-------+
|Administrative|Administrative_Duration|Informational|Informational_Duration|ProductRelated|ProductRelated_Duration|BounceRates|ExitRates|PageValues|SpecialDay|Month|OperatingSystems|Browser|Region|TrafficType|VisitorType|Weekend|Revenue|
+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+---------+----------+----------+-----+----------------+-------+------+-----------+-----------+-------+-------+
|            14|                     14|           14|                    14|            14|                     14|         14|       14|         0|         0|    0|               0|      0|     0|          0|          0|      0|      0|
+--------------+-----------------------+----

In [166]:
# Drop every row that has a null value in any column (apply only if this
# affects less than 5% of the data).
shoppers_df = online_shopper.dropna(how='any')

In [167]:
# Number of rows remaining after the rows containing nulls were dropped.
shoppers_df.count()

12316

In [168]:
!pip install numpy  pandas  matplotlib seaborn

Defaulting to user installation because normal site-packages is not writeable


#### 4. Data Preprocessing
* Transformation of categorical values into numerical values

In [169]:
# List the distinct VisitorType categories that need to be encoded.
shoppers_df.select('VisitorType').distinct().show()

+-----------------+
|      VisitorType|
+-----------------+
|      New_Visitor|
|            Other|
|Returning_Visitor|
+-----------------+



In [170]:
# Preview the first five rows as a pandas DataFrame. Limiting in Spark first
# means only those rows are collected to the driver, instead of converting the
# whole dataset to pandas just to display its head.
shoppers_df.limit(5).toPandas()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend,Revenue
0,0,0.0,0,0.0,1,0.0,0.2,0.2,0.0,0.0,Feb,1,1,1,1,Returning_Visitor,False,False
1,0,0.0,0,0.0,2,64.0,0.0,0.1,0.0,0.0,Feb,2,2,1,2,Returning_Visitor,False,False
2,0,-1.0,0,-1.0,1,-1.0,0.2,0.2,0.0,0.0,Feb,4,1,9,3,Returning_Visitor,False,False
3,0,0.0,0,0.0,2,2.666667,0.05,0.14,0.0,0.0,Feb,3,2,2,4,Returning_Visitor,False,False
4,0,0.0,0,0.0,10,627.5,0.02,0.05,0.0,0.0,Feb,3,3,1,4,Returning_Visitor,True,False


In [171]:
# Replace each VisitorType category with a numeric code (still stored as a
# string): New_Visitor -> "0", Other -> "1", Returning_Visitor -> "2".
# The three replacements are applied in sequence to one column expression.
visitor_code = regexp_replace('VisitorType', 'New_Visitor', "0")
visitor_code = regexp_replace(visitor_code, 'Other', "1")
visitor_code = regexp_replace(visitor_code, 'Returning_Visitor', "2")
shoppers_df = shoppers_df.withColumn('VisitorType', visitor_code)

* StringIndexer() - Encode categorical string values as numerical label indices (0, 1, 2, ...)

In [172]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

In [173]:
# Encode the Month strings as numeric indices in a new Month_index column.
indexer = StringIndexer(inputCol='Month', outputCol='Month_index')
# Fit the indexer on the DataFrame, then use the fitted model to add the column.
indexer_model = indexer.fit(shoppers_df)
shoppers_df1 = indexer_model.transform(shoppers_df)
shoppers_df1.show()

+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+-----------+----------+----------+-----+----------------+-------+------+-----------+-----------+-------+-------+-----------+
|Administrative|Administrative_Duration|Informational|Informational_Duration|ProductRelated|ProductRelated_Duration|BounceRates|  ExitRates|PageValues|SpecialDay|Month|OperatingSystems|Browser|Region|TrafficType|VisitorType|Weekend|Revenue|Month_index|
+--------------+-----------------------+-------------+----------------------+--------------+-----------------------+-----------+-----------+----------+----------+-----+----------------+-------+------+-----------+-----------+-------+-------+-----------+
|             0|                    0.0|            0|                   0.0|             1|                    0.0|        0.2|        0.2|       0.0|       0.0|  Feb|               1|      1|     1|          1|          2|  false|  false| 

In [174]:
# Confirm that the Month_index column was added.
shoppers_df1.printSchema()

root
 |-- Administrative: integer (nullable = true)
 |-- Administrative_Duration: double (nullable = true)
 |-- Informational: integer (nullable = true)
 |-- Informational_Duration: double (nullable = true)
 |-- ProductRelated: integer (nullable = true)
 |-- ProductRelated_Duration: double (nullable = true)
 |-- BounceRates: double (nullable = true)
 |-- ExitRates: double (nullable = true)
 |-- PageValues: double (nullable = true)
 |-- SpecialDay: double (nullable = true)
 |-- Month: string (nullable = true)
 |-- OperatingSystems: integer (nullable = true)
 |-- Browser: integer (nullable = true)
 |-- Region: integer (nullable = true)
 |-- TrafficType: integer (nullable = true)
 |-- VisitorType: string (nullable = true)
 |-- Weekend: boolean (nullable = true)
 |-- Revenue: boolean (nullable = true)
 |-- Month_index: double (nullable = false)



In [175]:
# Drop the original string Month column; Month_index carries the encoded value.
shoppers_df2 = shoppers_df1.drop('Month')

In [176]:
# Confirm that the Month column is gone.
shoppers_df2.printSchema()

root
 |-- Administrative: integer (nullable = true)
 |-- Administrative_Duration: double (nullable = true)
 |-- Informational: integer (nullable = true)
 |-- Informational_Duration: double (nullable = true)
 |-- ProductRelated: integer (nullable = true)
 |-- ProductRelated_Duration: double (nullable = true)
 |-- BounceRates: double (nullable = true)
 |-- ExitRates: double (nullable = true)
 |-- PageValues: double (nullable = true)
 |-- SpecialDay: double (nullable = true)
 |-- OperatingSystems: integer (nullable = true)
 |-- Browser: integer (nullable = true)
 |-- Region: integer (nullable = true)
 |-- TrafficType: integer (nullable = true)
 |-- VisitorType: string (nullable = true)
 |-- Weekend: boolean (nullable = true)
 |-- Revenue: boolean (nullable = true)
 |-- Month_index: double (nullable = false)



In [177]:
from pyspark.sql.types import IntegerType
# Convert the VisitorType codes from strings to integers.
visitor_type_int = col('VisitorType').cast(IntegerType())
shoppers_df2 = shoppers_df2.withColumn('VisitorType', visitor_type_int)

In [178]:
# List the column names available for building the feature vector.
print(shoppers_df2.columns)

['Administrative', 'Administrative_Duration', 'Informational', 'Informational_Duration', 'ProductRelated', 'ProductRelated_Duration', 'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay', 'OperatingSystems', 'Browser', 'Region', 'TrafficType', 'VisitorType', 'Weekend', 'Revenue', 'Month_index']


In [179]:
# Cast the boolean Revenue label to an integer column so it can be used as
# the classifier's label.
revenue_as_int = col('Revenue').cast('integer')
shoppers_df3 = shoppers_df2.withColumn('Revenue', revenue_as_int)

In [2]:
# Confirm the column types after the Revenue cast.
shoppers_df3.printSchema()

NameError: name 'shoppers_df3' is not defined

#### 5. Features Vector

In [181]:
from pyspark.ml.feature import VectorAssembler

In [182]:
# Input columns to combine into a single vector column named 'feature'.
# Revenue is left out because it is the prediction target.
feature_columns = [
    'Administrative', 'Administrative_Duration',
    'Informational', 'Informational_Duration',
    'ProductRelated', 'ProductRelated_Duration',
    'BounceRates', 'ExitRates', 'PageValues', 'SpecialDay',
    'OperatingSystems', 'Browser', 'Region', 'TrafficType',
    'VisitorType', 'Weekend', 'Month_index',
]
vector_assembler = VectorAssembler(inputCols=feature_columns, outputCol='feature')

In [183]:
# Append the assembled 'feature' vector column and print it without truncation.
shoppers_df4 = vector_assembler.transform(shoppers_df3)
shoppers_df4.select('feature').show(truncate=False)

+---------------------------------------------------------------------------------------------------+
|feature                                                                                            |
+---------------------------------------------------------------------------------------------------+
|(17,[4,6,7,10,11,12,13,14,16],[1.0,0.2,0.2,1.0,1.0,1.0,1.0,2.0,9.0])                               |
|(17,[4,5,7,10,11,12,13,14,16],[2.0,64.0,0.1,2.0,2.0,1.0,2.0,2.0,9.0])                              |
|[0.0,-1.0,0.0,-1.0,1.0,-1.0,0.2,0.2,0.0,0.0,4.0,1.0,9.0,3.0,2.0,0.0,9.0]                           |
|(17,[4,5,6,7,10,11,12,13,14,16],[2.0,2.666666667,0.05,0.14,3.0,2.0,2.0,4.0,2.0,9.0])               |
|[0.0,0.0,0.0,0.0,10.0,627.5,0.02,0.05,0.0,0.0,3.0,3.0,1.0,4.0,2.0,1.0,9.0]                         |
|(17,[4,5,6,7,10,11,12,13,14,16],[19.0,154.2166667,0.015789474,0.024561404,2.0,2.0,1.0,3.0,2.0,9.0])|
|[0.0,-1.0,0.0,-1.0,1.0,-1.0,0.2,0.2,0.0,0.4,2.0,4.0,3.0,3.0,2.0,0.0,9.0]         

In [1]:
# Randomly split the rows into roughly 80% training and 20% test data; the
# fixed seed keeps the split repeatable.
train, test = shoppers_df4.randomSplit(weights=[0.8, 0.2], seed=234)

NameError: name 'shoppers_df4' is not defined

In [None]:
# Print the first rows of the training split.
train.show()

In [None]:
# Preview the model inputs: the assembled feature vector and the label.
# Without an action such as show(), the cell displays only the DataFrame's
# repr and never the rows themselves.
train.select(['feature', 'Revenue']).show()

#### 7. Decision tree classifier

In [None]:
from pyspark.ml.classification import DecisionTreeClassifier
# Configure a decision tree that reads the 'feature' vector and predicts the
# 'Revenue' label, then fit it on the training split.
tree = DecisionTreeClassifier(
    labelCol='Revenue',
    featuresCol='feature',
)
decision_model = tree.fit(train)


In [188]:
# Apply the fitted decision tree to the held-out test split; the result
# includes a 'prediction' column alongside the original columns.
predictions = decision_model.transform(test)

In [189]:
# Compare the actual Revenue label with the model's prediction, row by row.
predictions.select('feature', 'Revenue', 'prediction').show()

+--------------------+-------+----------+
|             feature|Revenue|prediction|
+--------------------+-------+----------+
|[0.0,-1.0,0.0,-1....|      0|       0.0|
|[0.0,-1.0,0.0,-1....|      0|       0.0|
|[0.0,-1.0,0.0,-1....|      0|       0.0|
|[0.0,-1.0,0.0,-1....|      0|       0.0|
|[0.0,-1.0,0.0,-1....|      0|       0.0|
|(17,[6,7,10,11,12...|      0|       0.0|
|(17,[4,7,9,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
|(17,[4,6,7,10,11,...|      0|       0.0|
+--------------------+-------+----

#### 8. Classification Metrics

In [190]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
# Score the test-set predictions against the Revenue label using accuracy.
# NOTE(review): accuracy alone can be misleading if the Revenue classes are
# imbalanced - consider also reporting f1 or recall; confirm class balance.
evaluator = MulticlassClassificationEvaluator(
    labelCol="Revenue",
    predictionCol="prediction",
    metricName="accuracy",
)
accuracy = evaluator.evaluate(predictions)

In [191]:
# Display the accuracy computed on the test split.
accuracy

0.883921246923708