In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('adult_logReg').getOrCreate()

In [2]:
df = spark.read.csv('adult_income.csv', inferSchema = True, header=True)
df.show(3)

+---+---------+------+------------+---------------+--------------+---------------+-------------+-----+------+------------+------------+--------------+--------------+------+
|age|workclass|fnlwgt|   education|educational-num|marital-status|     occupation| relationship| race|   sex|capital-gain|capital-loss|hours-per-week|native-country|income|
+---+---------+------+------------+---------------+--------------+---------------+-------------+-----+------+------------+------------+--------------+--------------+------+
| 90|        ?| 77053|     HS-grad|              9|       Widowed|              ?|Not-in-family|White|Female|           0|        4356|            40| United-States| <=50K|
| 82|  Private|132870|     HS-grad|              9|       Widowed|Exec-managerial|Not-in-family|White|Female|           0|        4356|            18| United-States| <=50K|
| 66|        ?|186061|Some-college|             10|       Widowed|              ?|    Unmarried|Black|Female|           0|        4356|

In [8]:
from pyspark.sql.functions import expr

In [12]:
expr("3 + 5")

Column<b'(3 + 5)'>

In [3]:
cols = df.columns

In [4]:
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- educational-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)



In [5]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler

categoricalColumns = ["workclass", "educational-num", "marital-status", "occupation", "relationship", "race", "sex", "native-country"]
stages = []

for categoricalCol in categoricalColumns:
    
    stringIndexer = StringIndexer(inputCol = categoricalCol, outputCol = categoricalCol + 'Index')
    encoder = OneHotEncoderEstimator(inputCols=[stringIndexer.getOutputCol()], outputCols=[categoricalCol + "classVec"])
    stages += [stringIndexer, encoder]

In [6]:
label_stringIdx = StringIndexer(inputCol = 'income', outputCol = 'label')
stages += [label_stringIdx]

In [7]:
numericCols = ["age", "fnlwgt", "educational-num", "capital-gain", "capital-loss", "hours-per-week"]
assemblerInputs = [c + "classVec" for c in categoricalColumns] + numericCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages += [assembler]

In [8]:
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(df)
df = pipelineModel.transform(df)
selectedcols = ["label", "features"] + cols
df = df.select(selectedcols)
df.show(3)

+-----+--------------------+---+---------+------+------------+---------------+--------------+---------------+-------------+-----+------+------------+------------+--------------+--------------+------+
|label|            features|age|workclass|fnlwgt|   education|educational-num|marital-status|     occupation| relationship| race|   sex|capital-gain|capital-loss|hours-per-week|native-country|income|
+-----+--------------------+---+---------+------+------------+---------------+--------------+---------------+-------------+-----+------+------------+------------+--------------+--------------+------+
|  0.0|(100,[3,8,27,36,4...| 90|        ?| 77053|     HS-grad|              9|       Widowed|              ?|Not-in-family|White|Female|           0|        4356|            40| United-States| <=50K|
|  0.0|(100,[0,8,27,31,4...| 82|  Private|132870|     HS-grad|              9|       Widowed|Exec-managerial|Not-in-family|White|Female|           0|        4356|            18| United-States| <=50K|


In [42]:
display(df)

DataFrame[label: double, features: vector, age: int, workclass: string, fnlwgt: int, education: string, educational-num: int, marital-status: string, occupation: string, relationship: string, race: string, sex: string, capital-gain: int, capital-loss: int, hours-per-week: int, native-country: string, income: string]

In [10]:
train, test = df.randomSplit([0.7, 0.3], seed=100)
print(train.count())
print(test.count())

22838
9723


### Logistic Regression

In [11]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol = 'label', featuresCol = 'features', maxIter=10)
lrModel = lr.fit(train)

In [12]:
predictions = lrModel.transform(test)
predictions.take(1)

[Row(label=0.0, features=SparseVector(100, {0: 1.0, 8: 1.0, 23: 1.0, 29: 1.0, 43: 1.0, 48: 1.0, 52: 1.0, 53: 1.0, 94: 26.0, 95: 58426.0, 96: 9.0, 99: 50.0}), age=26, workclass='Private', fnlwgt=58426, education='HS-grad', educational-num=9, marital-status='Married-civ-spouse', occupation='Prof-specialty', relationship='Husband', race='White', sex='Male', capital-gain=0, capital-loss=0, hours-per-week=50, native-country='United-States', income='<=50K', rawPrediction=DenseVector([0.8109, -0.8109]), probability=DenseVector([0.6923, 0.3077]), prediction=0.0)]

In [13]:
predictions.printSchema()

root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- educational-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [14]:
selected = predictions.select("label", "prediction", "probability", "age", "occupation")
display(selected)

DataFrame[label: double, prediction: double, probability: vector, age: int, occupation: string]

In [15]:
selected.show(4)

+-----+----------+--------------------+---+--------------+
|label|prediction|         probability|age|    occupation|
+-----+----------+--------------------+---+--------------+
|  0.0|       0.0|[0.69229838711683...| 26|Prof-specialty|
|  0.0|       0.0|[0.62111209766645...| 30|Prof-specialty|
|  0.0|       0.0|[0.65841222002366...| 31|Prof-specialty|
|  0.0|       0.0|[0.65822715645265...| 32|Prof-specialty|
+-----+----------+--------------------+---+--------------+
only showing top 4 rows



In [16]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction")
print('Area Under ROC', evaluator.evaluate(predictions))

Area Under ROC 0.9014257227172442


In [17]:
evaluator.getMetricName()

'areaUnderROC'

In [18]:
print(lr.explainParams())

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features, current: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label, current: label)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)
lowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constrained optimization. The

In [19]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Create ParamGrid for Cross Validation

paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.01, 0.5, 2.0])
             .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])
             .addGrid(lr.maxIter, [1, 5, 10])
             .build())

In [20]:
cv = CrossValidator(estimator=lr, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

# Run cross validations
cvModel = cv.fit(train)

In [21]:
predictions = cvModel.transform(test)
print('Area Under ROC', evaluator.evaluate(predictions))

Area Under ROC 0.8991704438013846


### Decision Trees

In [22]:
from pyspark.ml.classification import DecisionTreeClassifier

dt = DecisionTreeClassifier(featuresCol = 'features', labelCol = 'label', maxDepth = 3)
dtModel = dt.fit(train)

In [23]:
print("numNodes = ", dtModel.numNodes)
print("depth = ", dtModel.depth)

numNodes =  11
depth =  3


In [24]:
predictions = dtModel.transform(test)
predictions.printSchema()

root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- educational-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [25]:
selected = predictions.select("label", "prediction", "probability", "age", "occupation")
display(selected)

DataFrame[label: double, prediction: double, probability: vector, age: int, occupation: string]

In [26]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)

0.7340070481047476

In [27]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
paramGrid = (ParamGridBuilder()
             .addGrid(dt.maxDepth, [1, 2, 6, 10])
             .addGrid(dt.maxBins, [20, 40, 80])
             .build())

In [28]:
# Create 5-fold CrossValidator
cv = CrossValidator(estimator=dt, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

# Run cross validations
cvModel = cv.fit(train)

In [29]:
print("numNodes = ", cvModel.bestModel.numNodes)
print("depth = ", cvModel.bestModel.depth)

numNodes =  455
depth =  10


In [30]:
predictions = cvModel.transform(test)
evaluator.evaluate(predictions)

0.7671296163653554

In [31]:
selected = predictions.select("label", "prediction", "probability", "age", "occupation")
display(selected)

DataFrame[label: double, prediction: double, probability: vector, age: int, occupation: string]

In [32]:
selected.show(3)

+-----+----------+-----------+---+--------------+
|label|prediction|probability|age|    occupation|
+-----+----------+-----------+---+--------------+
|  0.0|       1.0|  [0.4,0.6]| 26|Prof-specialty|
|  0.0|       0.0|[0.75,0.25]| 30|Prof-specialty|
|  0.0|       0.0|[0.75,0.25]| 31|Prof-specialty|
+-----+----------+-----------+---+--------------+
only showing top 3 rows



### Random Forest

In [33]:
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(featuresCol = 'features', labelCol = 'label')
rfModel = rf.fit(train)
predictions = rfModel.transform(test)
predictions.printSchema()

root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- educational-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- income: string (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)



In [34]:
selected = predictions.select("label", "prediction", "probability", "age", "occupation")
display(selected)

DataFrame[label: double, prediction: double, probability: vector, age: int, occupation: string]

In [35]:
selected.show(3)

+-----+----------+--------------------+---+--------------+
|label|prediction|         probability|age|    occupation|
+-----+----------+--------------------+---+--------------+
|  0.0|       0.0|[0.68554302724600...| 26|Prof-specialty|
|  0.0|       0.0|[0.65173332787567...| 30|Prof-specialty|
|  0.0|       0.0|[0.65173332787567...| 31|Prof-specialty|
+-----+----------+--------------------+---+--------------+
only showing top 3 rows



In [36]:
evaluator = BinaryClassificationEvaluator()
evaluator.evaluate(predictions)

0.8897433744950282

In [37]:
paramGrid = (ParamGridBuilder()
             .addGrid(rf.maxDepth, [2, 4, 6])
             .addGrid(rf.maxBins, [20, 60])
             .addGrid(rf.numTrees, [5, 20])
             .build())

# Create 5-fold CrossValidator
cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

# Run cross validations.  This can take about 6 minutes since it is training over 20 trees!
cvModel = cv.fit(train)

In [38]:
predictions = cvModel.transform(test)
evaluator.evaluate(predictions)

0.8955635618714743

In [39]:
selected = predictions.select("label", "prediction", "probability", "age", "occupation")
display(selected)

DataFrame[label: double, prediction: double, probability: vector, age: int, occupation: string]

In [40]:
selected.show(3)

+-----+----------+--------------------+---+--------------+
|label|prediction|         probability|age|    occupation|
+-----+----------+--------------------+---+--------------+
|  0.0|       0.0|[0.68814211202995...| 26|Prof-specialty|
|  0.0|       0.0|[0.66209560473596...| 30|Prof-specialty|
|  0.0|       0.0|[0.66209560473596...| 31|Prof-specialty|
+-----+----------+--------------------+---+--------------+
only showing top 3 rows



### Make Predictions

In [41]:
bestModel = cvModel.bestModel
final_predictions = bestModel.transform(df)
evaluator.evaluate(final_predictions)

0.9003037772574465

# Previous task related to EDA of retail dataset

### 5. Identify the customer ID, which bought the least number of products.

In [1]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("").getOrCreate()

from pyspark.sql.functions import sum, asc

In [2]:
df = spark.read.format("csv")\
    .option("header","true")\
    .option("inferschema","true")\
    .load("online-retail-dataset.csv")

df.show(5)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
only showing top 5 rows



In [3]:
df.take(1)

[Row(InvoiceNo='536365', StockCode='85123A', Description='WHITE HANGING HEART T-LIGHT HOLDER', Quantity=6, InvoiceDate='12/1/2010 8:26', UnitPrice=2.55, CustomerID=17850, Country='United Kingdom')]

In [40]:
df.select("CustomerID","Quantity").groupBy("CustomerID").sum().sort(asc("sum(Quantity)")).show()

+----------+---------------+-------------+
|CustomerID|sum(CustomerID)|sum(Quantity)|
+----------+---------------+-------------+
|     16546|         512926|         -303|
|     15823|         268991|         -283|
|     14213|          71065|         -244|
|     16742|          66968|         -189|
|     16252|         341292|         -158|
|     17307|          17307|         -144|
|     17548|         298316|         -132|
|     18256|          73024|          -70|
|     12666|          25332|          -56|
|     15638|          31276|          -52|
|     15728|          62912|          -34|
|     17603|         264045|          -31|
|     13958|          69790|          -23|
|     16579|          16579|          -12|
|     13829|          13829|          -12|
|     18141|          18141|          -12|
|     14777|          44331|           -9|
|     13693|          54772|           -6|
|     14627|          73135|           -5|
|     16262|          48786|           -5|
+----------

In [4]:
df.filter("Quantity < 1").show()

+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|    InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+---------------+---------+----------+--------------+
|  C536379|        D|            Discount|      -1| 12/1/2010 9:41|     27.5|     14527|United Kingdom|
|  C536383|   35004C|SET OF 3 COLOURED...|      -1| 12/1/2010 9:49|     4.65|     15311|United Kingdom|
|  C536391|    22556|PLASTERS IN TIN C...|     -12|12/1/2010 10:24|     1.65|     17548|United Kingdom|
|  C536391|    21984|PACK OF 12 PINK P...|     -24|12/1/2010 10:24|     0.29|     17548|United Kingdom|
|  C536391|    21983|PACK OF 12 BLUE P...|     -24|12/1/2010 10:24|     0.29|     17548|United Kingdom|
|  C536391|    21980|PACK OF 12 RED RE...|     -24|12/1/2010 10:24|     0.29|     17548|United Kingdom|
|  C536391|    21484|CHICK GREY HOT WA...|     -12|12/1/2010 10:

In [5]:
df.filter("Quantity < 1").count()

10624

In [6]:
non_neg_df = df.filter("Quantity !< 0")
non_neg_df.show()

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|
|   536365|   84029G|KNITTED UNION FLA...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|    22752|SET 7 BABUSHKA NE...|       2|12/1/2010 8:26|     7.65|     17850|United Kingdom|
|   536365|    21730|GLASS STAR FROSTE...|       6|12/1/2010 8:26|     4.

In [7]:
non_neg_df.select("CustomerID","Quantity").groupBy("CustomerID").sum().sort(asc("sum(Quantity)")).show()

+----------+---------------+-------------+
|CustomerID|sum(CustomerID)|sum(Quantity)|
+----------+---------------+-------------+
|     15823|          15823|            1|
|     16742|          16742|            1|
|     17956|          17956|            1|
|     15802|          15802|            1|
|     17846|          17846|            1|
|     13452|          26904|            2|
|     15510|          15510|            2|
|     14792|          29584|            2|
|     16257|          32514|            2|
|     17102|          17102|            2|
|     18268|          18268|            2|
|     16454|          32908|            3|
|     16738|          16738|            3|
|     17408|          34816|            3|
|     16162|          32324|            4|
|     13391|          13391|            4|
|     13307|          13307|            4|
|     18233|          18233|            4|
|     12789|          51156|            4|
|     16765|          16765|            4|
+----------