In [1]:
import os
import sys
os.environ["SPARK_HOME"] = "/usr/hdp/current/spark2-client"
os.environ["PYLIB"] = os.environ["SPARK_HOME"] + "/python/lib"
sys.path.insert(0, os.environ["PYLIB"] + "/py4j-0.10.4-src.zip")
sys.path.insert(0, os.environ["PYLIB"] + "/pyspark.zip")

In [2]:
from pyspark.conf import SparkConf
from pyspark import SparkContext
from pyspark.sql import SparkSession

conf = SparkConf().setAppName("Income Data Set").setMaster('local')
sc = SparkContext(conf=conf)
spark = SparkSession(sc)

In [3]:
sc

In [4]:
from pyspark.sql.types import *
from pyspark.sql.functions import *

In [5]:
## Define Schema
IncomeDataSchema = StructType([
    StructField("age", IntegerType(), True),
    StructField("workclass", StringType(), True),
    StructField("fnlwgt", FloatType(), True),
    StructField("education", StringType(), True),
    StructField("education_num", FloatType(), True),
    StructField("maritalstatus", StringType(), True),
    StructField("occupation", StringType(), True),
    StructField("relationship", StringType(), True),        
    StructField("race", StringType(), True),
    StructField("sex", StringType(), True),
    StructField("capitalgain", FloatType(), True),
    StructField("capitalloss", FloatType(), True),
    StructField("hoursperweek", FloatType(), True),
    StructField("nativecountry", StringType(), True),
    StructField("class", StringType(), True)])

In [34]:
## Read data and create a dataframe
incomeDF = spark.read.csv(header=None,nullValue="?",
                         ignoreLeadingWhiteSpace=True,ignoreTrailingWhiteSpace=True,schema=IncomeDataSchema,                                                  
                         path="file:///home/2019B42/Lab04Spark/20180721_Batch42_CSE7321c_Lab03_Incomedata_PC.csv") 

In [35]:
incomeDF.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: float (nullable = true)
 |-- education: string (nullable = true)
 |-- education_num: float (nullable = true)
 |-- maritalstatus: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capitalgain: float (nullable = true)
 |-- capitalloss: float (nullable = true)
 |-- hoursperweek: float (nullable = true)
 |-- nativecountry: string (nullable = true)
 |-- class: string (nullable = true)



In [8]:
print("No. of Columns = {}".format(len(incomeDF.columns)))

print('No. of Records = {}'.format(incomeDF.count()))

No. of Columns = 15
No. of Records = 32561


In [36]:
## count null values
incomeDF.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in incomeDF.columns]).show(truncate=False)

+---+---------+------+---------+-------------+-------------+----------+------------+----+---+-----------+-----------+------------+-------------+-----+
|age|workclass|fnlwgt|education|education_num|maritalstatus|occupation|relationship|race|sex|capitalgain|capitalloss|hoursperweek|nativecountry|class|
+---+---------+------+---------+-------------+-------------+----------+------------+----+---+-----------+-----------+------------+-------------+-----+
|0  |1836     |0     |0        |0            |0            |1843      |0           |0   |0  |0          |0          |0           |583          |0    |
+---+---------+------+---------+-------------+-------------+----------+------------+----+---+-----------+-----------+------------+-------------+-----+



In [10]:
## drop the rows with NA values
incomeDF = incomeDF.dropna()

In [11]:
## verify
incomeDF.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in incomeDF.columns]).show(truncate=False)

+---+---------+------+---------+-------------+-------------+----------+------------+----+---+-----------+-----------+------------+-------------+-----+
|age|workclass|fnlwgt|education|education_num|maritalstatus|occupation|relationship|race|sex|capitalgain|capitalloss|hoursperweek|nativecountry|class|
+---+---------+------+---------+-------------+-------------+----------+------------+----+---+-----------+-----------+------------+-------------+-----+
|0  |0        |0     |0        |0            |0            |0         |0           |0   |0  |0          |0          |0           |0            |0    |
+---+---------+------+---------+-------------+-------------+----------+------------+----+---+-----------+-----------+------------+-------------+-----+



In [12]:
## group by gender and maritalstatus
incomeDF.groupBy("sex", "maritalstatus").count().orderBy(incomeDF.sex).show()

+------+--------------------+-----+
|   sex|       maritalstatus|count|
+------+--------------------+-----+
|Female|   Married-AF-spouse|   12|
|Female|           Separated|  574|
|Female|             Widowed|  686|
|Female|            Divorced| 2529|
|Female|  Married-civ-spouse| 1480|
|Female|       Never-married| 4312|
|Female|Married-spouse-ab...|  189|
|  Male|   Married-AF-spouse|    9|
|  Male|             Widowed|  141|
|  Male|            Divorced| 1685|
|  Male|Married-spouse-ab...|  181|
|  Male|  Married-civ-spouse|12585|
|  Male|           Separated|  365|
|  Male|       Never-married| 5414|
+------+--------------------+-----+



In [13]:
## column references
incomeDF.select(expr("class AS target")).show(5)

+------+
|target|
+------+
| <=50K|
| <=50K|
| <=50K|
| <=50K|
| <=50K|
+------+
only showing top 5 rows



In [14]:
## adding a new column total gain = capitalgain+capitalloss
incomeDF = incomeDF.withColumn("totalgain", lit(1))
incomeDF = incomeDF.withColumn("totalgain", incomeDF["capitalgain"] + incomeDF["capitalloss"])
#tempDF = trainDF.withColumn("SameCategoryCode", trainDF["Product_Category_1"] == trainDF["Product_Category_2"])
incomeDF.show(4, truncate = False)

+---+----------------+--------+---------+-------------+------------------+-----------------+-------------+-----+----+-----------+-----------+------------+-------------+-----+---------+
|age|workclass       |fnlwgt  |education|education_num|maritalstatus     |occupation       |relationship |race |sex |capitalgain|capitalloss|hoursperweek|nativecountry|class|totalgain|
+---+----------------+--------+---------+-------------+------------------+-----------------+-------------+-----+----+-----------+-----------+------------+-------------+-----+---------+
|39 |State-gov       |77516.0 |Bachelors|13.0         |Never-married     |Adm-clerical     |Not-in-family|White|Male|2174.0     |0.0        |40.0        |United-States|<=50K|2174.0   |
|50 |Self-emp-not-inc|83311.0 |Bachelors|13.0         |Married-civ-spouse|Exec-managerial  |Husband      |White|Male|0.0        |0.0        |13.0        |United-States|<=50K|0.0      |
|38 |Private         |215646.0|HS-grad  |9.0          |Divorced          |H

In [15]:
## remove the capitalgain and capitalloss columns
incomeDF = incomeDF.drop("capitalgain","capitalloss")
incomeDF.show(4, truncate = False)

+---+----------------+--------+---------+-------------+------------------+-----------------+-------------+-----+----+------------+-------------+-----+---------+
|age|workclass       |fnlwgt  |education|education_num|maritalstatus     |occupation       |relationship |race |sex |hoursperweek|nativecountry|class|totalgain|
+---+----------------+--------+---------+-------------+------------------+-----------------+-------------+-----+----+------------+-------------+-----+---------+
|39 |State-gov       |77516.0 |Bachelors|13.0         |Never-married     |Adm-clerical     |Not-in-family|White|Male|40.0        |United-States|<=50K|2174.0   |
|50 |Self-emp-not-inc|83311.0 |Bachelors|13.0         |Married-civ-spouse|Exec-managerial  |Husband      |White|Male|13.0        |United-States|<=50K|0.0      |
|38 |Private         |215646.0|HS-grad  |9.0          |Divorced          |Handlers-cleaners|Not-in-family|White|Male|40.0        |United-States|<=50K|0.0      |
|53 |Private         |234721.0|11t

In [16]:
## find distinct values of education
print("Distinct values in education in the dataset are {}".format(incomeDF.select('education').distinct().count()))

Distinct values in education in the dataset are 16


In [17]:
## Pairwise frequencies
incomeDF.crosstab('sex', 'relationship').show()

+----------------+-------+-------------+--------------+---------+---------+----+
|sex_relationship|Husband|Not-in-family|Other-relative|Own-child|Unmarried|Wife|
+----------------+-------+-------------+--------------+---------+---------+----+
|            Male|  12462|         4160|           503|     2505|      749|   1|
|          Female|      1|         3566|           386|     1961|     2463|1405|
+----------------+-------+-------------+--------------+---------+---------+----+



In [18]:
## min and max fnlwgt
incomeDF.groupBy("sex").max("fnlwgt").show()
incomeDF.groupBy("sex").min("fnlwgt").show()

+------+-----------+
|   sex|max(fnlwgt)|
+------+-----------+
|Female|  1484705.0|
|  Male|  1455435.0|
+------+-----------+

+------+-----------+
|   sex|min(fnlwgt)|
+------+-----------+
|Female|    19395.0|
|  Male|    13769.0|
+------+-----------+



In [19]:
## sum of total gain
incomeDF.groupBy("sex").sum("totalgain").show()

+------+--------------+
|   sex|sum(totalgain)|
+------+--------------+
|Female|     6173572.0|
|  Male|    2.942906E7|
+------+--------------+



In [20]:
## avg fnlwgt
incomeDF.groupBy("race").avg("fnlwgt").show()

+------------------+------------------+
|              race|       avg(fnlwgt)|
+------------------+------------------+
|             Other| 195772.2380952381|
|Amer-Indian-Eskimo|123491.94405594406|
|             White| 187285.0021594108|
|Asian-Pac-Islander|          159337.4|
|             Black| 228807.4554490593|
+------------------+------------------+



In [21]:
## variance and standard deviation
from pyspark.sql.functions import var_pop, stddev_pop
from pyspark.sql.functions import var_samp, stddev_samp

incomeDF.select(var_pop("fnlwgt"), var_samp("fnlwgt"),
  stddev_pop("fnlwgt"), stddev_samp("fnlwgt")).show()

+--------------------+--------------------+------------------+-------------------+
|     var_pop(fnlwgt)|    var_samp(fnlwgt)|stddev_pop(fnlwgt)|stddev_samp(fnlwgt)|
+--------------------+--------------------+------------------+-------------------+
|1.116218030626028...|1.116255039280602...|105651.22008883893| 105652.97152851892|
+--------------------+--------------------+------------------+-------------------+



In [22]:
## skewness and kutosis
from pyspark.sql.functions import skewness, kurtosis
incomeDF.select(skewness("totalgain"), kurtosis("totalgain")).show()

+-------------------+-------------------+
|skewness(totalgain)|kurtosis(totalgain)|
+-------------------+-------------------+
| 11.876149814824785| 153.24179273163477|
+-------------------+-------------------+



In [23]:
## Covariance and correlation
from pyspark.sql.functions import corr, covar_pop, covar_samp
incomeDF.select(corr("fnlwgt", "age"), covar_samp("fnlwgt", "age"),
    covar_pop("fnlwgt", "age")).show()

+--------------------+-----------------------+----------------------+
|   corr(fnlwgt, age)|covar_samp(fnlwgt, age)|covar_pop(fnlwgt, age)|
+--------------------+-----------------------+----------------------+
|-0.07651083605759339|    -106175.33919793477|   -106171.81902887444|
+--------------------+-----------------------+----------------------+



In [24]:
## grouping with agg
incomeDF.groupBy("sex").agg(expr("avg(fnlwgt)"),expr("stddev_pop(totalgain)")).show()

+------+------------------+---------------------+
|   sex|       avg(fnlwgt)|stddev_pop(totalgain)|
+------+------------------+---------------------+
|Female|185926.15048047435|    4923.948589689943|
|  Male|191650.24607458292|    8323.749165218294|
+------+------------------+---------------------+



In [25]:
## user defined function
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

Function1 = udf(lambda x: 'less' if x <= 1000 else "high", StringType())

In [26]:
incomeDF = incomeDF.withColumn("New_udf_col",Function1(incomeDF["totalgain"]))
incomeDF.show(5)

+---+----------------+--------+---------+-------------+------------------+-----------------+-------------+-----+------+------------+-------------+-----+---------+-----------+
|age|       workclass|  fnlwgt|education|education_num|     maritalstatus|       occupation| relationship| race|   sex|hoursperweek|nativecountry|class|totalgain|New_udf_col|
+---+----------------+--------+---------+-------------+------------------+-----------------+-------------+-----+------+------------+-------------+-----+---------+-----------+
| 39|       State-gov| 77516.0|Bachelors|         13.0|     Never-married|     Adm-clerical|Not-in-family|White|  Male|        40.0|United-States|<=50K|   2174.0|       high|
| 50|Self-emp-not-inc| 83311.0|Bachelors|         13.0|Married-civ-spouse|  Exec-managerial|      Husband|White|  Male|        13.0|United-States|<=50K|      0.0|       less|
| 38|         Private|215646.0|  HS-grad|          9.0|          Divorced|Handlers-cleaners|Not-in-family|White|  Male|      

In [27]:
incomeDF.show(5)

+---+----------------+--------+---------+-------------+------------------+-----------------+-------------+-----+------+------------+-------------+-----+---------+-----------+
|age|       workclass|  fnlwgt|education|education_num|     maritalstatus|       occupation| relationship| race|   sex|hoursperweek|nativecountry|class|totalgain|New_udf_col|
+---+----------------+--------+---------+-------------+------------------+-----------------+-------------+-----+------+------------+-------------+-----+---------+-----------+
| 39|       State-gov| 77516.0|Bachelors|         13.0|     Never-married|     Adm-clerical|Not-in-family|White|  Male|        40.0|United-States|<=50K|   2174.0|       high|
| 50|Self-emp-not-inc| 83311.0|Bachelors|         13.0|Married-civ-spouse|  Exec-managerial|      Husband|White|  Male|        13.0|United-States|<=50K|      0.0|       less|
| 38|         Private|215646.0|  HS-grad|          9.0|          Divorced|Handlers-cleaners|Not-in-family|White|  Male|      

In [29]:
## Multiple ways of referring a column in a dataframe
from pyspark.sql.functions import expr, col, column

incomeDF.select(expr("education_num AS education") , col("education_num"), column("education_num"), "education_num").show(2)

+---------+-------------+-------------+-------------+
|education|education_num|education_num|education_num|
+---------+-------------+-------------+-------------+
|     13.0|         13.0|         13.0|         13.0|
|     13.0|         13.0|         13.0|         13.0|
+---------+-------------+-------------+-------------+
only showing top 2 rows



In [30]:
incomeDF.selectExpr("education_num AS education_num", "totalgain AS totalgain").show(2)

+-------------+---------+
|education_num|totalgain|
+-------------+---------+
|         13.0|   2174.0|
|         13.0|      0.0|
+-------------+---------+
only showing top 2 rows



In [31]:
##renaming columns
incomeDF = incomeDF.withColumnRenamed("education_num", "education").show(2)

+---+----------------+-------+---------+---------+------------------+---------------+-------------+-----+----+------------+-------------+-----+---------+-----------+
|age|       workclass| fnlwgt|education|education|     maritalstatus|     occupation| relationship| race| sex|hoursperweek|nativecountry|class|totalgain|New_udf_col|
+---+----------------+-------+---------+---------+------------------+---------------+-------------+-----+----+------------+-------------+-----+---------+-----------+
| 39|       State-gov|77516.0|Bachelors|     13.0|     Never-married|   Adm-clerical|Not-in-family|White|Male|        40.0|United-States|<=50K|   2174.0|       high|
| 50|Self-emp-not-inc|83311.0|Bachelors|     13.0|Married-civ-spouse|Exec-managerial|      Husband|White|Male|        13.0|United-States|<=50K|      0.0|       less|
+---+----------------+-------+---------+---------+------------------+---------------+-------------+-----+----+------------+-------------+-----+---------+-----------+
only

In [None]:
## differences in columns
## From the above we can see the train file has more categories than test file. 
## Let us check what are the categories for Product_ID, which are in test file but not in train file by 
## applying subtract operation.
## We can do the same for all categorical features.
diff_cat_in_test_train=testDF.select('Product_ID').subtract(trainDF.select('Product_ID'))
print("Count of Product_ID's there in test dataset but not train dataset are {}".format(diff_cat_in_test_train.count()))

diff_cat_in_train_test=trainDF.select('Product_ID').subtract(testDF.select('Product_ID'))
print("Count of Product_ID's there in train dataset but not test dataset are {}".format(diff_cat_in_train_test.count()))

In [37]:
incomeDF.show(2)

+---+----------------+-------+---------+-------------+------------------+---------------+-------------+-----+----+-----------+-----------+------------+-------------+-----+
|age|       workclass| fnlwgt|education|education_num|     maritalstatus|     occupation| relationship| race| sex|capitalgain|capitalloss|hoursperweek|nativecountry|class|
+---+----------------+-------+---------+-------------+------------------+---------------+-------------+-----+----+-----------+-----------+------------+-------------+-----+
| 39|       State-gov|77516.0|Bachelors|         13.0|     Never-married|   Adm-clerical|Not-in-family|White|Male|     2174.0|        0.0|        40.0|United-States|<=50K|
| 50|Self-emp-not-inc|83311.0|Bachelors|         13.0|Married-civ-spouse|Exec-managerial|      Husband|White|Male|        0.0|        0.0|        13.0|United-States|<=50K|
+---+----------------+-------+---------+-------------+------------------+---------------+-------------+-----+----+-----------+-----------+--

In [42]:
## Filling with different values for different columns
## imputation
fill_cols_vals = {
"sex": 'Male',
"hoursperweek" : 999 ,
"workclass" : 'Unknown' ,
"occupation" : 'Unknown' ,
"nativecountry" : 'Unknown'
}
#incomeDF.na.fill(fill_cols_vals).count()
incomeimputedDF = incomeDF.na.fill(fill_cols_vals)

In [44]:
## verify
incomeimputedDF.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in incomeimputedDF.columns]).show(truncate=False)

+---+---------+------+---------+-------------+-------------+----------+------------+----+---+-----------+-----------+------------+-------------+-----+
|age|workclass|fnlwgt|education|education_num|maritalstatus|occupation|relationship|race|sex|capitalgain|capitalloss|hoursperweek|nativecountry|class|
+---+---------+------+---------+-------------+-------------+----------+------------+----+---+-----------+-----------+------------+-------------+-----+
|0  |0        |0     |0        |0            |0            |0         |0           |0   |0  |0          |0          |0           |0            |0    |
+---+---------+------+---------+-------------+-------------+----------+------------+----+---+-----------+-----------+------------+-------------+-----+



In [47]:
## filtering rows
incomeDF.where("fnlwgt > 0").where("sex = 'Female'").count()
incomeDF.filter("fnlwgt > 0").where("sex = 'Male'").count()
incomeDF.where((col("fnlwgt") > 0) & (col("sex") == 'Female')).count()
incomeDF.filter((col("fnlwgt") > 0) & (col("sex") == 'Male')).count()

21790

In [48]:
## first and last
incomeDF.select(first("maritalstatus"), last("relationship")).show()

+---------------------------+-------------------------+
|first(maritalstatus, false)|last(relationship, false)|
+---------------------------+-------------------------+
|              Never-married|                     Wife|
+---------------------------+-------------------------+



In [49]:
## min and max
incomeDF.select(min("education_num"), max("hoursperweek")).show()

+------------------+-----------------+
|min(education_num)|max(hoursperweek)|
+------------------+-----------------+
|               1.0|             99.0|
+------------------+-----------------+



In [50]:
## sum distinct
incomeDF.select(sumDistinct("fnlwgt")).show()

+--------------------+
|sum(DISTINCT fnlwgt)|
+--------------------+
|       4.254454715E9|
+--------------------+



In [51]:
## average
incomeDF.select(
    count("education_num").alias("total_yrs_of_edu"),
    sum("education_num").alias("sum_yrs_of_edu"),
    avg("education_num").alias("avg_yrs_of_edu"),
    expr("mean(education_num)").alias("mean_yrs_of_edu"))\
  .selectExpr(
    "sum_yrs_of_edu/total_yrs_of_edu",
    "avg_yrs_of_edu",
    "mean_yrs_of_edu").show()

+-----------------------------------+----------------+----------------+
|(sum_yrs_of_edu / total_yrs_of_edu)|  avg_yrs_of_edu| mean_yrs_of_edu|
+-----------------------------------+----------------+----------------+
|                   10.0806793403151|10.0806793403151|10.0806793403151|
+-----------------------------------+----------------+----------------+



In [52]:
## complex aggregations
incomeDF.agg(collect_set("hoursperweek"), collect_list("hoursperweek")).show()

+-------------------------+--------------------------+
|collect_set(hoursperweek)|collect_list(hoursperweek)|
+-------------------------+--------------------------+
|     [98.0, 85.0, 43.0...|      [40.0, 13.0, 40.0...|
+-------------------------+--------------------------+



In [53]:
## grouping with agg expressions and maps
incomeDF.groupBy("education").agg(
  count("hoursperweek").alias("hours"),
  expr("count(hoursperweek)")).show()

+------------+-----+-------------------+
|   education|hours|count(hoursperweek)|
+------------+-----+-------------------+
|     Masters| 1723|               1723|
|        10th|  933|                933|
|     5th-6th|  333|                333|
|  Assoc-acdm| 1067|               1067|
|   Assoc-voc| 1382|               1382|
|     7th-8th|  646|                646|
|         9th|  514|                514|
|     HS-grad|10501|              10501|
|   Bachelors| 5355|               5355|
|        11th| 1175|               1175|
|     1st-4th|  168|                168|
|   Preschool|   51|                 51|
|        12th|  433|                433|
|   Doctorate|  413|                413|
|Some-college| 7291|               7291|
| Prof-school|  576|                576|
+------------+-----+-------------------+



In [54]:
## grouping with maps
incomeDF.groupBy("maritalstatus").agg(expr("avg(fnlwgt)"),expr("stddev_pop(fnlwgt)")).show()

+--------------------+------------------+------------------+
|       maritalstatus|       avg(fnlwgt)|stddev_pop(fnlwgt)|
+--------------------+------------------+------------------+
|           Separated|206444.32585365855|117543.94573012274|
|       Never-married| 195383.3089956005|107760.31075328245|
|Married-spouse-ab...|193141.76076555025|114800.03124474503|
|            Divorced|185802.79045689848|105634.89966249929|
|             Widowed| 174159.0422960725| 93958.03336565649|
|   Married-AF-spouse| 188165.5652173913|136794.38604035907|
|  Married-civ-spouse|186763.17287660256|103094.10086160518|
+--------------------+------------------+------------------+



In [55]:
## To find the mean sum  of each age group in train dataset - Average purchases in each age group
incomeDF.groupby('maritalstatus').agg({'fnlwgt': 'mean'}).show()
incomeDF.groupby('maritalstatus').agg({'fnlwgt': 'sum'}).show()


+--------------------+------------------+
|       maritalstatus|       avg(fnlwgt)|
+--------------------+------------------+
|           Separated|206444.32585365855|
|       Never-married| 195383.3089956005|
|Married-spouse-ab...|193141.76076555025|
|            Divorced|185802.79045689848|
|             Widowed| 174159.0422960725|
|   Married-AF-spouse| 188165.5652173913|
|  Married-civ-spouse|186763.17287660256|
+--------------------+------------------+

+--------------------+-------------+
|       maritalstatus|  sum(fnlwgt)|
+--------------------+-------------+
|           Separated| 2.11605434E8|
|       Never-married| 2.08727989E9|
|Married-spouse-ab...|  8.0733256E7|
|            Divorced| 8.25521798E8|
|             Widowed| 1.72939929E8|
|   Married-AF-spouse|    4327808.0|
|  Married-civ-spouse|2.796965277E9|
+--------------------+-------------+



In [56]:
## Apply sum, min, max for colummns count with groupby to get different summary insight for each group. 
exprs = {x: "sum" for x in incomeDF.columns}
incomeDF.groupBy("maritalstatus").agg(exprs).show()

+--------------------+------------------+------------------+----------------+------------------+-----------------+--------+--------+-----------------+--------------+---------+----------------+-------------+----------+--------------+---------------+
|       maritalstatus|sum(maritalstatus)|sum(nativecountry)|sum(capitalgain)|sum(education_num)|sum(hoursperweek)|sum(age)|sum(sex)|sum(relationship)|sum(education)|sum(race)|sum(capitalloss)|  sum(fnlwgt)|sum(class)|sum(workclass)|sum(occupation)|
+--------------------+------------------+------------------+----------------+------------------+-----------------+--------+--------+-----------------+--------------+---------+----------------+-------------+----------+--------------+---------------+
|           Separated|              null|              null|        548958.0|            9526.0|          40284.0|   40338|    null|             null|          null|     null|         59563.0| 2.11605434E8|      null|          null|           null|
|   

In [None]:
## joins
joinType = "inner"
person.join(graduateProgram, joinExpression, joinType).show()
joinType = "outer"
person.join(graduateProgram, joinExpression, joinType).show()
joinType = "left_outer"
graduateProgram.join(person, joinExpression, joinType).show()
joinType = "right_outer"
person.join(graduateProgram, joinExpression, joinType).show()
person.crossJoin(graduateProgram).show()
person.withColumnRenamed("id", "personId")\
  .join(roleStatus, expr("array_contains(role_status, id)")).show()

In [58]:
## sorting 
incomeDF.orderBy(incomeDF.age.desc()).show(5)

+---+---------+--------+------------+-------------+-------------+-----------------+--------------+------------------+------+-----------+-----------+------------+-------------+-----+
|age|workclass|  fnlwgt|   education|education_num|maritalstatus|       occupation|  relationship|              race|   sex|capitalgain|capitalloss|hoursperweek|nativecountry|class|
+---+---------+--------+------------+-------------+-------------+-----------------+--------------+------------------+------+-----------+-----------+------------+-------------+-----+
| 90|  Private|313986.0|        11th|          7.0|Never-married|Handlers-cleaners|     Own-child|             White|  Male|        0.0|        0.0|        40.0|United-States|<=50K|
| 90|  Private| 51744.0|     Masters|         14.0|Never-married|  Exec-managerial| Not-in-family|             Black|  Male|        0.0|        0.0|        50.0|United-States| >50K|
| 90|     null|256514.0|   Bachelors|         13.0|      Widowed|             null|Other-r

In [59]:
## get partitions -- repartition and coalesce
incomeDF.rdd.getNumPartitions()
## Do the repartition
## trainDF.repartition(5)

## Repartition based on a column
## If we know we are going to be filtering by a certain column often, 
## it can be worth repartitioning based on that column.
## incomeDF.repartition(col(“hoursperweek”))

## We can optionally specify the number of partitions we would like too.
## incomeDF.repartition(5, col(“hoursperweek”))

## Coalesce on the other hand will not incur a full shuffle and will try to combine partitions. 
## This operation will shuffle our data into 5 partitions based on the Purchase, 
## then coalesce them (without a full shuffle).
## incomeDF.repartition(5, col("hoursperweek")).coalesce(2)

1

In [None]:
## union
df1 = df1.union(df2)

In [63]:
## derive calculated column
incomeDF.withColumn('fnlwgt_new', incomeDF.fnlwgt /2.0).select('fnlwgt','fnlwgt_new').show(5)

+--------+----------+
|  fnlwgt|fnlwgt_new|
+--------+----------+
| 77516.0|   38758.0|
| 83311.0|   41655.5|
|215646.0|  107823.0|
|234721.0|  117360.5|
|338409.0|  169204.5|
+--------+----------+
only showing top 5 rows



In [None]:
## To remove some categories of Product_ID column in test that are not present in Product_ID column in train
## Use an user defined function ( udf ) to remove the categories of a column which are in test but not in train.
## Calculate the categories in Product_ID column which are in test but not in train.
diff_cat_in_train_test=testDF.select('Product_ID').subtract(trainDF.select('Product_ID'))
diff_cat_in_train_test.count() # For distict count

In [None]:
## There are 46 different categories in test. 
## To remove these categories from the test ‘Product_ID’ column.

## Create the distinct list of categories called ‘not_found_cat’ from the diff_cat_in_train_test using map operation.
## Register a udf(user define function).
## User defined function will take each element of test column and search this in not_found_cat list and 
## it will put -1 ifit finds in this list otherwise it will do nothing.
not_found_cat = diff_cat_in_train_test.rdd.map(lambda x: x[0]).collect()
print(len(not_found_cat))
print(type(not_found_cat))
print(not_found_cat)

In [66]:
## ceiling and floor value for round
incomeDF.select(round("fnlwgt", 2), bround("fnlwgt",2)).show(2)

+----------------+-----------------+
|round(fnlwgt, 2)|bround(fnlwgt, 2)|
+----------------+-----------------+
|         77516.0|          77516.0|
|         83311.0|          83311.0|
+----------------+-----------------+
only showing top 2 rows



In [68]:
## sort frequent items
incomeDF.stat.freqItems(["maritalstatus"]).show(truncate = False)

+-----------------------------------------------------------------------------------------------------------+
|maritalstatus_freqItems                                                                                    |
+-----------------------------------------------------------------------------------------------------------+
|[Married-spouse-absent, Never-married, Married-civ-spouse, Widowed, Divorced, Separated, Married-AF-spouse]|
+-----------------------------------------------------------------------------------------------------------+



In [69]:
## regular expression
regex_string = "F|M"

incomeDF.select(
regexp_replace(col("sex"), regex_string, "MALE_OR_FEMALE")
.alias("sex_ENCODE"),
col("sex"))\
.show(2)

+-----------------+----+
|       sex_ENCODE| sex|
+-----------------+----+
|MALE_OR_FEMALEale|Male|
|MALE_OR_FEMALEale|Male|
+-----------------+----+
only showing top 2 rows



In [70]:
## date manipulation
dateDF = spark.range(10)\
.withColumn("today", current_date())\
.withColumn("now", current_timestamp())
dateDF.show()

+---+----------+--------------------+
| id|     today|                 now|
+---+----------+--------------------+
|  0|2018-08-17|2018-08-17 21:03:...|
|  1|2018-08-17|2018-08-17 21:03:...|
|  2|2018-08-17|2018-08-17 21:03:...|
|  3|2018-08-17|2018-08-17 21:03:...|
|  4|2018-08-17|2018-08-17 21:03:...|
|  5|2018-08-17|2018-08-17 21:03:...|
|  6|2018-08-17|2018-08-17 21:03:...|
|  7|2018-08-17|2018-08-17 21:03:...|
|  8|2018-08-17|2018-08-17 21:03:...|
|  9|2018-08-17|2018-08-17 21:03:...|
+---+----------+--------------------+



In [71]:
## date part
dateDF.select(date_sub(col("today"), 5),date_add(col("today"), 5)).show(1)

+------------------+------------------+
|date_sub(today, 5)|date_add(today, 5)|
+------------------+------------------+
|        2018-08-12|        2018-08-22|
+------------------+------------------+
only showing top 1 row



In [73]:
## date addition and subtraction
dateDF\
.withColumn("week_ago", date_sub(col("today"), 7))\
.select("today",datediff(col("week_ago"), col("today")))\
.show(1)

+----------+-------------------------+
|     today|datediff(week_ago, today)|
+----------+-------------------------+
|2018-08-17|                       -7|
+----------+-------------------------+
only showing top 1 row



In [74]:
## months between
dateDF\
.select(
to_date(lit("2017-01-01")).alias("start"),
to_date(lit("2018-02-18")).alias("end"))\
.select(months_between(col("start"), col("end")))\
.show(1)

+--------------------------+
|months_between(start, end)|
+--------------------------+
|               -13.5483871|
+--------------------------+
only showing top 1 row



In [75]:
## udf power ** 3
udfExampleDF = spark.range(5).toDF("num")

def power3(double_value):
    return double_value ** 3

power3(2.0)

8.0

In [78]:
## parallelize
from pyspark.sql import Row
myList = [('Alpha',25),('Beta',22),('Charlie',20),('Delta',22), ('Echo',21),('France',22),('Gamma',23)]
rdd = sc.parallelize(myList)
people = rdd.map(lambda x: Row(name=x[0], age=int(x[1])))
people.collect()

[Row(age=25, name='Alpha'),
 Row(age=22, name='Beta'),
 Row(age=20, name='Charlie'),
 Row(age=22, name='Delta'),
 Row(age=21, name='Echo'),
 Row(age=22, name='France'),
 Row(age=23, name='Gamma')]

In [79]:
## broadcast variables
my_collection = "Postgraduate Program in Big Data Analytics and Optimization"\
  .split(" ")
    
words = spark.sparkContext.parallelize(my_collection)
supplementalData = {"Postgraduate":1000, "Analytics":200, "Optimization": 400,
                    "Big":-300, "Data": 100, "Program":100}
suppBroadcast = spark.sparkContext.broadcast(supplementalData)
suppBroadcast.value

{'Analytics': 200,
 'Big': -300,
 'Data': 100,
 'Optimization': 400,
 'Postgraduate': 1000,
 'Program': 100}

In [80]:
words.map(lambda word: (word, suppBroadcast.value.get(word, 0)))\
  .sortBy(lambda wordPair: wordPair[1])\
  .collect()

[('Big', -300),
 ('in', 0),
 ('and', 0),
 ('Program', 100),
 ('Data', 100),
 ('Analytics', 200),
 ('Optimization', 400),
 ('Postgraduate', 1000)]

In [None]:
## accumulator
accIND = spark.sparkContext.accumulator(0)

In [81]:
def accINDFunc(each_row):
    countryCD = each_row["NationCode"]
    list_ctrys = ["IND", "SRI", "PAK", "BAN"]
    if countryCD in list_ctrys:
        accIND.add(each_row["Total"])

In [83]:
## call the func
#cwgDF.foreach(lambda each_row: accINDFunc(each_row))
# check the value
accIND