# Linear regression on the tips dataset with PySpark

# Importing PySpark

In [1]:
import pyspark

In [None]:
from pyspark.sql import SparkSession

# Creating a Spark session

In [4]:
# Configure the application name first, then obtain the session
# (getOrCreate reuses an already-running session if one exists).
session_builder = SparkSession.builder.appName('linear_regression')
spark = session_builder.getOrCreate()

In [5]:
# Bare expression: the notebook renders the SparkSession object as this
# cell's output, which confirms the session was created.
spark

# Reading the dataframe

In [6]:
# Load the tips CSV: the first line is the header, and column types are
# inferred from the data instead of defaulting to strings.
csv_reader = spark.read
df = csv_reader.csv('./tips.csv', header=True, inferSchema=True)

# Preview the loaded rows.
df.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [7]:
# Print each column's name, inferred type and nullability to check that
# inferSchema picked numeric types for the numeric columns.
df.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: integer (nullable = true)



In [8]:
# List the column names of the loaded DataFrame.
df.columns

['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']

# Encoding categorical features using StringIndexer

In [9]:
from pyspark.ml.feature import StringIndexer

# String-valued columns to encode, and the numeric index column produced for each.
categorical_cols = ['sex', 'smoker', 'day', 'time']
indexed_cols = ['sex_ind', 'smoker_ind', 'day_ind', 'time_ind']

# handleInvalid='skip' drops rows whose value cannot be indexed instead of raising.
indexer = StringIndexer(inputCols=categorical_cols,
                        outputCols=indexed_cols,
                        handleInvalid='skip')
indexer_model = indexer.fit(df)

# NOTE: despite its name, `transformer` is the resulting DataFrame (the original
# columns plus the *_ind columns), not the StringIndexer itself.
transformer = indexer_model.transform(df)
transformer.show()

+----------+----+------+------+---+------+----+-------+----------+-------+--------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_ind|smoker_ind|day_ind|time_ind|
+----------+----+------+------+---+------+----+-------+----------+-------+--------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|    1.0|       0.0|    1.0|     0.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|    0.0|       0.0|    1.0|     0.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|    0.0|       0.0|    1.0|     0.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|    0.0|       0.0|    1.0|     0.0|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|    1.0|       0.0|    1.0|     0.0|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|    0.0|       0.0|    1.0|     0.0|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|    0.0|       0.0|    1.0|     0.0|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|    0.0|       0.0|    1.0|     0.0|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|    0.0|       0.0|    1.0|  

In [10]:
# Column names after indexing: the original columns plus the four *_ind columns.
transformer.columns

['total_bill',
 'tip',
 'sex',
 'smoker',
 'day',
 'time',
 'size',
 'sex_ind',
 'smoker_ind',
 'day_ind',
 'time_ind']

# Creating a column of independent features

In [11]:
from pyspark.ml.feature import VectorAssembler

# Numeric inputs packed into the feature vector, in this order.
feature_cols = ['total_bill', 'sex_ind', 'smoker_ind', 'day_ind', 'time_ind', 'size']
vector_assembler = VectorAssembler(inputCols=feature_cols, outputCol='indep')

# NOTE: despite its name, `assembler` is the assembled DataFrame (all previous
# columns plus the 'indep' vector column), not the VectorAssembler itself.
assembler = vector_assembler.transform(transformer)
assembler.show()

+----------+----+------+------+---+------+----+-------+----------+-------+--------+--------------------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_ind|smoker_ind|day_ind|time_ind|               indep|
+----------+----+------+------+---+------+----+-------+----------+-------+--------+--------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|    1.0|       0.0|    1.0|     0.0|[16.99,1.0,0.0,1....|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|    0.0|       0.0|    1.0|     0.0|[10.34,0.0,0.0,1....|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|    0.0|       0.0|    1.0|     0.0|[21.01,0.0,0.0,1....|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|    0.0|       0.0|    1.0|     0.0|[23.68,0.0,0.0,1....|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|    1.0|       0.0|    1.0|     0.0|[24.59,1.0,0.0,1....|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|    0.0|       0.0|    1.0|     0.0|[25.29,0.0,0.0,1....|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|    0.0|

In [14]:
# Keep only what the regression needs: the label ('tip') and the feature vector ('indep').
model_cols = ('tip', 'indep')
df_final = assembler.select(*model_cols)
df_final.show()

+----+--------------------+
| tip|               indep|
+----+--------------------+
|1.01|[16.99,1.0,0.0,1....|
|1.66|[10.34,0.0,0.0,1....|
| 3.5|[21.01,0.0,0.0,1....|
|3.31|[23.68,0.0,0.0,1....|
|3.61|[24.59,1.0,0.0,1....|
|4.71|[25.29,0.0,0.0,1....|
| 2.0|[8.77,0.0,0.0,1.0...|
|3.12|[26.88,0.0,0.0,1....|
|1.96|[15.04,0.0,0.0,1....|
|3.23|[14.78,0.0,0.0,1....|
|1.71|[10.27,0.0,0.0,1....|
| 5.0|[35.26,1.0,0.0,1....|
|1.57|[15.42,0.0,0.0,1....|
| 3.0|[18.43,0.0,0.0,1....|
|3.02|[14.83,1.0,0.0,1....|
|3.92|[21.58,0.0,0.0,1....|
|1.67|[10.33,1.0,0.0,1....|
|3.71|[16.29,0.0,0.0,1....|
| 3.5|[16.97,1.0,0.0,1....|
|3.35|(6,[0,5],[20.65,3...|
+----+--------------------+
only showing top 20 rows



# Performing linear regression

In [16]:
from pyspark.ml.regression import LinearRegression

# Fix the split seed so the train/test partition, and therefore the fitted
# coefficients and every metric reported afterwards, is the same on each
# Restart & Run All instead of changing from run to run.
SPLIT_SEED = 42

# 75% of the rows go to training, 25% are held out for evaluation.
train_data, test_data = df_final.randomSplit([0.75, 0.25], seed = SPLIT_SEED)

# Fit a linear regression with default hyperparameters: 'indep' is the
# assembled feature vector and 'tip' is the target.
model = LinearRegression(featuresCol = 'indep', labelCol = 'tip').fit(train_data)

In [17]:
# Fitted weights, one per entry of the 'indep' vector, in the VectorAssembler
# input order: total_bill, sex_ind, smoker_ind, day_ind, time_ind, size.
model.coefficients

DenseVector([0.0886, 0.023, 0.0283, 0.0375, -0.0372, 0.1959])

In [18]:
# Fitted intercept (bias term) of the linear model.
model.intercept

0.6842383838747881

In [19]:
# Evaluate the fitted model on the held-out split. Despite its name, `preds`
# is an evaluation summary object: the predictions DataFrame is available as
# `preds.predictions`, alongside metrics such as meanAbsoluteError and r2.
preds = model.evaluate(test_data)

In [21]:
# Show the held-out rows with the model's 'prediction' column next to the true 'tip'.
preds.predictions.show()

+----+--------------------+------------------+
| tip|               indep|        prediction|
+----+--------------------+------------------+
| 1.0|[3.07,1.0,1.0,0.0...|1.2035825967683045|
|1.25|(6,[0,5],[10.51,2...|2.0077429634378663|
|1.32|[9.68,0.0,0.0,1.0...|1.9717075462497455|
|1.44|[7.74,0.0,1.0,0.0...| 1.790482506375319|
|1.45|(6,[0,5],[9.55,2.0])|1.9226463696854919|
|1.48|[8.52,0.0,0.0,2.0...| 1.869176429111954|
| 1.5|[10.65,1.0,0.0,2....| 2.080981139856706|
| 1.5|[12.03,0.0,1.0,3....|2.2833709481871014|
|1.63|[11.87,1.0,0.0,2....| 2.189124727750349|
|1.67|[10.33,1.0,0.0,1....|2.2482587562320346|
|1.75|(6,[0,5],[17.82,2...| 2.655718067948135|
| 1.8|[12.43,1.0,0.0,2....| 2.238764407439234|
| 2.0|(6,[0,5],[16.31,3...|2.7178052986309096|
| 2.0|[10.63,1.0,1.0,0....| 2.069655353840701|
| 2.0|[12.26,1.0,0.0,2....| 2.223695218962251|
| 2.0|[13.81,0.0,0.0,1....|2.3378001839552733|
| 2.0|[16.0,0.0,1.0,2.0...| 2.560499394926322|
| 2.0|[16.21,1.0,0.0,1....| 2.769475392965329|
| 2.0|[30.06,

In [22]:
# Mean absolute error on the held-out test split (same units as 'tip').
preds.meanAbsoluteError

0.798571485237905

In [23]:
# Mean squared error on the held-out test split.
preds.meanSquaredError

1.1075997917294056

In [26]:
# R^2 on the TRAINING data: `model.summary` is the summary produced during fit,
# so compare it against `preds.r2` (test split) rather than reading it alone.
model.summary.r2

0.43496314308473716

In [27]:
# R^2 on the held-out TEST split, taken from the evaluate() summary.
preds.r2

0.5462250311434271