## Libraries

In [1]:
from pyspark.sql import SparkSession

In [3]:
# Start (or reuse an already running) Spark session for this notebook.
spark = (
    SparkSession.builder
    .appName('Ml_8')
    .getOrCreate()
)

## Read dataset

In [4]:
# Load the tips dataset; the first row holds column names and column types are inferred.
df = spark.read.options(header=True, inferSchema=True).csv('tips.csv')

In [5]:
# Peek at the first five rows of the raw data.
df.show(n=5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [6]:
# Print the inferred column types to see which columns are strings (categorical).
df.printSchema()

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: integer (nullable = true)



## Handling categorical features

In [7]:
from pyspark.ml.feature import StringIndexer

In [9]:
# Map each string category to a numeric index; input and output columns pair up by position.
indexer = StringIndexer(
    inputCols=['sex', 'smoker', 'time'],
    outputCols=['sex_num', 'smoker_num', 'time_num'],
)

In [10]:
# Learn the label -> index mapping from the data, then append the *_num columns.
indexer_model = indexer.fit(df)
df_r = indexer_model.transform(df)

In [11]:
# Show the indexed frame (20 rows, the default for show()).
df_r.show(n=20)

+----------+----+------+------+---+------+----+-------+----------+--------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_num|smoker_num|time_num|
+----------+----+------+------+---+------+----+-------+----------+--------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|    1.0|       0.0|     0.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|    0.0|       0.0|     0.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|    0.0|       0.0|     0.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|    0.0|       0.0|     0.0|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|    1.0|       0.0|     0.0|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|    0.0|       0.0|     0.0|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|    0.0|       0.0|     0.0|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|    0.0|       0.0|     0.0|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|    0.0|       0.0|     0.0|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|    0.0|       0.0|     0.0|
|     10.27|

## Select features and assemble them into a single vector (VectorAssembler)

In [13]:
# List the available columns, including the new *_num index columns.
df_r.columns

['total_bill',
 'tip',
 'sex',
 'smoker',
 'day',
 'time',
 'size',
 'sex_num',
 'smoker_num',
 'time_num']

In [12]:
from pyspark.ml.feature import VectorAssembler

In [14]:
# Pack the predictors into one vector column, which Spark ML estimators expect.
# Note: 'day' was not indexed above, so it is not part of the feature vector.
vector_a = VectorAssembler(
    inputCols=['tip', 'size', 'sex_num', 'smoker_num', 'time_num'],
    outputCol='independet_fetures',
)
output = vector_a.transform(df_r)

In [15]:
# Inspect the assembled vectors; rows with many zeros are displayed in sparse form.
output.show(n=5)

+----------+----+------+------+---+------+----+-------+----------+--------+--------------------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_num|smoker_num|time_num|  independet_fetures|
+----------+----+------+------+---+------+----+-------+----------+--------+--------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|    1.0|       0.0|     0.0|[1.01,2.0,1.0,0.0...|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|    0.0|       0.0|     0.0|(5,[0,1],[1.66,3.0])|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|    0.0|       0.0|     0.0| (5,[0,1],[3.5,3.0])|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|    0.0|       0.0|     0.0|(5,[0,1],[3.31,2.0])|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|    1.0|       0.0|     0.0|[3.61,4.0,1.0,0.0...|
+----------+----+------+------+---+------+----+-------+----------+--------+--------------------+
only showing top 5 rows



In [18]:
# Keep only what the model needs: the feature vector and the label (total_bill).
finalized_data = output.select(['independet_fetures', 'total_bill'])

In [19]:
# Sanity-check the two-column modelling frame.
finalized_data.show(n=5)

+--------------------+----------+
|  independet_fetures|total_bill|
+--------------------+----------+
|[1.01,2.0,1.0,0.0...|     16.99|
|(5,[0,1],[1.66,3.0])|     10.34|
| (5,[0,1],[3.5,3.0])|     21.01|
|(5,[0,1],[3.31,2.0])|     23.68|
|[3.61,4.0,1.0,0.0...|     24.59|
+--------------------+----------+
only showing top 5 rows



## Train and test split

In [21]:
# 70/30 train/test split. randomSplit is random per run unless a seed is given,
# so fix the seed to keep the split -- and every coefficient and metric computed
# from it below -- reproducible under Restart & Run All.
train_data, test_data = finalized_data.randomSplit([0.70, 0.30], seed=42)

## Build Model

In [20]:
from pyspark.ml.regression import LinearRegression

In [22]:
# `model` is the unfitted estimator; `regressor` is the model fitted on the training split.
model = LinearRegression(
    featuresCol='independet_fetures',
    labelCol='total_bill',
)
regressor = model.fit(train_data)

In [23]:
# One coefficient per assembled feature, in inputCols order:
# tip, size, sex_num, smoker_num, time_num.
regressor.coefficients

DenseVector([3.1518, 3.6716, -1.2671, 2.1572, -1.5825])

In [24]:
# Intercept term of the fitted linear model.
regressor.intercept

0.8231175873348526

### Predictions

In [25]:
# Score the held-out test split; the returned summary exposes the predictions
# and the error metrics read in the cells below.
pred_results = regressor.evaluate(test_data)

In [26]:
# Compare actual total_bill with the model's prediction (20 rows, the show() default).
pred_results.predictions.show(n=20)

+--------------------+----------+------------------+
|  independet_fetures|total_bill|        prediction|
+--------------------+----------+------------------+
|(5,[0,1],[1.25,2.0])|     10.51|12.105944001464891|
|(5,[0,1],[1.32,2.0])|      9.68| 12.32656737645623|
|(5,[0,1],[1.47,2.0])|     10.77|12.799331751437672|
| (5,[0,1],[1.5,2.0])|     12.46|12.893884626433959|
|(5,[0,1],[1.56,2.0])|      9.94|13.082990376426537|
|(5,[0,1],[1.66,3.0])|     10.34| 17.06972827105651|
|(5,[0,1],[1.75,2.0])|     17.82| 13.68182525140303|
| (5,[0,1],[2.0,2.0])|      8.77|  14.4697658763721|
| (5,[0,1],[2.0,4.0])|     19.77|21.812889165656788|
| (5,[0,1],[2.0,4.0])|     24.55|21.812889165656788|
|(5,[0,1],[2.34,4.0])|     17.81|22.884488415614722|
| (5,[0,1],[2.5,2.0])|     14.07| 16.04564712631024|
| (5,[0,1],[3.0,2.0])|     18.04|17.621528376248378|
|(5,[0,1],[3.15,3.0])|     20.08|21.765854395872164|
|(5,[0,1],[3.48,3.0])|     24.52|22.805936020831336|
|(5,[0,1],[3.51,2.0])|     19.49|19.2289272511

### Evaluation scores

In [27]:
# R^2 on the test split.
pred_results.r2

0.4439171497021679

In [28]:
# Mean absolute error on the test split, in the same units as total_bill.
pred_results.meanAbsoluteError

4.287457542312645

In [29]:
# Mean squared error on the test split, in squared total_bill units.
pred_results.meanSquaredError

35.75276206852547