In [1]:
# Spark init
!wget -q https://dlcdn.apache.org/spark/spark-3.3.2/spark-3.3.2-bin-hadoop2.tgz
!tar xf spark-3.3.2-bin-hadoop2.tgz
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/default-java"
os.environ["SPARK_HOME"] = "/content/spark-3.3.2-bin-hadoop2"
!pip install -q findspark
import findspark
findspark.init()

In [2]:
from google.colab import drive

# Mount Google Drive so files under MyDrive are reachable from /content/drive.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [3]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('colab')
    .getOrCreate()
)

In [4]:
from pyspark.sql.types import *

In [5]:
# Read the tips CSV from the mounted Drive; the first row is the header and
# column types are inferred from the data.
tips_csv_path = '/content/drive/MyDrive/data/tips.csv'
tips_df = spark.read.csv(tips_csv_path, header=True, inferSchema=True)

In [6]:
# Preview the loaded rows (same as the defaults: first 20 rows, long values truncated).
tips_df.show(n=20, truncate=True)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [8]:
import matplotlib.pyplot as mlt

In [6]:
import plotly.express as px

In [7]:
# Create a pandas DataFrame copy of the Spark DataFrame so it can be passed to
# the plotting calls below.
ptips_df = tips_df.toPandas()

In [9]:
# Tip against total bill: marker size follows party size, colour follows day,
# with an OLS trendline drawn per colour group.
figure = px.scatter(
    ptips_df,
    x="total_bill",
    y="tip",
    size="size",
    color="day",
    trendline='ols',
)
figure.show()

In [10]:
# Data check: list the distinct values in the `sex` column to spot
# unexpected or inconsistently spelled categories.
gender = tips_df.select(tips_df["sex"]).distinct()
gender.show()

+------+
|   sex|
+------+
|Female|
|  Male|
+------+



In [11]:
# Distinct values in the `smoker` column: only two categories appear, Yes and No.
smokers = tips_df.select(tips_df["smoker"]).distinct()
smokers.show()

+------+
|smoker|
+------+
|    No|
|   Yes|
+------+



In [12]:
# Distinct values in the `day` column: only four days appear in the data
# (Thursday, Friday, Saturday and Sunday).
days = tips_df.select(tips_df["day"]).distinct()
days.show()

+----+
| day|
+----+
|Thur|
| Sun|
| Sat|
| Fri|
+----+



In [13]:
# Distinct values in the `time` column: only Lunch and Dinner appear in the data.
timings = tips_df.select(tips_df["time"]).distinct()
timings.show()

+------+
|  time|
+------+
| Lunch|
|Dinner|
+------+



In [14]:
# Summary statistics (count, mean, stddev, min, max) as a first look for
# outliers in tip, total_bill and size.
raw_summary = tips_df.describe()
raw_summary.show()

+-------+------------------+------------------+------+------+----+------+------------------+
|summary|        total_bill|               tip|   sex|smoker| day|  time|              size|
+-------+------------------+------------------+------+------+----+------+------------------+
|  count|               244|               244|   244|   244| 244|   244|               244|
|   mean|19.785942622950824|2.9982786885245902|  null|  null|null|  null| 2.569672131147541|
| stddev| 8.902411954856857|1.3836381890011815|  null|  null|null|  null|0.9510998047322347|
|    min|              3.07|               1.0|Female|    No| Fri|Dinner|                 1|
|    max|             50.81|              10.0|  Male|   Yes|Thur| Lunch|                 6|
+-------+------------------+------------------+------+------+----+------+------------------+



In [15]:
# Box plot of total_bill to make outlying bills visible.
fig = px.box(ptips_df, y='total_bill')
fig.show()

In [8]:
# The box plot above shows some outliers in the total_bill column; keep only
# rows at or below the cutoff.
from pyspark.sql.functions import *
# presumably the upper fence read off the box plot -- TODO confirm
total_bill_cutoff = 40.17
tips_df = tips_df.where(col('total_bill') <= total_bill_cutoff)

In [9]:
# Re-check the summary statistics after the total_bill filter.
filtered_summary = tips_df.describe()
filtered_summary.show()

+-------+------------------+------------------+------+------+----+------+-----------------+
|summary|        total_bill|               tip|   sex|smoker| day|  time|             size|
+-------+------------------+------------------+------+------+----+------+-----------------+
|  count|               235|               235|   235|   235| 235|   235|              235|
|   mean|18.798680851063832|2.9014893617021285|  null|  null|null|  null|2.523404255319149|
| stddev| 7.438468357146703|1.2256513682918746|  null|  null|null|  null|0.911984022473405|
|    min|              3.07|               1.0|Female|    No| Fri|Dinner|                1|
|    max|             40.17|              7.58|  Male|   Yes|Thur| Lunch|                6|
+-------+------------------+------------------+------+------+----+------+-----------------+



In [10]:
# The plotting calls below are given a pandas DataFrame, so convert the
# (now filtered) Spark DataFrame again; this replaces the earlier pandas copy.
ptips_df = tips_df.toPandas()

In [19]:
# Pie of the `tip` column split by `sex`. Slice size comes from tip amounts,
# so this compares tip money rather than the number of people who tipped.
# Author's reading of the figure: men account for the larger share.
figure = px.pie(data_frame=ptips_df, names='sex', values='tip')
figure.show()

In [20]:
# Pie of the `tip` column split by party `size`.
# Author's reading of the figure: tables of 2 account for the largest share of tips.
figure = px.pie(data_frame=ptips_df, names='size', values='tip')
figure.show()

In [21]:
# Pie of the `tip` column split by `time`.
# Author's reading of the figure: more tip money is collected at dinner than at lunch.
figure = px.pie(data_frame=ptips_df, names='time', values='tip')
figure.show()

In [33]:
# Preview the filtered Spark DataFrame (defaults: first 20 rows, truncated values).
tips_df.show(n=20, truncate=True)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [21]:
# Regression needs numeric inputs, so each string column is mapped to a numeric
# index with StringIndexer, and the results are packed into one feature vector.
from pyspark.ml.feature import StringIndexer,VectorAssembler
from pyspark.ml import Pipeline

# Categorical columns to index; each produces a "<name>_index" column.
cols = ['sex','smoker','day','time']

# One StringIndexer per column, chained in a Pipeline so they are fitted together.
# The comprehension variable is deliberately not called `col` and does not leak
# into the notebook namespace: a module-level loop variable named `col` would
# rebind the `col` function brought in by `from pyspark.sql.functions import *`
# and break any later (or re-run) `col(...)` call.
indexers = [
    StringIndexer(inputCol=name, outputCol=name + "_index")
    for name in cols
]
pipeline = Pipeline(stages=indexers)
df_indexed = pipeline.fit(tips_df).transform(tips_df)

# Feature order: the four index columns followed by total_bill.
# (`size` is not included in the feature vector.)
indexed_cols = [name + "_index" for name in cols]
indexed_cols.append('total_bill')

# Combine the indexed columns into a single feature vector column
assembler = VectorAssembler(inputCols=indexed_cols, outputCol="features")
df_indexed = assembler.transform(df_indexed)
df_indexed.show(truncate=False)


+----------+----+------+------+---+------+----+---------+------------+---------+----------+-----------------------+
|total_bill|tip |sex   |smoker|day|time  |size|sex_index|smoker_index|day_index|time_index|features               |
+----------+----+------+------+---+------+----+---------+------------+---------+----------+-----------------------+
|16.99     |1.01|Female|No    |Sun|Dinner|2   |1.0      |0.0         |1.0      |0.0       |[1.0,0.0,1.0,0.0,16.99]|
|10.34     |1.66|Male  |No    |Sun|Dinner|3   |0.0      |0.0         |1.0      |0.0       |(5,[2,4],[1.0,10.34])  |
|21.01     |3.5 |Male  |No    |Sun|Dinner|3   |0.0      |0.0         |1.0      |0.0       |(5,[2,4],[1.0,21.01])  |
|23.68     |3.31|Male  |No    |Sun|Dinner|2   |0.0      |0.0         |1.0      |0.0       |(5,[2,4],[1.0,23.68])  |
|24.59     |3.61|Female|No    |Sun|Dinner|4   |1.0      |0.0         |1.0      |0.0       |[1.0,0.0,1.0,0.0,24.59]|
|25.29     |4.71|Male  |No    |Sun|Dinner|4   |0.0      |0.0         |1.

In [22]:
# Keep only what the regression model needs: the feature vector and the
# `tip` label to predict.
model_columns = ['features', 'tip']
tips_model_df = df_indexed.select(model_columns)

In [23]:
# 80/20 train/test split with a fixed seed.
training, test = tips_model_df.randomSplit(weights=[0.8, 0.2], seed=1234)

In [24]:
# NOTE: the sklearn name imported on the next line is immediately rebound by the
# pyspark import that follows it, so `LinearRegression` below is pyspark's.
from sklearn.linear_model import LinearRegression
from pyspark.ml.regression import LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator


# Linear regression on the assembled `features` vector with `tip` as the label;
# regParam=0.0 means no regularisation.
lr = LinearRegression(
    featuresCol='features',
    labelCol='tip',
    maxIter=100,
    regParam=0.0,
)

# Fit on the training split, then score the held-out test split.
lr_model = lr.fit(training)
predictions = lr_model.transform(test)

# Report test-set error as Root Mean Squared Error (RMSE).
evaluator = RegressionEvaluator(
    labelCol='tip',
    predictionCol='prediction',
    metricName='rmse',
)
rmse = evaluator.evaluate(predictions)
print(f'Root Mean Squared Error (RMSE): {rmse}')


Root Mean Squared Error (RMSE): 0.983432600220609


In [29]:
# Fitted coefficients, in the order the features were assembled:
# sex_index, smoker_index, day_index, time_index, total_bill
coefficients = lr_model.coefficients
# Printed as a single vector; the entries are not labelled with feature names.
print(coefficients)

[0.19683026607806714,-0.171977687314607,0.11400759001808707,-0.22561357550255212,0.11001903303019442]


In [43]:
# Score one hand-built customer: male, non-smoker, Sunday, lunch, $20 bill.
# The index values are typed in by hand and must match the StringIndexer
# mapping fitted earlier (the indexed table shows Male=0, No=0, Sun=1, Dinner=0).
input_columns = ["sex_index", "smoker_index", "day_index", "time_index",'total_bill']
input_rows = [(0, 0, 1, 1,20)]
new_data = spark.createDataFrame(input_rows, input_columns)

# Reuse the existing assembler to build `features`, then apply the trained model.
new_data = assembler.transform(new_data)
predictions = lr_model.transform(new_data)

In [44]:
predictions.show()
# The expected tip for this input is about $2.87 (see the `prediction` column).

+---------+------------+---------+----------+----------+--------------------+-----------------+
|sex_index|smoker_index|day_index|time_index|total_bill|            features|       prediction|
+---------+------------+---------+----------+----------+--------------------+-----------------+
|        0|           0|        1|         1|        20|[0.0,0.0,1.0,1.0,...|2.872678340635776|
+---------+------------+---------+----------+----------+--------------------+-----------------+

