In [None]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425344 sha256=1363f20b5eff5ec53bc1130548ea1e6d9e5e8bace1b127251f3318153f28c387
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [None]:
from pyspark.sql.functions import col,lit,when
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import SparkSession
from pyspark.ml.regression import LinearRegression
from pyspark.ml.regression import RandomForestRegressor
spark=SparkSession.builder.appName('MySession').getOrCreate()

In [None]:
df=spark.read.csv("/content/Sample - Superstore.csv",header=True,inferSchema=True)

In [None]:
df.columns
df=df.na.drop()
df.columns

['rowid',
 'orderid',
 'orderdata',
 'shipdate',
 'shipmode',
 'customerid',
 'customername',
 'segment',
 'country',
 'city',
 'state',
 'postalcode',
 'region',
 'productid',
 'category',
 'subcategory',
 'productname',
 'sales',
 'quantity',
 'discount',
 'profit']

In [None]:
selection_df = df.select('region', 'subcategory', 'category', 'quantity', 'sales')
selection_df.printSchema()

root
 |-- region: string (nullable = true)
 |-- subcategory: string (nullable = true)
 |-- category: string (nullable = true)
 |-- quantity: string (nullable = true)
 |-- sales: string (nullable = true)



In [None]:
selection_df = selection_df.withColumn('sales', col('sales').cast('double'))
selection_df = selection_df.withColumn('quantity', col('quantity').cast('double'))

In [None]:
selection_df.show()

+-------+-----------+---------------+--------+--------+
| region|subcategory|       category|quantity|   sales|
+-------+-----------+---------------+--------+--------+
|  South|  Bookcases|      Furniture|     2.0|  261.96|
|  South|     Chairs|      Furniture|     3.0|  731.94|
|   West|     Labels|Office Supplies|     2.0|   14.62|
|  South|     Tables|      Furniture|     5.0|957.5775|
|  South|    Storage|Office Supplies|     2.0|  22.368|
|   West|Furnishings|      Furniture|     7.0|   48.86|
|   West|        Art|Office Supplies|     4.0|    7.28|
|   West|     Phones|     Technology|     6.0| 907.152|
|   West|    Binders|Office Supplies|     3.0|  18.504|
|   West| Appliances|Office Supplies|     5.0|   114.9|
|   West|     Tables|      Furniture|     9.0|1706.184|
|   West|     Phones|     Technology|     4.0| 911.424|
|  South|      Paper|Office Supplies|     3.0|  15.552|
|   West|    Binders|Office Supplies|     3.0| 407.976|
|Central| Appliances|Office Supplies|     5.0|  

In [None]:
selection_df = selection_df.na.drop()

indexer = StringIndexer (inputCols=["region", "subcategory"], outputCols=["region_indexed", "sub-Category_indexed"])
df_i = indexer.fit(selection_df).transform(selection_df)

df_i.show()


+-------+-----------+---------------+--------+--------+--------------+--------------------+
| region|subcategory|       category|quantity|   sales|region_indexed|sub-Category_indexed|
+-------+-----------+---------------+--------+--------+--------------+--------------------+
|  South|  Bookcases|      Furniture|     2.0|  261.96|           3.0|                12.0|
|  South|     Chairs|      Furniture|     3.0|  731.94|           3.0|                 7.0|
|   West|     Labels|Office Supplies|     2.0|   14.62|           0.0|                 9.0|
|  South|     Tables|      Furniture|     5.0|957.5775|           3.0|                10.0|
|  South|    Storage|Office Supplies|     2.0|  22.368|           3.0|                 4.0|
|   West|Furnishings|      Furniture|     7.0|   48.86|           0.0|                 2.0|
|   West|        Art|Office Supplies|     4.0|    7.28|           0.0|                 5.0|
|   West|     Phones|     Technology|     6.0| 907.152|           0.0|          

In [None]:
fa= VectorAssembler (inputCols=['region_indexed', 'sub-Category_indexed', 'quantity', 'sales' ], outputCol="Independent Features")
output = fa.transform(df_i)

In [None]:
final_data = output.select("Independent Features", "sales")
final_data.na.drop()
final_data.show()

+--------------------+--------+
|Independent Features|   sales|
+--------------------+--------+
|[3.0,12.0,2.0,261...|  261.96|
|[3.0,7.0,3.0,731.94]|  731.94|
| [0.0,9.0,2.0,14.62]|   14.62|
|[3.0,10.0,5.0,957...|957.5775|
|[3.0,4.0,2.0,22.368]|  22.368|
| [0.0,2.0,7.0,48.86]|   48.86|
|  [0.0,5.0,4.0,7.28]|    7.28|
|[0.0,3.0,6.0,907....| 907.152|
|[0.0,0.0,3.0,18.504]|  18.504|
| [0.0,8.0,5.0,114.9]|   114.9|
|[0.0,10.0,9.0,170...|1706.184|
|[0.0,3.0,4.0,911....| 911.424|
|[3.0,1.0,3.0,15.552]|  15.552|
|[0.0,0.0,3.0,407....| 407.976|
| [2.0,8.0,5.0,68.81]|   68.81|
| [2.0,0.0,3.0,2.544]|   2.544|
|[2.0,4.0,6.0,665.88]|  665.88|
|  [0.0,4.0,2.0,55.5]|    55.5|
|  [0.0,5.0,2.0,8.56]|    8.56|
|[0.0,3.0,3.0,213.48]|  213.48|
+--------------------+--------+
only showing top 20 rows



In [None]:
train_data, test_data = final_data.randomSplit([0.80, 0.20])
regressor=LinearRegression (featuresCol='Independent Features', labelCol='sales')
regressor = regressor.fit(train_data)

In [None]:
predict_results = regressor.evaluate(test_data)

predict_results. predictions.show()

+--------------------+------+----------+
|Independent Features| sales|prediction|
+--------------------+------+----------+
| [0.0,0.0,1.0,3.856]| 3.856|     3.856|
| [0.0,0.0,1.0,3.912]| 3.912|     3.912|
| [0.0,0.0,1.0,4.752]| 4.752|     4.752|
| [0.0,0.0,1.0,5.022]| 5.022|     5.022|
| [0.0,0.0,1.0,5.682]| 5.682|     5.682|
|[0.0,0.0,1.0,11.416]|11.416|    11.416|
| [0.0,0.0,2.0,5.984]| 5.984|     5.984|
| [0.0,0.0,2.0,6.368]| 6.368|     6.368|
| [0.0,0.0,2.0,7.712]| 7.712|     7.712|
| [0.0,0.0,2.0,8.288]| 8.288|     8.288|
| [0.0,0.0,2.0,9.728]| 9.728|     9.728|
| [0.0,0.0,2.0,9.762]| 9.762|     9.762|
|[0.0,0.0,2.0,12.672]|12.672|    12.672|
| [0.0,0.0,2.0,13.76]| 13.76|     13.76|
|[0.0,0.0,2.0,13.904]|13.904|    13.904|
|[0.0,0.0,2.0,17.456]|17.456|    17.456|
| [0.0,0.0,2.0,22.62]| 22.62|     22.62|
|[0.0,0.0,2.0,22.848]|22.848|    22.848|
|[0.0,0.0,2.0,24.448]|24.448|    24.448|
|[0.0,0.0,2.0,24.704]|24.704|    24.704|
+--------------------+------+----------+
only showing top

In [None]:
print("MAE is ", predict_results.meanAbsoluteError)
print("MSE is ", predict_results.meanSquaredError)
print("RMSE is ",predict_results.rootMeanSquaredError)
print("R2 is ",predict_results.r2)
print("Adj R2 is ", predict_results.r2adj)

MAE is  0.0
MSE is  0.0
RMSE is  0.0
R2 is  1.0
Adj R2 is  1.0
