In [1]:
# Print the installed PySpark version so the environment behind the outputs
# below is recorded alongside them.
import pyspark
print(pyspark.__version__)

3.5.7


In [28]:
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.regression import LinearRegression
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when

In [29]:
# Obtain the Spark session for this notebook (getOrCreate reuses an existing
# one if present). The trailing bare expression makes the notebook display it.
spark = (
    SparkSession.builder
    .appName('pipeline_building')
    .getOrCreate()
)
spark


In [30]:
# Load the CSV, treating the first line as a header and letting Spark infer
# the column types, then preview the first rows and the resulting schema.
df1 = spark.read.csv(
    'happyscore_income.csv',
    header=True,
    inferSchema=True,
)
df1.show(n=5)
df1.printSchema()

+---------+---------------------+----------------+----------------+------------------+------------------+------------------+--------------------+----------+------------------+---------+
| country0|adjusted_satisfaction|avg_satisfaction|std_satisfaction|        avg_income|     median_income| income_inequality|              region|happyScore|               GDP|country10|
+---------+---------------------+----------------+----------------+------------------+------------------+------------------+--------------------+----------+------------------+---------+
|  Armenia|                 37.0|             4.9|            2.42|2096.7599999999998|1731.5066666666667|31.445555555555554|'Central and East...|      4.35|0.7682100000000001|  Armenia|
|   Angola|                 26.0|             4.3|            3.19|           1448.88|           1044.24|             42.72|'Sub-Saharan Africa'|     4.033|           0.75778|   Angola|
|Argentina|                 60.0|             7.1|            1.91|   

In [31]:
# Count the nulls of every column in ONE aggregation (a single Spark job)
# instead of launching a separate filter().count() job per column.
# when() without otherwise() yields null for non-matching rows and count()
# skips nulls, so each aggregate counts exactly the rows where the column
# is null.
null_counts = df1.select(
    [count(when(col(name).isNull(), 1)).alias(name) for name in df1.columns]
).first()

# Same "<column>  : <n>" lines as before, in column order.
for name, n_null in zip(df1.columns, null_counts):
    print(name, ' :', n_null)

country0  : 0
adjusted_satisfaction  : 0
avg_satisfaction  : 0
std_satisfaction  : 0
avg_income  : 0
median_income  : 0
income_inequality  : 0
region  : 0
happyScore  : 0
GDP  : 0
country10  : 0


In [32]:
# Drop 'country10'; in the rows previewed above it repeats the values of
# 'country0'.
df1 = df1.drop('country10')

In [33]:
# Input columns that are packed into the single vector column 'Features'.
feature_cols = [
    'adjusted_satisfaction',
    'avg_satisfaction',
    'std_satisfaction',
    'avg_income',
    'median_income',
    'income_inequality',
    'GDP',
]
assembler = VectorAssembler(inputCols=feature_cols, outputCol='Features')
assembled_df = assembler.transform(df1)

In [34]:
# Peek at two rows to confirm the 'Features' vector column was appended.
assembled_df.show(n=2)

+--------+---------------------+----------------+----------------+------------------+------------------+------------------+--------------------+----------+------------------+--------------------+
|country0|adjusted_satisfaction|avg_satisfaction|std_satisfaction|        avg_income|     median_income| income_inequality|              region|happyScore|               GDP|            Features|
+--------+---------------------+----------------+----------------+------------------+------------------+------------------+--------------------+----------+------------------+--------------------+
| Armenia|                 37.0|             4.9|            2.42|2096.7599999999998|1731.5066666666667|31.445555555555554|'Central and East...|      4.35|0.7682100000000001|[37.0,4.9,2.42,20...|
|  Angola|                 26.0|             4.3|            3.19|           1448.88|           1044.24|             42.72|'Sub-Saharan Africa'|     4.033|           0.75778|[26.0,4.3,3.19,14...|
+--------+----------

In [35]:
# Keep only what the regressors need: the feature vector and the target.
model_df = assembled_df.select(
    'Features',
    'happyScore',
)
model_df.show(n=5)

+--------------------+----------+
|            Features|happyScore|
+--------------------+----------+
|[37.0,4.9,2.42,20...|      4.35|
|[26.0,4.3,3.19,14...|     4.033|
|[60.0,7.1,1.91,71...|     6.574|
|[59.0,7.2,2.11,19...|       7.2|
|[65.0,7.6,1.8,199...|     7.284|
+--------------------+----------+
only showing top 5 rows



In [36]:
# 80/20 train/test split with an explicit seed.
train_df, test_df = model_df.randomSplit([0.8, 0.2], seed=10)

### Linear Regression

In [37]:
# Fit a linear regression with default settings on the training split, then
# score both splits so train and test metrics can be compared later.
lr = LinearRegression(labelCol='happyScore', featuresCol='Features')
lr_model = lr.fit(train_df)
lr_train_preds = lr_model.transform(train_df)
lr_test_preds = lr_model.transform(test_df)

In [38]:
# Preview the predictions: training split first, then the test split.
for preds in (lr_train_preds, lr_test_preds):
    preds.show(5)

+--------------------+----------+------------------+
|            Features|happyScore|        prediction|
+--------------------+----------+------------------+
|[19.0,2.5,2.26,94...|     3.781| 3.091301968923626|
|[22.0,2.6,2.08,93...|     2.839| 3.265054222951578|
|[26.0,4.3,3.19,14...|     4.033|3.7427627532601218|
|[27.0,3.5,2.36,85...|     4.507| 3.646746297902087|
|[32.0,4.7,2.9,903...|     3.995| 3.910158568125093|
+--------------------+----------+------------------+
only showing top 5 rows

+--------------------+------------------+------------------+
|            Features|        happyScore|        prediction|
+--------------------+------------------+------------------+
|[20.0,3.0,2.7,989...|              3.34| 3.188822305644581|
|[25.0,2.9,1.96,57...|             2.905| 3.354657679457719|
|[30.0,3.7,2.14,14...|             4.419| 3.806453099914407|
|[33.0,4.2,2.25,10...|             4.512|3.9225381987282586|
|[34.0,4.2,2.21,25...|3.8960000000000004| 4.431764541786329|
+---------

### Evaluation

In [39]:
from pyspark.ml.evaluation import RegressionEvaluator

In [43]:
# One evaluator per metric; all of them compare the 'happyScore' label with
# the 'prediction' column and differ only in metricName.
_eval_cols = {'labelCol': 'happyScore', 'predictionCol': 'prediction'}
mse_evaluator = RegressionEvaluator(metricName='mse', **_eval_cols)
rmse_evaluator = RegressionEvaluator(metricName='rmse', **_eval_cols)
r2_evaluator = RegressionEvaluator(metricName='r2', **_eval_cols)


def myfun(model_pred):
    """Print MSE, RMSE and R2 for a predictions DataFrame.

    Each metric comes from the matching module-level evaluator
    (``mse_evaluator``, ``rmse_evaluator``, ``r2_evaluator``), evaluated in
    that order. Nothing is returned; the values are only printed.

    Args:
        model_pred: object passed unchanged to each evaluator's ``evaluate``.
    """
    labelled_evaluators = (
        ('mean squared error :', mse_evaluator),
        ('root mean sqaured error :', rmse_evaluator),
        ('r2 :', r2_evaluator),
    )
    for label, evaluator in labelled_evaluators:
        print(label, evaluator.evaluate(model_pred))

In [44]:
# Report MSE / RMSE / R2 for the linear model's predictions on the training split.
print('Linear Regression Train Score')
myfun(lr_train_preds)

Linear Regression Train Score
mean squared error : 0.1801599522508689
root mean sqaured error : 0.424452532388333
r2 : 0.8758159247278909


In [45]:
# Report MSE / RMSE / R2 for the linear model's predictions on the held-out test split.
print('Linear Regression Test Score')
myfun(lr_test_preds)

Linear Regression Test Score
mean squared error : 0.30218008481740244
root mean sqaured error : 0.5497090910812759
r2 : 0.7260726317087502


### Ridge Regression

In [None]:
# pyspark.ml.regression has no separate Ridge estimator, so there is nothing
# extra to import here: ridge (L2-penalised) regression is LinearRegression
# with regParam > 0 and elasticNetParam=0.0, and LinearRegression is already
# imported at the top of the notebook.

In [47]:
# Ridge regression: elasticNetParam=0.0 selects a pure L2 penalty whose
# strength is set by regParam.
ridge_lr = LinearRegression(
    featuresCol='Features',
    labelCol='happyScore',
    maxIter=10,
    regParam=0.1,
    elasticNetParam=0.0,
)
ridge_model = ridge_lr.fit(train_df)
rm_train_preds = ridge_model.transform(train_df)
rm_test_preds = ridge_model.transform(test_df)


In [48]:
# Metrics for the ridge model on the training split. The heading names the
# ridge model so these numbers are not mistaken for the plain linear fit.
print('Ridge Regression Train Score')
myfun(rm_train_preds)

Linear Regression Train Score
mean squared error : 0.18336033814271782
root mean sqaured error : 0.4282059529510511
r2 : 0.8736099019268871


In [49]:
# Metrics for the ridge model on the held-out test split. The heading names
# the ridge model so these numbers are not mistaken for the plain linear fit.
print('Ridge Regression Test Score')
myfun(rm_test_preds)

Linear Regression Test Score
mean squared error : 0.305523987420018
root mean sqaured error : 0.5527422432020354
r2 : 0.723041371590102
