In [1]:
import pyspark
print(pyspark.__version__)

3.5.7


In [2]:
import numpy as n

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('Logistic_Regression_Model').getOrCreate()
spark

### Imports 

In [10]:
from pyspark.sql.functions import *
from pyspark.ml.feature import *

In [4]:
df_main = spark.read.csv('happyscore_income.csv', header=True, inferSchema=True)
df_main.show()


+------------+---------------------+----------------+----------------+------------------+------------------+------------------+--------------------+------------------+-------------------+------------+
|    country0|adjusted_satisfaction|avg_satisfaction|std_satisfaction|        avg_income|     median_income| income_inequality|              region|        happyScore|                GDP|   country10|
+------------+---------------------+----------------+----------------+------------------+------------------+------------------+--------------------+------------------+-------------------+------------+
|     Armenia|                 37.0|             4.9|            2.42|2096.7599999999998|1731.5066666666667|31.445555555555554|'Central and East...|              4.35| 0.7682100000000001|     Armenia|
|      Angola|                 26.0|             4.3|            3.19|           1448.88|           1044.24|             42.72|'Sub-Saharan Africa'|             4.033|            0.75778|      Ang

In [6]:
df_main.printSchema()

root
 |-- country0: string (nullable = true)
 |-- adjusted_satisfaction: double (nullable = true)
 |-- avg_satisfaction: double (nullable = true)
 |-- std_satisfaction: double (nullable = true)
 |-- avg_income: double (nullable = true)
 |-- median_income: double (nullable = true)
 |-- income_inequality: double (nullable = true)
 |-- region: string (nullable = true)
 |-- happyScore: double (nullable = true)
 |-- GDP: double (nullable = true)
 |-- country10: string (nullable = true)



In [8]:
df_main.columns


['country0',
 'adjusted_satisfaction',
 'avg_satisfaction',
 'std_satisfaction',
 'avg_income',
 'median_income',
 'income_inequality',
 'region',
 'happyScore',
 'GDP',
 'country10']

In [11]:
for cols in df_main.columns:
    print(cols,':',df_main.filter(col(cols).isNull()).count())

country0 : 0
adjusted_satisfaction : 0
avg_satisfaction : 0
std_satisfaction : 0
avg_income : 0
median_income : 0
income_inequality : 0
region : 0
happyScore : 0
GDP : 0
country10 : 0


In [15]:
assembler = VectorAssembler(inputCols=['avg_income',
                                        'median_income',
                                        'income_inequality',
                                      'GDP'],
                           outputCol='Features')

assembled_df = assembler.transform(df_main)

In [17]:
model_df = assembled_df.select('Features','happyScore')

In [18]:
model_df.show()

+--------------------+------------------+
|            Features|        happyScore|
+--------------------+------------------+
|[2096.75999999999...|              4.35|
|[1448.88,1044.24,...|             4.033|
|[7101.12,5109.4,4...|             6.574|
|[19457.0399999999...|               7.2|
|[19917.0,15846.06...|             7.284|
|[3381.60000000000...| 5.212000000000001|
|[1265.34,994.1400...|             4.694|
|[17168.505,15166....|             6.937|
|[870.84,630.24,39...|3.5869999999999997|
|[5354.82,4523.565...|             4.218|
|[572.88,436.92,33...|             2.905|
|[989.04,657.0,43....|              3.34|
|[3985.71000000000...|              5.89|
|[5567.235,3294.18...|             6.983|
|[3484.68,1632.6,6...|             4.332|
|[5453.93333333333...|             5.813|
|[20190.78,16829.1...|7.4270000000000005|
|[23400.0399999999...| 7.587000000000001|
|[7557.99,4448.01,...|              6.67|
|[1490.52,1030.08,...|             4.252|
+--------------------+------------

In [19]:
train_df , test_df = model_df.randomSplit(weights=[0.7,0.3],seed=10)

### Linear Regression

In [22]:
from pyspark.ml.regression import LinearRegression 


In [23]:
leanier_regression = LinearRegression(featuresCol='Features',labelCol='happyScore')
lr_model = leanier_regression.fit(train_df)
lr_train_preds = lr_model.transform(train_df)
lr_test_preds = lr_model.transform(test_df)

In [26]:
lr_train_preds.show(3)

+--------------------+-----------------+------------------+
|            Features|       happyScore|        prediction|
+--------------------+-----------------+------------------+
|[572.88,436.92,33...|            2.905|3.8147499443456723|
|[653.04,528.72,36...|4.571000000000001|3.8996136501573195|
|[718.4,535.560000...|            3.845| 3.926014540305673|
+--------------------+-----------------+------------------+
only showing top 3 rows



In [27]:
lr_test_preds.show(3)

+--------------------+------------------+-----------------+
|            Features|        happyScore|       prediction|
+--------------------+------------------+-----------------+
|[574.199999999999...|             3.681|4.091648222215715|
|[714.72,488.52,45...|             4.971|4.003628579567792|
|[870.84,630.24,39...|3.5869999999999997|4.182741073424308|
+--------------------+------------------+-----------------+
only showing top 3 rows



### Evaluation

In [31]:
from pyspark.ml.evaluation import RegressionEvaluator

In [37]:
rsme_evaluator = RegressionEvaluator(labelCol='happyScore',predictionCol='prediction',metricName='rmse')
mae_evaluator = RegressionEvaluator(labelCol='happyScore',predictionCol='prediction',metricName='mae')
r2_evaluator  = RegressionEvaluator(labelCol='happyScore',predictionCol='prediction',metricName='r2')




print('rsme :',rsme_evaluator.evaluate(lr_test_preds))
print('mae :',mae_evaluator.evaluate(lr_test_preds))
print('r2 :',r2_evaluator.evaluate(lr_test_preds))
lr_test_preds.groupBy('prediction','happyScore').count().show()

rsme : 0.7373838122832916
mae : 0.6389550444184331
r2 : 0.6215657008664855
+------------------+------------------+-----+
|        prediction|        happyScore|count|
+------------------+------------------+-----+
| 4.590932867264556|             5.129|    1|
| 6.455042625698127|             7.278|    1|
| 4.785057479276185|              5.36|    1|
| 5.421830409782979|             4.218|    1|
|6.9250616357472055| 6.867000000000001|    1|
| 4.182741073424308|3.5869999999999997|    1|
| 6.428894641471455|              6.67|    1|
| 5.569087385477885|             4.686|    1|
| 5.352405323169796|             5.813|    1|
| 5.335294389125309|             5.855|    1|
| 4.626592740438295|             4.633|    1|
| 5.505314949437785|             6.455|    1|
| 8.007080369902576| 6.946000000000001|    1|
| 4.173539521838036|             2.839|    1|
|  5.50326592914689| 5.877999999999999|    1|
|  5.85981968190919| 5.428999999999999|    1|
| 6.120887157323307|6.7860000000000005|    1|
| 6.6