In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName('cruise').getOrCreate()

In [3]:
data = spark.read.csv(r"C:\Users\Asus\Desktop\studia big data\Python-and-Spark-for-Big-Data-master\Spark_for_Machine_Learning\Linear_Regression\cruise_ship_info.csv", inferSchema= True, header = True)

In [4]:
data.printSchema

<bound method DataFrame.printSchema of DataFrame[Ship_name: string, Cruise_line: string, Age: int, Tonnage: double, passengers: double, length: double, cabins: double, passenger_density: double, crew: double]>

In [5]:
data.columns

['Ship_name',
 'Cruise_line',
 'Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew']

In [6]:
data_pandas = data.select('Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew')

In [7]:
data_pandas1 = data_pandas.toPandas()

In [8]:
from pyspark.ml.feature import VectorAssembler

In [9]:
import statsmodels.formula.api as smf

In [10]:
def forward_selection(data, response):
    remaining = set(data.columns)
    remaining.remove(response)
    selected = []
    current_score, best_new_score = float('inf'), float('inf')
    while remaining and current_score == best_new_score:
        scores_with_candidates = []
        for candidate in remaining:
            formula = "{} ~ {}".format(response, ' + '.join(selected + [candidate]))
            score = smf.ols(formula, data).fit().aic
            scores_with_candidates.append((score, candidate))
        scores_with_candidates.sort()
        best_new_score, best_candidate = scores_with_candidates.pop(0)
        if current_score > best_new_score:
            remaining.remove(best_candidate)
            selected.append(best_candidate)
            current_score = best_new_score
    formula = "{} ~ {}".format(response, ' + '.join(selected))
    model = smf.ols(formula, data).fit()
    return model, selected
model, selected_columns = forward_selection(data_pandas1, 'crew')

In [11]:
model.summary()

0,1,2,3
Dep. Variable:,crew,R-squared:,0.924
Model:,OLS,Adj. R-squared:,0.922
Method:,Least Squares,F-statistic:,465.3
Date:,"Mon, 15 Apr 2024",Prob (F-statistic):,1.67e-84
Time:,14:34:10,Log-Likelihood:,-218.16
No. Observations:,158,AIC:,446.3
Df Residuals:,153,BIC:,461.6
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.8187,0.585,-1.400,0.164,-1.974,0.337
cabins,0.7908,0.087,9.079,0.000,0.619,0.963
length,0.3976,0.113,3.503,0.001,0.173,0.622
passengers,-0.1499,0.039,-3.863,0.000,-0.226,-0.073
Tonnage,0.0163,0.008,2.010,0.046,0.000,0.032

0,1,2,3
Omnibus:,140.703,Durbin-Watson:,1.829
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2601.251
Skew:,3.098,Prob(JB):,0.0
Kurtosis:,21.888,Cond. No.,642.0


In [12]:
assembler = VectorAssembler(inputCols=['Age',
 'Tonnage',
 'passengers',
 'length',
 'cabins',
 'passenger_density',
 'crew'], outputCol='features')

In [13]:
op = assembler.transform(data)

In [14]:
model = op.select('features', 'crew')

In [15]:
from pyspark.ml.regression import LinearRegression

In [16]:
lr = LinearRegression(labelCol='crew', featuresCol='features')

In [17]:
train_data, test_data = model.randomSplit([0.7, 0.3])

In [18]:
model_lr = lr.fit(train_data)

In [19]:
eva = model_lr.transform(test_data)

In [21]:
eva.show()

+--------------------+-----+------------------+
|            features| crew|        prediction|
+--------------------+-----+------------------+
|[4.0,220.0,54.0,1...| 21.0|              21.0|
|[5.0,160.0,36.34,...| 13.6|13.600000000000014|
|[6.0,110.23899999...| 11.5|11.499999999999996|
|[6.0,112.0,38.0,9...| 10.9|10.900000000000002|
|[8.0,77.499,19.5,...|  9.0| 8.999999999999998|
|[9.0,59.058,17.0,...|  7.4|               7.4|
|[9.0,85.0,19.68,9...| 8.69| 8.690000000000005|
|[9.0,90.09,25.01,...| 8.69| 8.690000000000005|
|[9.0,113.0,26.74,...|12.38|12.379999999999995|
|[10.0,68.0,10.8,7...| 6.36| 6.360000000000001|
|[10.0,91.62700000...|  9.0| 9.000000000000005|
|[10.0,105.0,27.2,...|10.68|10.680000000000005|
|[10.0,110.0,29.74...| 11.6|11.600000000000005|
|[10.0,138.0,31.14...|11.85|11.850000000000012|
|[11.0,91.0,20.32,...| 9.99| 9.989999999999997|
|[11.0,91.62700000...|  9.0| 9.000000000000005|
|[11.0,108.977,26....| 12.0|11.999999999999995|
|[12.0,50.0,7.0,7....| 4.45| 4.450000000