### Regression 비교

* pyspark 이용

In [None]:
import findspark

# findspark.init() must run before pyspark is imported so the Spark
# installation can be located from this Python environment.
findspark.init()

from pyspark.sql import SparkSession

# Create (or reuse) a Spark session running locally on all available cores.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

In [None]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.regression import LinearRegression

In [None]:
# Load the CSV into a Spark DataFrame: the first row supplies the column names
# and column types are inferred from the data.
dataset = spark.read.csv('BostonHousing.csv', header=True, inferSchema=True)

In [None]:
# Print the column names and types of the loaded Spark DataFrame to check how the CSV was parsed.
dataset.printSchema()

root
 |-- crim: double (nullable = true)
 |-- zn: double (nullable = true)
 |-- indus: double (nullable = true)
 |-- chas: integer (nullable = true)
 |-- nox: double (nullable = true)
 |-- rm: double (nullable = true)
 |-- age: double (nullable = true)
 |-- dis: double (nullable = true)
 |-- rad: integer (nullable = true)
 |-- tax: integer (nullable = true)
 |-- ptratio: double (nullable = true)
 |-- b: double (nullable = true)
 |-- lstat: double (nullable = true)
 |-- medv: double (nullable = true)



In [None]:
# Predictor columns to pack into one vector column; 'medv' is left out because it is the label.
feature_cols = [
    'crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age',
    'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat',
]
assembler = VectorAssembler(inputCols=feature_cols, outputCol='Attributes')

# Append the assembled 'Attributes' vector column to the dataset.
output = assembler.transform(dataset)

# Keep only the model input (feature vector) and the target (medv).
finalized_data = output.select("Attributes", "medv")

finalized_data.show()

+--------------------+----+
|          Attributes|medv|
+--------------------+----+
|[0.00632,18.0,2.3...|24.0|
|[0.02731,0.0,7.07...|21.6|
|[0.02729,0.0,7.07...|34.7|
|[0.03237,0.0,2.18...|33.4|
|[0.06905,0.0,2.18...|36.2|
|[0.02985,0.0,2.18...|28.7|
|[0.08829,12.5,7.8...|22.9|
|[0.14455,12.5,7.8...|27.1|
|[0.21124,12.5,7.8...|16.5|
|[0.17004,12.5,7.8...|18.9|
|[0.22489,12.5,7.8...|15.0|
|[0.11747,12.5,7.8...|18.9|
|[0.09378,12.5,7.8...|21.7|
|[0.62976,0.0,8.14...|20.4|
|[0.63796,0.0,8.14...|18.2|
|[0.62739,0.0,8.14...|19.9|
|[1.05393,0.0,8.14...|23.1|
|[0.7842,0.0,8.14,...|17.5|
|[0.80271,0.0,8.14...|20.2|
|[0.7258,0.0,8.14,...|18.2|
+--------------------+----+
only showing top 20 rows



In [None]:
# Split into training (80%) and test (20%) sets. A fixed seed keeps the split,
# and therefore the fitted coefficients and predictions, repeatable across runs.
train_data, test_data = finalized_data.randomSplit([0.8, 0.2], seed=0)

# Linear regression estimator: features are read from 'Attributes', the label from 'medv'.
lr = LinearRegression(featuresCol='Attributes', labelCol='medv')

# Fit on the training set; `regressor` is the fitted model that later cells read.
regressor = lr.fit(train_data)

# Evaluate the fitted model on the held-out test set.
pred = regressor.evaluate(test_data)

# Show the test rows together with their predicted medv.
pred.predictions.show()

+--------------------+----+------------------+
|          Attributes|medv|        prediction|
+--------------------+----+------------------+
|[0.00906,90.0,2.9...|32.2|31.415705899406454|
|[0.03427,0.0,5.19...|19.5|20.603865195723916|
|[0.0351,95.0,2.68...|48.5|41.548991160521624|
|[0.03548,80.0,3.6...|20.9| 21.91576911001266|
|[0.03768,80.0,1.5...|34.6|34.383204565335824|
|[0.03932,0.0,3.41...|22.0|27.541413320845876|
|[0.04113,25.0,4.8...|28.0|28.477880149263303|
|[0.04544,0.0,3.24...|19.8|21.795837153677322|
|[0.0456,0.0,13.89...|23.3|26.054960434061307|
|[0.0459,52.5,5.32...|22.3|27.245409642101517|
|[0.05644,40.0,6.4...|32.4| 35.47184338431088|
|[0.05735,0.0,4.49...|26.6|27.923774620312745|
|[0.06129,20.0,3.3...|46.0| 39.54411979880592|
|[0.06151,0.0,5.19...|18.7| 21.80584440873047|
|[0.06664,0.0,4.05...|29.4|30.802540286819568|
|[0.0686,0.0,2.89,...|33.2|32.154411709098405|
|[0.06905,0.0,2.18...|36.2|28.071431902030373|
|[0.06911,45.0,3.4...|30.5|30.159815144224222|
|[0.07151,0.0

In [None]:
# Fitted coefficients of the linear model (one per entry of the 'Attributes' vector)
coeff = regressor.coefficients

# Fitted intercept term of the linear model
intr = regressor.intercept

print("The coefficient of the model is : %a" % (coeff,))
print("The Intercept of the model is : %f" % (intr,))

The coefficient of the model is : DenseVector([-0.0717, 0.0433, -0.0076, 2.1433, -17.7221, 3.5495, -0.0015, -1.4504, 0.3026, -0.0131, -0.9234, 0.0099, -0.5173])
The Intercept of the model is : 37.804253


In [None]:
%%time
dataset = spark.read.csv('BostonHousing.csv',inferSchema=True, header =True)
assembler = VectorAssembler(inputCols=['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax', 'ptratio', 'b', 'lstat'], outputCol = 'Attributes')
output = assembler.transform(dataset)
finalized_data = output.select("Attributes","medv")

train_data,test_data = finalized_data.randomSplit([0.8,0.2])
regressor = LinearRegression(featuresCol = 'Attributes', labelCol = 'medv')
regressor = regressor.fit(train_data)

pred = regressor.evaluate(test_data)
pred.predictions.show()

+--------------------+----+------------------+
|          Attributes|medv|        prediction|
+--------------------+----+------------------+
|[0.00906,90.0,2.9...|32.2|31.497716888578577|
|[0.0136,75.0,4.0,...|18.9| 14.43377657517383|
|[0.01501,80.0,2.0...|24.5| 27.72969736965767|
|[0.02177,82.5,2.0...|42.3| 37.15104705800279|
|[0.02763,75.0,2.9...|30.8|31.300634515854355|
|[0.02875,28.0,15....|25.0| 29.16446270982506|
|[0.0315,95.0,1.47...|34.9|29.959567993609966|
|[0.03359,75.0,2.9...|34.9| 34.27012136855967|
|[0.03584,80.0,3.3...|23.5| 30.37312896750382|
|[0.03705,20.0,3.3...|35.4|34.723568098221534|
|[0.03738,0.0,5.19...|20.7|21.662935827412543|
|[0.03871,52.5,5.3...|23.2|27.234512447951218|
|[0.03961,0.0,5.19...|21.1|20.637905692138418|
|[0.04301,80.0,1.9...|18.2|13.943064854566039|
|[0.04337,21.0,5.6...|20.5|24.075456546399604|
|[0.04379,80.0,3.3...|19.4|25.308112531037022|
|[0.05602,0.0,2.46...|50.0| 36.00751571046535|
|[0.06047,0.0,2.46...|29.6|24.426384412065953|
|[0.06127,40.

* 기본 파이썬 이용

In [None]:
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm

In [None]:
# Reload the same CSV with pandas. NOTE: this rebinds `dataset`, which held the Spark DataFrame
# in the PySpark section, so the Spark cells must re-read the file before being rerun.
dataset = pd.read_csv('BostonHousing.csv')

In [None]:
# List the column names of the pandas DataFrame.
dataset.columns

Index(['crim', 'zn', 'indus', 'chas', 'nox', 'rm', 'age', 'dis', 'rad', 'tax',
       'ptratio', 'b', 'lstat', 'medv'],
      dtype='object')

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as the test set; random_state=0 makes the split repeatable.
train, test = train_test_split(dataset, test_size=0.2, random_state=0)

In [None]:
# Test-set predictors: every column except the target.
x_test = test.drop(columns=['medv'])
# Test-set target values.
y_test = test['medv']

In [None]:
# Ordinary least squares on the training split using all 13 predictors.
# 'indus' is included so this model is fit on the same feature set as the
# Spark LinearRegression it is being compared against.
model = smf.ols(
    formula='medv ~ crim+zn+indus+chas+nox+rm+age+dis+rad+tax+ptratio+b+lstat',
    data=train,
)
result = model.fit()

In [None]:
# Display the fitted OLS summary table (coefficients, standard errors, fit statistics).
result.summary()

0,1,2,3
Dep. Variable:,medv,R-squared:,0.773
Model:,OLS,Adj. R-squared:,0.766
Method:,Least Squares,F-statistic:,111.0
Date:,"Wed, 05 May 2021",Prob (F-statistic):,8.98e-118
Time:,17:22:45,Log-Likelihood:,-1171.5
No. Observations:,404,AIC:,2369.0
Df Residuals:,391,BIC:,2421.0
Df Model:,12,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,38.0608,5.504,6.915,0.000,27.240,48.882
crim,-0.1196,0.037,-3.267,0.001,-0.192,-0.048
zn,0.0447,0.014,3.110,0.002,0.016,0.073
chas,2.3477,0.897,2.616,0.009,0.583,4.112
nox,-16.0297,4.064,-3.944,0.000,-24.020,-8.039
rm,3.7051,0.455,8.142,0.000,2.810,4.600
age,-0.0031,0.014,-0.220,0.826,-0.031,0.025
dis,-1.3905,0.208,-6.671,0.000,-1.800,-0.981
rad,0.2423,0.066,3.646,0.000,0.112,0.373

0,1,2,3
Omnibus:,141.572,Durbin-Watson:,1.997
Prob(Omnibus):,0.0,Jarque-Bera (JB):,630.874
Skew:,1.47,Prob(JB):,1.0199999999999999e-137
Kurtosis:,8.37,Cond. No.,15400.0


In [None]:
# Predict medv for the held-out rows.
pred = result.predict(x_test)

# `test` comes out of train_test_split as a slice of `dataset`, so assigning a
# column in place raises SettingWithCopyWarning. assign() returns a new frame
# with the 'pred' column added and leaves the original slice untouched.
test = test.assign(pred=pred)
test

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv,pred
329,0.06724,0.0,3.24,0,0.460,6.333,17.2,5.2146,4,430,16.9,375.21,7.34,22.6,24.929923
371,9.23230,0.0,18.10,0,0.631,6.216,100.0,1.1691,24,666,20.2,366.15,9.53,50.0,23.715148
219,0.11425,0.0,13.89,1,0.550,6.373,92.4,3.3633,5,276,16.4,393.74,10.50,23.0,29.342100
403,24.80170,0.0,18.10,0,0.693,5.349,96.0,1.7028,24,666,20.2,396.90,19.77,8.3,12.124717
78,0.05646,0.0,12.83,0,0.437,6.232,53.7,5.0141,5,398,18.7,386.40,12.34,21.2,21.427854
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,0.02055,85.0,0.74,0,0.410,6.383,35.7,9.1876,2,313,17.3,396.90,5.77,24.7,25.451330
455,4.75237,0.0,18.10,0,0.713,6.525,86.5,2.4358,24,666,20.2,50.92,18.13,14.1,15.573295
60,0.14932,25.0,5.13,0,0.453,5.741,66.2,7.2254,8,284,19.7,395.11,13.15,18.7,17.933417
213,0.14052,0.0,10.59,0,0.489,6.375,32.3,3.9454,4,277,18.6,385.81,9.38,28.1,25.293344


In [None]:
%%time

dataset = pd.read_csv('BostonHousing.csv')
train, test = train_test_split(dataset, test_size=0.2, random_state=0)
x_test = test.drop(['medv'], axis=1)
y_test = test['medv']
model = smf.ols(formula ='medv~ crim+zn+chas+nox+rm+age+dis+rad+tax+ptratio+b+lstat',data = train)
result=model.fit()
pred = result.predict(x_test)
test["pred"] = pred
test

Wall time: 60 ms


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv,pred
329,0.06724,0.0,3.24,0,0.460,6.333,17.2,5.2146,4,430,16.9,375.21,7.34,22.6,24.929923
371,9.23230,0.0,18.10,0,0.631,6.216,100.0,1.1691,24,666,20.2,366.15,9.53,50.0,23.715148
219,0.11425,0.0,13.89,1,0.550,6.373,92.4,3.3633,5,276,16.4,393.74,10.50,23.0,29.342100
403,24.80170,0.0,18.10,0,0.693,5.349,96.0,1.7028,24,666,20.2,396.90,19.77,8.3,12.124717
78,0.05646,0.0,12.83,0,0.437,6.232,53.7,5.0141,5,398,18.7,386.40,12.34,21.2,21.427854
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,0.02055,85.0,0.74,0,0.410,6.383,35.7,9.1876,2,313,17.3,396.90,5.77,24.7,25.451330
455,4.75237,0.0,18.10,0,0.713,6.525,86.5,2.4358,24,666,20.2,50.92,18.13,14.1,15.573295
60,0.14932,25.0,5.13,0,0.453,5.741,66.2,7.2254,8,284,19.7,395.11,13.15,18.7,17.933417
213,0.14052,0.0,10.59,0,0.489,6.375,32.3,3.9454,4,277,18.6,385.81,9.38,28.1,25.293344
