In [1]:
from pyspark.sql import SparkSession  #객체 생성
spark= SparkSession.builder.appName("taxi-fare-prediction").getOrCreate()

In [4]:
import os
cwd=os.getcwd()
trip_data_path=os.path.join(cwd, 'learning_spark_data', 'trips', '*.csv') #learning_spark_data/trips 폴더 아래의 모든 .csv 파일 경로를 만듦
trip_data_path  #여러 csv 파일 한번에 읽기 준비

'/home/jovyan/work/learning_spark_data/trips/*.csv'

In [6]:
file_path=f"file:///{trip_data_path.replace(os.sep,'/')}"
file_path

# Spark에서 사용할 수 있는 파일 시스템 경로로 변환

'file:////home/jovyan/work/learning_spark_data/trips/*.csv'

In [13]:
trip_df=spark.read.csv(file_path,inferSchema=True, header=True) 
trip_df.printSchema()

#inferSchema=True: 데이터 타입을 자동으로 추론
#header=True: 첫 번째 줄을 컬럼명으로 인식
#printSchema(): 읽어온 DataFrame의 컬럼명, 타입 구조를 출력



root
 |-- VendorID: integer (nullable = true)
 |-- tpep_pickup_datetime: timestamp (nullable = true)
 |-- tpep_dropoff_datetime: timestamp (nullable = true)
 |-- passenger_count: integer (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- RatecodeID: integer (nullable = true)
 |-- store_and_fwd_flag: string (nullable = true)
 |-- PULocationID: integer (nullable = true)
 |-- DOLocationID: integer (nullable = true)
 |-- payment_type: integer (nullable = true)
 |-- fare_amount: double (nullable = true)
 |-- extra: double (nullable = true)
 |-- mta_tax: double (nullable = true)
 |-- tip_amount: double (nullable = true)
 |-- tolls_amount: double (nullable = true)
 |-- improvement_surcharge: double (nullable = true)
 |-- total_amount: double (nullable = true)
 |-- congestion_surcharge: double (nullable = true)



In [None]:
#운행거리에 따른 요금 예측

In [14]:
trip_df.createOrReplaceTempView('trips')

#PySpark DataFrame을 'trips'라는 SQL 테이블처럼 쓸 수 있게 임시 뷰로 등록

In [15]:
query="""
SELECT 
    trip_distance,
    total_amount
FROM trips

WHERE total_amount<5000
AND total_amount>0
AND trip_distance>0
AND trip_distance<500
AND passenger_count<4  
AND TO_DATE(tpep_pickup_datetime)>="2021-01-01"
AND TO_DATE(tpep_pickup_datetime)<"2021-08-01"
"""
#SQL로 데이터 전처리(필터링) 수행
#total_amount: 0~5000 사이 (이상치, 오류값 제거)
#trip_distance: 0~500 사이 (비정상값 제거)
#passenger_count: 4 미만 (일반적인 소형 택시)

In [16]:
trip_df=spark.sql(query)
trip_df.createOrReplaceTempView('data')

In [17]:
spark.sql('select * from data limit 5').show()

+-------------+------------+
|trip_distance|total_amount|
+-------------+------------+
|         16.5|       70.07|
|         1.13|       11.16|
|         2.68|       18.59|
|         12.4|        43.8|
|          9.7|        32.3|
+-------------+------------+



In [24]:
#train, test split 8:2, seed=1 
train_df,test_df=trip_df.randomSplit([0.8,0.2],seed=1)

In [27]:
#vectorassembler>features: trip_distance,target: total_amount
from pyspark.ml.feature import VectorAssembler

vassembler = VectorAssembler(
    inputCols=['trip_distance'],    # 독립변수(피처) 리스트
    outputCol='features'            # ML용 feature vector 컬럼명(관례적으로 'features')
)

vtrain_df=vassembler.transform(train_df)  #학습 데이터에서 벡터어셈블링 적용, 결과:기존 컬럼 + 'features' 컬럼 추가
vtrain_df.show(5)


+-------------+------------+--------+
|trip_distance|total_amount|features|
+-------------+------------+--------+
|         0.01|        3.05|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
|         0.01|         3.3|  [0.01]|
+-------------+------------+--------+
only showing top 5 rows



In [29]:
#linearRegression 생성 maxIter=50, labelCol='total_amount',featureCol='features'
#fit
#vassembler.fransfrom(test)
#model.transfrom

from pyspark.ml.regression import LinearRegression

lr=LinearRegression(
    maxIter=50,
    featuresCol='features',   
    labelCol='total_amount'     #목표가 labelcol ;요금
)

lr_model=lr.fit(vtrain_df) #학습 데이터로 선형회귀 모델학습

vtest_df=vassembler.transform(test_df) #test데이터도 똑같이 features 벡터컬럼추가
pred=lr_model.transform(vtest_df) #테스트셋의 각 row에 대해 예측값 컬럼이 추가된 DataFrame(pred) 생성


In [30]:
pred.show(5)

+-------------+------------+--------+-----------------+
|trip_distance|total_amount|features|       prediction|
+-------------+------------+--------+-----------------+
|         0.01|         3.3|  [0.01]|9.430440745312902|
|         0.01|         3.3|  [0.01]|9.430440745312902|
|         0.01|         3.3|  [0.01]|9.430440745312902|
|         0.01|         3.3|  [0.01]|9.430440745312902|
|         0.01|         3.3|  [0.01]|9.430440745312902|
+-------------+------------+--------+-----------------+
only showing top 5 rows



In [31]:
lr_model.summary.rootMeanSquaredError #RMSE

6.30781413196623

In [32]:
lr_model.summary.r2 

0.7648633777017714

In [None]:
# 새로운 데이터로 예측하기

In [36]:
from pyspark.sql.types import DoubleType

new_distance_list=[1.1, 5.4, 10.2, 30.0]
distance_df=spark.createDataFrame(new_distance_list,DoubleType()).toDF('trip_distance')
distance_df.show()

+-------------+
|trip_distance|
+-------------+
|          1.1|
|          5.4|
|         10.2|
|         30.0|
+-------------+



In [37]:
vdistance_df = vassembler.transform(distance_df)
lr_model.transform(vdistance_df).show()

+-------------+--------+------------------+
|trip_distance|features|        prediction|
+-------------+--------+------------------+
|          1.1|   [1.1]|12.672809485363317|
|          5.4|   [5.4]|25.463805432351194|
|         10.2|  [10.2]| 39.74212648945393|
|         30.0|  [30.0]| 98.64020085000274|
+-------------+--------+------------------+



In [39]:
spark.stop()