# Preprocessing

In [None]:
# import library yang diperlukan
import pandas as pd
from math import radians, sin, cos, sqrt, atan2
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
# Load the raw bus GPS log.
df = pd.read_csv('dataset_bus.csv')
# Keep only the rows recorded on Sunday ('Minggu') so that the data
# describes a single departure route.
is_sunday = df['day'].eq('Minggu')
bus_minggu = df.loc[is_sunday]
print(bus_minggu)

          lat         lng     day      time
0   -6.933546  107.716174  Minggu  09:54:54
1   -6.933546  107.716174  Minggu  09:54:59
2   -6.933546  107.716174  Minggu  09:55:04
3   -6.933322  107.716159  Minggu  09:55:09
4   -6.933322  107.716159  Minggu  09:55:15
..        ...         ...     ...       ...
839 -6.946155  107.595282  Minggu  11:08:37
840 -6.946155  107.595282  Minggu  11:08:43
841 -6.946155  107.595282  Minggu  11:08:48
842 -6.946155  107.595282  Minggu  11:08:53
843 -6.946155  107.595282  Minggu  11:08:58

[844 rows x 4 columns]


In [None]:
# import library yang diperlukan
import pandas as pd
from math import radians, sin, cos, sqrt, atan2
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
# Load the raw bus GPS log.
df = pd.read_csv('dataset_bus.csv')
# Keep only the Sunday ('Minggu') rows so the data describes a single
# departure route, drop repeated (lat, lng) positions (first occurrence
# wins) and renumber the rows from 0.
is_sunday = df['day'].eq('Minggu')
bus_minggu = (
    df.loc[is_sunday]
    .drop_duplicates(subset=['lat', 'lng'], keep='first')
    .reset_index(drop=True)
)
# Parse the time strings once; the parsed series feeds both derived columns.
parsed_time = pd.to_datetime(bus_minggu['time'], format="%H:%M:%S")
# 'waktu' = seconds elapsed since the previous remaining row (NaN on row 0).
bus_minggu['waktu'] = parsed_time.diff().dt.total_seconds()
# Store 'time' as plain time-of-day objects.
bus_minggu['time'] = parsed_time.dt.time
print(bus_minggu)

          lat         lng     day      time  waktu
0   -6.933546  107.716174  Minggu  09:54:54    NaN
1   -6.933322  107.716159  Minggu  09:55:09   15.0
2   -6.932951  107.716010  Minggu  09:55:20   11.0
3   -6.931992  107.714652  Minggu  09:55:56   36.0
4   -6.931612  107.713275  Minggu  09:56:27   31.0
..        ...         ...     ...       ...    ...
153 -6.945234  107.590044  Minggu  11:05:59   56.0
154 -6.945661  107.589940  Minggu  11:06:50   51.0
155 -6.947000  107.593947  Minggu  11:07:36   46.0
156 -6.946676  107.595240  Minggu  11:08:17   41.0
157 -6.946155  107.595282  Minggu  11:08:27   10.0

[158 rows x 5 columns]


In [None]:
# Compute the distance between consecutive points and the resulting speed.

# Row 0 has no predecessor, so it keeps these 0.0 defaults.
bus_minggu['jarak'] = 0.0
bus_minggu['kecepatan'] = 0.0
# Haversine formula on (lat, lng); each row stores the leg that ARRIVES at it.
for row in range(1, len(bus_minggu)):
    prev_lat = radians(bus_minggu.at[row - 1, 'lat'])
    prev_lon = radians(bus_minggu.at[row - 1, 'lng'])
    cur_lat = radians(bus_minggu.at[row, 'lat'])
    cur_lon = radians(bus_minggu.at[row, 'lng'])

    half_dlat = (cur_lat - prev_lat) / 2
    half_dlon = (cur_lon - prev_lon) / 2
    hav = sin(half_dlat) ** 2 + cos(prev_lat) * cos(cur_lat) * sin(half_dlon) ** 2
    central_angle = 2 * atan2(sqrt(hav), sqrt(1 - hav))

    leg_km = 6371 * central_angle  # Earth radius in kilometers
    elapsed_hours = bus_minggu.at[row, 'waktu'] / 3600  # 'waktu' is in seconds

    # Store the leg length (km) and the speed over that leg (km/h).
    bus_minggu.at[row, 'jarak'] = leg_km
    bus_minggu.at[row, 'kecepatan'] = leg_km / elapsed_hours
print(bus_minggu)


          lat         lng     day      time  waktu     jarak  kecepatan
0   -6.933546  107.716174  Minggu  09:54:54    NaN  0.000000   0.000000
1   -6.933322  107.716159  Minggu  09:55:09   15.0  0.024963   5.991032
2   -6.932951  107.716010  Minggu  09:55:20   11.0  0.044411  14.534508
3   -6.931992  107.714652  Minggu  09:55:56   36.0  0.183959  18.395884
4   -6.931612  107.713275  Minggu  09:56:27   31.0  0.157760  18.320531
..        ...         ...     ...       ...    ...       ...        ...
153 -6.945234  107.590044  Minggu  11:05:59   56.0  0.237011  15.236427
154 -6.945661  107.589940  Minggu  11:06:50   51.0  0.048848   3.448110
155 -6.947000  107.593947  Minggu  11:07:36   46.0  0.466676  36.522476
156 -6.946676  107.595240  Minggu  11:08:17   41.0  0.147197  12.924576
157 -6.946155  107.595282  Minggu  11:08:27   10.0  0.058118  20.922390

[158 rows x 7 columns]


In [None]:
from geopy.distance import geodesic
from shapely.geometry import Point, LineString
# Helper for the distance between two coordinates (not called in this cell).
def distance(coord1, coord2):
    """Return the geodesic distance between two coordinates, in meters.

    Both arguments are passed straight to ``geopy.distance.geodesic``;
    presumably they are (lat, lng) pairs -- confirm against callers.
    """
    return geodesic(coord1, coord2).meters


# errors='ignore' keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
bus_minggu = bus_minggu.drop(columns=['day', 'time'], errors='ignore')
bus_data = bus_minggu  # bus route (same object as bus_minggu, not a copy)
hambatan_data = pd.read_csv('hambatan.csv')  # obstacles

# Width of the corridor around each leg inside which an obstacle counts.
# NOTE(review): the coordinates are raw degrees, so radius / 1000.0 = 0.005 is
# a buffer of 0.005 DEGREES (roughly 550 m at this latitude), not 5 m, 50 m or
# 5 km. The value is kept as-is so results do not change; pick the intended
# distance and convert it to degrees (or project the data) if this is wrong.
radius = 5
buffer_deg = radius / 1000.0

# The obstacle points do not depend on the leg, so build them once.
# Axis order is (latitude, longitude), matching the leg end points below.
hambatan_points = [
    (Point(lat, lng), skor)
    for lat, lng, skor in zip(
        hambatan_data['latitude'],
        hambatan_data['longitude'],
        hambatan_data['skor_hambatan'],
    )
]

# Row 0 has no incoming leg, so its 'skor_hambatan' stays NaN.
for i in range(1, len(bus_data)):
    start_point = (bus_data.iloc[i - 1]['lat'], bus_data.iloc[i - 1]['lng'])
    end_point = (bus_data.iloc[i]['lat'], bus_data.iloc[i]['lng'])
    line = LineString([start_point, end_point])
    buffered_line = line.buffer(buffer_deg)
    # Sum the scores of every obstacle that falls inside the leg's corridor.
    hambatan_skor = 0
    for hambatan_point, skor in hambatan_points:
        if buffered_line.contains(hambatan_point):
            hambatan_skor += skor
    bus_data.at[i, 'skor_hambatan'] = hambatan_skor
print(bus_data)

          lat         lng  waktu     jarak  kecepatan  hambatan  skor_hambatan
0   -6.933546  107.716174    NaN  0.000000   0.000000       NaN            NaN
1   -6.933322  107.716159   15.0  0.024963   5.991032       2.2            2.2
2   -6.932951  107.716010   11.0  0.044411  14.534508       2.6            2.6
3   -6.931992  107.714652   36.0  0.183959  18.395884       2.6            2.6
4   -6.931612  107.713275   31.0  0.157760  18.320531       2.6            2.6
..        ...         ...    ...       ...        ...       ...            ...
153 -6.945234  107.590044   56.0  0.237011  15.236427      15.2           15.2
154 -6.945661  107.589940   51.0  0.048848   3.448110      14.4           14.4
155 -6.947000  107.593947   46.0  0.466676  36.522476      20.0           20.0
156 -6.946676  107.595240   41.0  0.147197  12.924576      20.0           20.0
157 -6.946155  107.595282   10.0  0.058118  20.922390       8.2            8.2

[158 rows x 7 columns]


In [None]:
# Write the per-leg route table (bus_data) to CSV, without the index column.
bus_data.to_csv('data_baru.csv', index=False)

# Data Augmentation

In [None]:
from geopy.distance import geodesic
from shapely.geometry import Point, LineString

import pandas as pd
import numpy as np
# Per-leg route table written by the preprocessing section.
data = pd.read_csv('data_baru.csv')
# Stop table (presumably bus stops -- confirm); its per-leg metric columns
# are discarded here and recomputed further down.
metric_columns = ['jarak', 'kecepatan', 'waktu', 'skor_hambatan']
halte = pd.read_csv('train1.csv').drop(columns=metric_columns)
# Obstacle table.
hambatan_data = pd.read_csv('hambatan.csv')

def nearest_point(titik, rute):
    """Return the index of the route point closest to ``titik``.

    Closeness is the Euclidean norm of the coordinate difference (raw
    coordinate space, no geodesic correction).

    Parameters
    ----------
    titik : array-like
        Coordinates of the query point.
    rute : sequence of array-like
        Route points, each with the same number of coordinates as ``titik``.

    Returns
    -------
    int or None
        Position in ``rute`` of the closest point; the first one wins on a
        tie. ``None`` when ``rute`` is empty or no point has a finite
        distance to ``titik``.
    """
    route = np.asarray(rute, dtype=float)
    if route.size == 0:
        return None
    # One row per route point, so the norm below is taken per point.
    route = route.reshape(len(route), -1)
    target = np.asarray(titik, dtype=float).reshape(-1)

    distances = np.linalg.norm(route - target, axis=1)
    # NaN or infinite distances can never be "the nearest": mask them out.
    candidates = np.where(distances < np.inf, distances, np.inf)
    if not np.isfinite(candidates).any():
        return None
    # argmin returns the first occurrence of the minimum.
    return int(np.argmin(candidates))

# Per-leg arrays from the route table. Entry j describes the leg that ARRIVES
# at route point j (distance / time / speed from point j-1 to point j).
jarak = data['jarak'].values
kecepatan = data['kecepatan'].values
waktu = data['waktu'].values
skor_hambatan = data['skor_hambatan'].values  # loaded but not used below
rute = data[['lat', 'lng']].values
titik = halte[['lat', 'lng']].values

halte['titik_terdekat'] = 0
# Float from the start: the totals assigned below are floats, and writing a
# float into an int column triggers a pandas dtype warning.
halte['jarak_total'] = 0.0

# Match EVERY stop (including the first one) to its nearest route point.
for i in range(len(halte)):
    halte.at[i, 'titik_terdekat'] = nearest_point(titik[i], rute)

# Aggregate the legs travelled between consecutive stops. Going from route
# point `prev` to route point `cur` uses the legs arriving at prev+1 .. cur.
titik_idx = halte['titik_terdekat'].values
for i in range(1, len(titik_idx)):
    legs = slice(titik_idx[i - 1] + 1, titik_idx[i] + 1)
    k = len(jarak[legs])

    halte.at[i, 'jarak_total'] = jarak[legs].sum()
    halte.at[i, 'waktu_total'] = waktu[legs].sum()
    if k != 0:
        # Unweighted mean of the per-leg speeds; left unset (NaN) when the two
        # stops map to the same route point.
        halte.at[i, 'kecepatan_total'] = kecepatan[legs].mean()

# The obstacle points do not depend on the segment, so build them once.
# Axis order is (latitude, longitude), matching the stop coordinates below.
hambatan_points = [
    (Point(lat, lng), skor)
    for lat, lng, skor in zip(
        hambatan_data['latitude'],
        hambatan_data['longitude'],
        hambatan_data['skor_hambatan'],
    )
]

results = []
for i in range(1, len(halte)):
    start_point = (halte.iloc[i - 1]['lat'], halte.iloc[i - 1]['lng'])
    end_point = (halte.iloc[i]['lat'], halte.iloc[i]['lng'])
    # NOTE(review): this is the STRAIGHT line between two stops, not the path
    # along the route, and the buffer of 5 / 1000.0 is in degrees (roughly
    # 550 m at this latitude). Confirm both are intended.
    line = LineString([start_point, end_point])
    buffered_line = line.buffer(5 / 1000.0)

    hambatan_skor = 0
    for hambatan_point, skor in hambatan_points:
        if buffered_line.contains(hambatan_point):
            hambatan_skor += skor

    # Keep the result for this segment.
    results.append({
        'skor_hambatan': hambatan_skor
    })

    halte.at[i, 'hambatan_total'] = hambatan_skor

print(halte)



        lat         lng  titik_terdekat  jarak_total  waktu_total  \
0 -6.933322  107.716159               0     0.000000          NaN   
1 -6.921177  107.619915             102    13.114767       2778.0   
2 -6.946155  107.595282             156     6.350630       1625.0   

   kecepatan_total  hambatan_total  
0              NaN             NaN  
1        17.312094            20.1  
2        13.779066            30.6  


In [None]:
# Map the aggregated column names onto the names used by the per-leg table.
column_renames = {
    'jarak_total': 'jarak',
    'waktu_total': 'waktu',
    'kecepatan_total': 'kecepatan',
    'hambatan_total': 'hambatan',
}
# Final column order; any column not listed here is dropped.
column_order = ['lat', 'lng', 'waktu', 'jarak', 'kecepatan', 'hambatan']
halte = halte.rename(columns=column_renames)[column_order]
print(halte)


        lat         lng   waktu      jarak  kecepatan  hambatan
0 -6.933322  107.716159     NaN   0.000000        NaN       NaN
1 -6.921177  107.619915  2778.0  13.114767  17.312094      20.1
2 -6.946155  107.595282  1625.0   6.350630  13.779066      30.6


In [None]:
# Write the per-stop table (halte) to CSV, without the index column.
halte.to_csv('data_sementara.csv', index=False)