In [1]:
import pandas as pd
import numpy as np
import folium
import matplotlib.pyplot as plt
from sklearn.cluster import DBSCAN
from st_dbscan import ST_DBSCAN
from coordinates import convert_to_utm

pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

df = pd.read_csv("Clean_Data")

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1446292 entries, 0 to 1446291
Data columns (total 6 columns):
 #   Column        Non-Null Count    Dtype  
---  ------        --------------    -----  
 0   Taxi ID       1446292 non-null  int64  
 1   Timestamp     1446292 non-null  object 
 2   Speed (km/h)  1446292 non-null  int64  
 3   Distance (m)  1446292 non-null  int64  
 4   Longitude     1446292 non-null  float64
 5   Latitude      1446292 non-null  float64
dtypes: float64(2), int64(3), object(1)
memory usage: 66.2+ MB


In [3]:
df.head()

Unnamed: 0,Taxi ID,Timestamp,Speed (km/h),Distance (m),Longitude,Latitude
0,320,2019-01-01 00:00:03.260905+03,48,54,28.95837,40.23293
1,320,2019-01-01 00:00:07.224554+03,44,48,28.95791,40.23328
2,320,2019-01-01 00:00:11.240787+03,36,39,28.9575,40.23359
3,320,2019-01-01 00:00:15.22731+03,37,41,28.95716,40.23384
4,320,2019-01-01 00:00:19.231051+03,47,52,28.9568,40.2341


In [4]:
df['Timestamp'] = pd.to_datetime(df['Timestamp'])

# Extract the minute part from 'Timestamp'
df['Timestamp'] = df['Timestamp'].dt.floor('T')

In [5]:
def get_middle_row(group):
    middle_index = group.index[len(group) // 2]
    return group.loc[middle_index]

# Group by 'Timestamp' and apply the custom function
grouped_df = df.groupby(["Taxi ID", "Timestamp"]).apply(get_middle_row)

In [6]:
grouped_df = grouped_df.rename_axis(['Taxi_ID_Index', 'Timestamp_Index'])
df_reset = grouped_df.reset_index()
df_reset = df_reset.drop(['Taxi_ID_Index', 'Timestamp_Index'], axis=1)


In [7]:
reduced_df = df_reset[["Timestamp","Longitude", "Latitude"]]
reduced_df2 = reduced_df.copy()

In [8]:
convert_time = convert_to_utm(reduced_df, src_epsg=4326, dst_epsg=32633,
                        col_lat='Latitude', col_lon='Longitude')

Formal definition string for the old projection: proj=longlat datum=WGS84 no_defs ellps=WGS84 towgs84=0,0,0
Formal definition string for the new projection: proj=utm zone=33 datum=WGS84 units=m no_defs ellps=WGS84 towgs84=0,0,0


  x2, y2 = pyproj.transform(old_proj, new_proj, x1, y1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[alias_lon] = x2
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[alias_lat] = y2


In [9]:
convert_time.head()

Unnamed: 0,Timestamp,Longitude,Latitude
0,2019-01-01 00:00:00+03:00,3001412.0,3481096.0
1,2019-01-01 00:01:00+03:00,3001676.0,3480843.0
2,2019-01-01 00:02:00+03:00,3002225.0,3480731.0
3,2019-01-01 00:03:00+03:00,3002212.0,3480716.0
4,2019-01-01 00:07:00+03:00,3002207.0,3480856.0


In [10]:
from sklearn.preprocessing import StandardScaler

we_dont_need_time_ = convert_time.drop(["Timestamp"], axis=1)
scaled_data = StandardScaler().fit_transform(we_dont_need_time_)

scaled_data = pd.DataFrame(scaled_data, columns=["Longitude", "Latitude"], index=convert_time.index)
a = reduced_df2["Timestamp"]
b = scaled_data

brand_new = pd.concat([a, b], axis=1)
# data = reduced_df[:, ]

brand_new['Timestamp'] = pd.to_datetime(brand_new['Timestamp'], format='%Y-%m-%d %H:%M:%S.%f%z')

# Convert the Pandas datetime objects to int64 representation
brand_new['Timestamp'] = brand_new['Timestamp'].astype('int64') / 10**9  # Convert nanoseconds to seconds

In [11]:
st_dbscan3 = ST_DBSCAN(eps1 = 0.01, eps2 = 300, min_samples = 6)
st_dbscan3.fit(brand_new) 


<st_dbscan.ST_DBSCAN at 0x21141bb6e10>

In [12]:
st_dbscan6 = ST_DBSCAN(eps1 = 0.02, eps2 = 300, min_samples = 6)
st_dbscan6.fit(brand_new) 


<st_dbscan.ST_DBSCAN at 0x2113b705f10>

In [13]:
st_dbscan9 = ST_DBSCAN(eps1 = 0.05, eps2 = 300, min_samples = 6) 
st_dbscan9.fit(brand_new) 


<st_dbscan.ST_DBSCAN at 0x21159ac59d0>

In [14]:
st_dbscan12 = ST_DBSCAN(eps1 = 0.1, eps2 = 300, min_samples = 6)
st_dbscan12.fit(brand_new) 

<st_dbscan.ST_DBSCAN at 0x2113b8e2410>

In [15]:
value3 = pd.DataFrame(st_dbscan3.labels, index=reduced_df.index, columns=["Label3"])
value6 = pd.DataFrame(st_dbscan6.labels, index=reduced_df.index, columns=["Label6"])
value9 = pd.DataFrame(st_dbscan9.labels, index=reduced_df.index, columns=["Label9"])
value12 = pd.DataFrame(st_dbscan12.labels, index=reduced_df.index, columns=["Label12"])

sth_new = pd.concat([reduced_df2, value3, value6, value9, value12], axis=1)

label_counts = sth_new['Label3'].value_counts()
mask = (sth_new['Label3'] != -1) & (sth_new['Label3'].map(label_counts) >= 50)
sth_new.loc[mask, 'Label3'] = 'Very Dense'

label_counts = sth_new['Label6'].value_counts()
mask = (sth_new['Label6'] != -1) & (sth_new['Label6'].map(label_counts) >= 50)
sth_new.loc[mask, 'Label6'] = 'Dense'

label_counts = sth_new['Label9'].value_counts()
mask = (sth_new['Label9'] != -1) & (sth_new['Label9'].map(label_counts) >= 50)
sth_new.loc[mask, 'Label9'] = 'Moderate'

label_counts = sth_new['Label12'].value_counts()
mask = (sth_new['Label12'] != -1) & (sth_new['Label12'].map(label_counts) >= 50)
sth_new.loc[mask, 'Label12'] = 'Low Traffic'


In [16]:
conditions = [
    sth_new.apply(lambda row: 'Very Dense' in row.values, axis=1),
    sth_new.apply(lambda row: 'Dense' in row.values, axis=1),
    sth_new.apply(lambda row: 'Moderate' in row.values, axis=1),
    sth_new.apply(lambda row: 'Low Traffic' in row.values, axis=1)
]

# Define the label for each condition
labels = ['Very Dense', 'Dense', 'Moderate', 'Low Traffic']

# Use np.select to apply these conditions and labels to the DataFrame
sth_new['Traffic_Label'] = np.select(conditions, labels, default='No Traffic')

In [17]:
sth_new.drop(["Label3", "Label6", "Label9", "Label12"], axis=1, inplace=True)

In [18]:
sth_new.head()

Unnamed: 0,Timestamp,Longitude,Latitude,Traffic_Label
0,2019-01-01 00:00:00+03:00,28.95318,40.23682,No Traffic
1,2019-01-01 00:01:00+03:00,28.95063,40.23873,No Traffic
2,2019-01-01 00:02:00+03:00,28.94869,40.24359,No Traffic
3,2019-01-01 00:03:00+03:00,28.94859,40.24344,Low Traffic
4,2019-01-01 00:07:00+03:00,28.94974,40.24369,Low Traffic


In [21]:
sth_new.to_csv("Label_Data", index=False)

In [19]:
sth_new.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109393 entries, 0 to 109392
Data columns (total 4 columns):
 #   Column         Non-Null Count   Dtype                    
---  ------         --------------   -----                    
 0   Timestamp      109393 non-null  datetime64[ns, UTC+03:00]
 1   Longitude      109393 non-null  float64                  
 2   Latitude       109393 non-null  float64                  
 3   Traffic_Label  109393 non-null  object                   
dtypes: datetime64[ns, UTC+03:00](1), float64(2), object(1)
memory usage: 3.3+ MB


In [20]:
import plotly.express as px
import plotly.graph_objects as go


color_discrete_map = {
    'No Traffic': '#22CE83',
    'Low Traffic': '#FFFF33',
    'Moderate': '#FFCE44',
    'Dense': '#FF5F1F',
    'Very Dense': '#C11B17'

}

fig = go.Figure()

# Add scatter plots for each category in the desired order
for traffic_label in ['No Traffic', 'Low Traffic', 'Moderate', 'Dense', 'Very Dense']:
    filtered_data = sth_new[sth_new['Traffic_Label'] == traffic_label]
    fig.add_trace(go.Scattermapbox(
        lat=filtered_data['Latitude'],
        lon=filtered_data['Longitude'],
        mode='markers',
        marker=go.scattermapbox.Marker(
            size=5,
            color=color_discrete_map[traffic_label],
            opacity=0.7
        ),
        name=traffic_label
    ))

# Update layout
fig.update_layout(
    mapbox_style="streets",
    mapbox=dict(zoom=10, accesstoken="pk.eyJ1IjoieXNybWhtdGNsayIsImEiOiJjbHBuenJsbTcwbWJ1MmlwamxzMjRzOHo3In0.Mg3VHduKHGIe7B92ODMHzQ",
                center=dict(lat=40.1828, lon=29.0667)),
    showlegend=True
)

# Show the figure
fig.show()