# Data exploration 

### Note

This notebook uses the cleaned data in the folder `data_cleaned`, which is created by the notebook `data_to_database.ipynb`. Please run that notebook first.

In [None]:
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import geopandas as gpd
import time
from shapely import wkt
import datetime
from shapely.geometry import Point
font = {'family' : 'Sans',
        'size'   : 15}
matplotlib.rc('font', **font)

#### Helper functions

In [None]:
def convert_to_timestamp(x):
    """Convert a ``"%Y-%m-%d %H:%M:%S.%f"`` string to seconds since the epoch.

    Missing values are passed through as ``pd.NA``. The parsed (naive) datetime
    is handed to ``time.mktime`` via ``timetuple()``, so it is interpreted in
    the machine's local time zone and the fractional seconds are dropped.
    """
    if not pd.isna(x):
        parsed = datetime.datetime.strptime(x, "%Y-%m-%d %H:%M:%S.%f")
        return time.mktime(parsed.timetuple())
    return pd.NA

def diff_in_hours(ts1, ts2):
    """Return ``ts1 - ts2`` in hours for two timestamp strings.

    Yields ``pd.NA`` as soon as either argument is missing.
    """
    if any(pd.isna(ts) for ts in (ts1, ts2)):
        return pd.NA
    elapsed_seconds = convert_to_timestamp(ts1) - convert_to_timestamp(ts2)
    return elapsed_seconds / 3600

def lon_lat_to_geom(data):
    """Attach a point geometry built from the ``LON`` / ``LAT`` columns.

    A ``"geom"`` column is written into ``data`` (the input frame is modified,
    as before) and the frame is returned wrapped in a GeoDataFrame that uses
    this column as its geometry.
    """
    # shapely points are (x, y); for geographic coordinates that means
    # (longitude, latitude) -- the previous version passed them swapped.
    data["geom"] = [Point(lon, lat) for lon, lat in zip(data["LON"], data["LAT"])]
    return gpd.GeoDataFrame(data, geometry="geom")

def write_geodataframe(gdf, out_path):
    """Write ``gdf`` to ``out_path`` as CSV with the geometry serialized as WKT.

    The index is written as well (``index=True``).
    """
    wkt_geometry = gdf.geometry.apply(wkt.dumps)
    # work on a plain pandas copy so the GeoDataFrame itself stays untouched
    plain = pd.DataFrame(gdf, copy=True)
    plain[gdf.geometry.name] = wkt_geometry
    plain.to_csv(out_path, index=True)
    
def read_geodataframe(in_path, geom_col="geom"):
    """Load a CSV whose ``geom_col`` column holds WKT strings as a GeoDataFrame."""
    table = pd.read_csv(in_path)
    # parse the WKT text back into shapely geometries
    table[geom_col] = table[geom_col].apply(wkt.loads)
    return gpd.GeoDataFrame(table, geometry=geom_col)

In [None]:
def get_weekday(time_string):
    """Return the weekday (0 = Monday ... 6 = Sunday) of a timestamp string.

    Missing values are passed through as ``pd.NA``.
    """
    if not pd.isna(time_string):
        parsed = datetime.datetime.strptime(time_string, "%Y-%m-%d %H:%M:%S.%f")
        return parsed.weekday()
    return pd.NA

def get_hour(time_string):
    """Return the hour of day (0-23) of a timestamp string.

    Missing values are passed through as ``pd.NA``.
    """
    if not pd.isna(time_string):
        parsed = datetime.datetime.strptime(time_string, "%Y-%m-%d %H:%M:%S.%f")
        return parsed.hour
    return pd.NA

In [None]:
reservation = pd.read_csv("data_cleaned/booking.csv", index_col="reservation_no")

### Pre filtering

In [None]:
print(len(reservation))
# Keep only bookings of type "Normal" that are shorter than 500 hours and whose
# state is not "annulliert"; the three conditions are combined into one mask.
is_normal = reservation["reservationtype"] == "Normal"
is_short = reservation["duration_hours"] < 500
not_annulled = reservation["reservationstate"] != "annulliert"
reservation = reservation[is_normal & is_short & not_annulled]
len(reservation)

#### Check whether we need to exclude further reservations

In [None]:
reservation[reservation["duration_hours"]>168]

### Distribution over users

In [None]:
by_user =  reservation.groupby("person_no").agg({"person_no": "count"})

In [None]:
plt.hist(by_user["person_no"].values, bins=100)
plt.yscale("log")
plt.xlim(0, 750)
plt.xlabel("Number of trips")
plt.ylabel("Number of users")
plt.title("Histogram of trips per user")
plt.show()

### Start time and end time distributions

In [None]:
time_before = reservation.apply(lambda x: diff_in_hours(x["drive_firststart"], x["reservationfrom"]), axis=1)

In [None]:
time_before.dropna(inplace=True)

In [None]:
plt.hist(time_before[time_before < 2], bins=20)
# plt.yscale("log")
plt.xlim(-.5, 2)
plt.ylabel("Number of bookings")
plt.xlabel("Hours between reservation start and start of drive")
plt.show()

In [None]:
reservation.columns

In [None]:
# Investigate negative values
temp = reservation.loc[time_before.index]
negatives =  temp[time_before < 0]

In [None]:
len(negatives)

In [None]:
negatives[["reservationfrom", "drive_firststart"]].head(50)

In [None]:
time_after_orig = reservation.apply(lambda x: diff_in_hours(x["reservationto"], x["drive_lastend"]), axis=1)

In [None]:
# preprocess
print("before:", len(time_after_orig))
time_after = time_after_orig.dropna()
# only 5 bookings have a longer gap than 300
time_after = time_after[time_after<300]
"after", len(time_after)

In [None]:
time_after[time_after< 0]

In [None]:
between = time_after[time_after > -2]
between = between[between < 2]
plt.hist(between, bins=100)
plt.yscale("log")
plt.xlim(-2,2)
plt.ylabel("Number of bookings")
plt.xlabel("Hours between end of drive and reservation end")
plt.show()

In [None]:
sum(time_after< 0)

In [None]:
wrong_lastend = reservation.loc[time_after[time_after<-100].index]

In [None]:
wrong_lastend[["drive_lastend", "reservationto"]]

## Weekday and time distribution

In [None]:
for col in ["drive_firststart", "reservationfrom", "drive_lastend", "reservationto"]:
    reservation[col+"_weekday"] = reservation[col].apply(get_weekday)

In [None]:
# One bar chart per timestamp column: number of bookings per weekday
# (0 = Monday ... 6 = Sunday, as returned by datetime.weekday()).
for col in ["drive_firststart", "reservationfrom", "drive_lastend", "reservationto"]:
    # Pass the values explicitly as `x=` and pin the category order, so that bar
    # position k always means weekday k -- otherwise the tick relabelling below
    # mislabels the bars whenever a weekday is absent or the order differs.
    sns.countplot(x=reservation[col + "_weekday"].dropna(), order=list(range(7)))
    plt.xticks(np.arange(7), np.arange(7))
    plt.show()

### Daily time

In [None]:
# For each timestamp column: derive the hour of day and plot bookings per hour.
for col in ["drive_firststart", "reservationfrom", "drive_lastend", "reservationto"]:
    reservation[col + "_hour"] = reservation[col].apply(get_hour)
    plt.figure(figsize=(15, 6))
    # `x=` plus a fixed `order` guarantees that bar position k is hour k, so the
    # tick labels set below cannot get out of sync with the bars.
    sns.countplot(x=reservation[col + "_hour"].dropna(), order=list(range(24)))
    plt.xticks(np.arange(24), np.arange(24))
    plt.show()

#### Drive firststart looks weird, investigate in more detail:

In [None]:
# Were the NaN values the reason for the odd-looking plot?
for col in ["drive_firststart", "drive_lastend"]:
    if col == "drive_lastend":
        reservation[col + "_hour"] = reservation[col].apply(get_hour)
    plt.figure(figsize=(15, 6))
    # The reason was the ordering of the x ticks in the first version: the bars
    # were relabelled 0..23 without fixing the bar order. Pinning `order` makes
    # bar position k correspond to hour k.
    sns.countplot(x=reservation[col + "_hour"].dropna(), order=list(range(24)))
    plt.xticks(np.arange(24), np.arange(24))
    plt.show()

### Weekdays

In [None]:
weekday_cols = reservation["reservationfrom_weekday"]

In [None]:
# Reservation start hour, split into working days (weekday 0-4) and weekend (5-6).
# `x=` and a fixed `order` keep bar position k aligned with hour k.
hour_order = list(range(24))

# during the week
plt.figure(figsize=(8,4))
this_day = reservation[reservation["reservationfrom_weekday"] <5]
sns.countplot(x=this_day["reservationfrom_hour"].dropna(), order=hour_order)
plt.xticks(np.arange(24), np.arange(24))
plt.show()
# weekend
plt.figure(figsize=(8,4))
this_day = reservation[reservation["reservationfrom_weekday"] >4]
sns.countplot(x=this_day["reservationfrom_hour"].dropna(), order=hour_order)
plt.xticks(np.arange(24), np.arange(24))
plt.show()

In [None]:
# Hour of the first drive start, one plot per weekday of the reservation start.
for day in range(7):
    plt.figure(figsize=(8,4))
    this_day = reservation[reservation["reservationfrom_weekday"] == day]
    # `x=` and a fixed `order` keep bar position k aligned with hour k even if
    # some hours never occur on this weekday.
    sns.countplot(x=this_day["drive_firststart_hour"].dropna(), order=list(range(24)))
    plt.xticks(np.arange(24), np.arange(24))
    plt.show()

### Distribution of bookings over the year

In [None]:
def date_to_time(time_str):
    """Encode the date part of a timestamp string as the integer ``YYYYMMDD``.

    Missing values are passed through as ``pd.NA``.
    """
    # The original author's note called this helper a poor idea; it is kept
    # unchanged in behavior.
    if pd.isna(time_str):
        return pd.NA
    parsed = datetime.datetime.strptime(time_str, "%Y-%m-%d %H:%M:%S.%f")
    year_part = parsed.year * 10000
    month_part = parsed.month * 100
    return year_part + month_part + parsed.day

In [None]:
all_days = [d.split(" ")[0] for d in reservation["reservationfrom"].values if int(d[:4])>2018]

In [None]:
uni_days = np.unique(all_days)
mapping = {u: i for i, u in enumerate(uni_days)}

In [None]:
# Check that we have all days: print every day in `uni_days` whose predecessor is
# not the previous calendar day. Comparing full dates (instead of only the
# day-of-month) avoids flagging every first-of-month as a gap.
previous_day = None
for d in uni_days:
    current_day = datetime.datetime.strptime(d, "%Y-%m-%d").date()
    if previous_day is not None and current_day - previous_day != datetime.timedelta(days=1):
        print(d)
    previous_day = current_day

In [None]:
# map
days_converted = reservation["reservationfrom"].apply(lambda x: mapping[x.split(" ")[0]] if int(x[:4])>2018 else pd.NA)

In [None]:
plt.figure(figsize=(20,8))
week_bins =  (np.arange(len(uni_days) / 7) * 7).astype(int)
plt.hist(days_converted.dropna(), bins=week_bins)
two_week_bins =  (np.arange(len(uni_days) / 14) * 14).astype(int)
plt.xticks(two_week_bins, uni_days[two_week_bins], rotation=90)
plt.show()

#### With distinction of energytype

In [None]:
reservation["reservation_from_day_converted"] = days_converted

In [None]:
reservation.columns

In [None]:
import seaborn as sns
# Stacked histogram of bookings over time, split by energy type.
f = plt.figure(figsize=(10,10))
ax = f.add_subplot(1,1,1)
sns.histplot(data=reservation, ax=ax, stat="count", multiple="stack",
             x="reservation_from_day_converted", kde=False,
             palette="pastel", hue="energytypegroup",
             element="bars", legend=True)
ax.set_title("Daily bookings by car type (Benzin / Electro)")
# The x axis is the day index of the reservation start (see `mapping`), and the
# bar height is the number of bookings -- the labels previously said otherwise.
ax.set_xlabel("Day index of reservation start")
ax.set_ylabel("Number of bookings")


In [None]:
plt.figure(figsize=(20,8))
week_bins =  (np.arange(len(uni_days) / 7) * 7).astype(int)
plt.hist(days_converted.dropna(), bins=week_bins)
two_week_bins =  (np.arange(len(uni_days) / 14) * 14).astype(int)
plt.xticks(two_week_bins, uni_days[two_week_bins], rotation=90)
plt.show()

## By stations

In [None]:
reservation.columns

In [None]:
station = read_geodataframe("data_cleaned/station.csv")
station = station[station["lat"] > 0]
station = station[station["lon"] > 0]

In [None]:
plt.figure(figsize=(15,8))
plt.scatter(station["lon"].values, station["lat"].values, marker="o", c="orange")

In [None]:
bookings_per_station = reservation.groupby("start_station_no").agg({"start_station_no": "count"})

In [None]:
ev_bookings_per_station = reservation[reservation["energytypegroup"]=="Electro"].groupby("start_station_no").agg({"start_station_no": "count"})


In [None]:
plt.hist(bookings_per_station, bins=100)
plt.yscale("log")
plt.xlabel("Number of bookings")
plt.ylabel("Stations")
plt.show()

In [None]:
print(np.mean(bookings_per_station), np.std(bookings_per_station))

In [None]:
bookings_per_station.rename(columns={"start_station_no": "booking_count"}, inplace=True)

In [None]:
stations_w_bookings = bookings_per_station.merge(station, left_on="start_station_no", right_on="station_no").set_index("station_no")


In [None]:
ev_bookings_per_station= ev_bookings_per_station.rename(columns={"start_station_no":"booking_count"})

In [None]:
stations_w_ev_bookings = ev_bookings_per_station.merge(station, left_on="start_station_no", right_on="station_no").set_index("station_no")


In [None]:
# Test the result
sum(reservation["start_station_no"] == 1006)

In [None]:
# Try to plot the stations on a map.
# (These imports repeat names that are already imported at the top of the notebook.)
import pandas as pd
from shapely.geometry import Point
import geopandas as gpd
from geopandas import GeoDataFrame
# This is a simple world map that ships with geopandas.
# NOTE(review): `gpd.datasets` was deprecated in geopandas 0.14 and removed in 1.0 --
# confirm against the installed version; a local Natural Earth file may be needed.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
station.plot(ax=world.plot(figsize=(10, 6)), marker='o', color='red', markersize=15);
# Restrict the view to latitude 40-60 and longitude 0-15.
plt.ylim(40,60)
plt.xlim(0, 15)

In [None]:
plt.figure(figsize=(15,8))
plt.scatter(stations_w_bookings["lon"], stations_w_bookings["lat"], s=stations_w_bookings["booking_count"]/50)
plt.scatter(stations_w_ev_bookings["lon"], stations_w_ev_bookings["lat"], s=stations_w_ev_bookings["booking_count"]/50)


#### EVS

In [None]:
plt.figure(figsize=(15,8))
plt.scatter(stations_w_bookings["lon"], stations_w_bookings["lat"])
plt.scatter(stations_w_ev_bookings["lon"], stations_w_ev_bookings["lat"])

#### Stats for booking count by station

In [None]:
sorted_by_booking_count = stations_w_bookings.sort_values(by="booking_count", ascending=False)
sorted_by_booking_count.head(20)

In [None]:
for i in range(10):
    print(np.quantile(sorted_by_booking_count["booking_count"].values, (i+1) / 10))

In [None]:
sorted_by_booking_count.iloc[:20]

In [None]:
np.median(sorted_by_booking_count["booking_count"].values)

### Free floating

In [None]:
free_floating = reservation[reservation["tripmode"] == "FreeFloating (Rückgabe an einem beliebigen Ort)"]

In [None]:
free_floating.merge(station, left_on="start_station_no", right_on = "station_no")

### Cars

In [None]:
# For every vehicle, count the distinct stations its reservations start from.
nr_stations_per_car = [
    len(np.unique(car_df["start_station_no"].values))
    for _, car_df in reservation.groupby("vehicle_no")
]


In [None]:
uni, counts = np.unique(nr_stations_per_car, return_counts=True)


In [None]:
# Share of cars (in percent) that are used from at most 1, 2 and 3 stations.
# Computed directly from the per-car counts so it stays correct even if some
# station count (e.g. exactly 2) never occurs, and scaled by 100 to match the
# "%" in the message.
stations_per_car = np.asarray(nr_stations_per_car)
print(100 * np.mean(stations_per_car <= 1), "% cars are only at one station")
print(100 * np.mean(stations_per_car <= 2), "% cars are only used from two stations")
print(100 * np.mean(stations_per_car <= 3), "% cars are only used from three stations")


## Flexibilities

How many hours do vehicles stand idle, how many hours are they reserved, and how many hours are they actually in use?

In [None]:
res_duration_h = (reservation["reservationto"].apply(convert_to_timestamp) - reservation["reservationfrom"].apply(convert_to_timestamp)) / 3600

In [None]:
drive_duration_h = (reservation["drive_lastend"].apply(convert_to_timestamp) - reservation["drive_firststart"].apply(convert_to_timestamp)) / 3600


In [None]:
reservation["res_duration"] = res_duration_h
reservation["drive_duration"] = drive_duration_h

In [None]:
total_hours = len(mapping)*24

In [None]:
grouped_by_car = reservation.groupby("vehicle_no").agg({"res_duration":"sum", "drive_duration":"sum"}) # , "energytypegroup":"first"})


In [None]:
grouped_by_car

#### Plot histograms of use per car

In [None]:
plt.hist(grouped_by_car["res_duration"] / len(mapping), bins=1000)
# plt.yscale("log")
plt.xlabel("Hours in use - per day (reservation)")
plt.ylabel("Number of vehicles")
plt.xlim(0, 24)
plt.show()

In [None]:
plt.hist(grouped_by_car["drive_duration"] / len(mapping), bins=1000)
# plt.yscale("log")
plt.xlabel("Hours in use - per day (drive)")
plt.ylabel("Number of vehicles")
plt.xlim(0, 24)
plt.show()

In [None]:
for i in range(1,6):
    car_use_per_day = grouped_by_car["res_duration"] / len(mapping)
    print(sum(car_use_per_day < i) / len(car_use_per_day), "are used less than", i, "hours per day")

In [None]:
melted_for_plot = grouped_by_car.melt()
melted_for_plot["hours_per_day"] = melted_for_plot["value"] / len(mapping)

In [None]:
melted_for_plot = grouped_by_car.melt()
melted_for_plot["hours_per_day"] = melted_for_plot["value"] / len(mapping)
sns.violinplot(x="variable", y="hours_per_day", data=melted_for_plot)
plt.ylim(0, 24)

#### How many days are they actually in use?

In [None]:
def convert_to_day(x):
    """Map a timestamp string to its day index in ``mapping``.

    Returns ``pd.NA`` for dates before 2019 and for dates from August 2020 on.
    """
    year = int(x[:4])
    from_august_2020 = year == 2020 and int(x[5:7]) >= 8
    if year > 2018 and not from_august_2020:
        return mapping[x.split(" ")[0]]
    return pd.NA


# get days
reservation["reservationfrom_day"] = reservation["reservationfrom"].apply(convert_to_day)
reservation["reservationto_day"] = reservation["reservationto"].apply(convert_to_day)

In [None]:
def get_list(x):
    """Return the day indices covered by one reservation row (both ends inclusive).

    Rows with a missing start or end day index yield an empty list.
    """
    if pd.isna(x["reservationfrom_day"]) or pd.isna(x["reservationto_day"]):
        return []
    return list(np.arange(x["reservationfrom_day"], x["reservationto_day"] + 1, 1))


# Number of distinct days on which each vehicle has at least one reservation.
veh_days_dict = {}
for veh, veh_df in reservation.groupby("vehicle_no"):
    # Keep the per-row day lists in a local Series instead of writing a new
    # column into the groupby sub-frame (which can trigger SettingWithCopyWarning
    # and is not needed afterwards).
    day_lists = veh_df.apply(get_list, axis=1)
    veh_days_dict[veh] = len({day for day_list in day_lists for day in day_list})

In [None]:
temp = [[key, val] for key, val in veh_days_dict.items()]
veh_df = pd.DataFrame(temp, columns=["vehicle_no", "no_days"]).set_index("vehicle_no")

In [None]:
plt.figure(figsize=(10,8))
plt.hist(veh_df/len(mapping), bins=100)
plt.ylabel("Number of vehicles")
plt.xlabel("Ratio of days that the vehicle is in use")
plt.show()

In [None]:
car_val = veh_df["no_days"].values / len(mapping)
for i in range(1,6):
    print(sum(car_val < i/10) / len(car_val), "are used less than", i*10, "% of all days")

## When are the flexibilities in a day?

In [None]:
valid_reservations = reservation[pd.isna(reservation["canceldate"])]

In [None]:
# Flat hour index for every (day, hour) pair, numbered day by day: day * 24 + hour.
mapping_hours = {
    (day, hour): day * 24 + hour
    for day in range(len(mapping))
    for hour in range(24)
}
# total number of hour slots (same final value the running counter had)
counter = len(mapping_hours)

In [None]:
# Count, for every hour slot of the observed period, how many reservations cover it.
# One slot per (day, hour); the slot index comes from `mapping_hours`.
count_bookings = np.zeros(len(mapping)*24)
for i, row in valid_reservations.iterrows():
    # skip rows without a reservation start or end
    if pd.isna(row["reservationfrom"]) or pd.isna(row["reservationto"]):
        continue

    # timestamps are strings: characters 0-9 are the date, 11-12 the hour
    date_start = row["reservationfrom"][:10]
    date_end = row["reservationto"][:10]
    hour_start = int(row["reservationfrom"][11:13])
    hour_end = int(row["reservationto"][11:13])
    
    # reservations starting before 2019 are clipped to the first slot
    if int(date_start[:4]) < 2019:
        start_ind = 0
    else:
        start_ind = mapping_hours[(mapping[date_start], hour_start)]
    # reservations ending in August 2020 or later (same year) are clipped to the
    # last slot; the clipped rows are printed for inspection
    if int(date_end[:4]) == 2020 and int(date_end[5:7]) >= 8:
        end_ind = len(mapping_hours) - 1
        print(date_start, date_end, hour_end)
    else:
        # NOTE(review): raises KeyError if the end date is not a key of `mapping`
        # (e.g. a day on which no reservation starts) -- confirm this cannot occur
        end_ind = mapping_hours[(mapping[date_end], hour_end)]
    
    
    if end_ind < start_ind:
        print("end before start!")
        continue
        
    # the end hour is included in the counted range
    count_bookings[start_ind:end_ind+1] += 1

In [None]:
count_bookings_per_day = np.reshape(count_bookings, (len(mapping), 24))

In [None]:
plt.figure(figsize=(20,5))
plt.plot(count_bookings)
plt.show()

In [None]:
np.mean(count_bookings_per_day) / len(np.unique(reservation["vehicle_no"].values)), len(np.unique(reservation["vehicle_no"].values))


In [None]:
plt.figure(figsize=(8,5))
plt.plot(np.mean(count_bookings_per_day, axis=0))
plt.title("Usage by hour")

## Number of used vehicles per month

In [None]:
# Per-month statistics for 2019-01 .. 2020-07:
#   - number of distinct vehicles and start stations used,
#   - number of EV bookings and distinct EVs,
#   - distinct vehicles per start station (one column per month, merged into
#     `distinct_vehicles_at_station_ever`).
# NOTE(review): relies on a "reservation_from_ts" column that is not created in
# this notebook -- presumably it comes from booking.csv; confirm.
unique_vehicles_per_month, unique_stations_per_month = [], []
ev_bookings, ev_vehicles = [], []

distinct = lambda x: len(np.unique(x))
distinct_vehicles_at_station_ever = reservation.groupby("start_station_no").agg({"vehicle_no":distinct})

years_and_months = []
for year in [2019, 2020]:
    for month_int in range(12):
        # data is only evaluated up to and including July 2020
        if year == 2020 and month_int>6:
            break
        month_start = "%02d" %(month_int+1)
        month_ts_start = convert_to_timestamp(f'{year}-{month_start}-01 00:00:00.000')
        # end of the month = first instant of the following month
        if month_int < 11:
            month = "%02d" %(month_int+2)
            month_ts_end = convert_to_timestamp(f'{year}-{month}-01 00:00:00.000')
        else:
            month_ts_end = convert_to_timestamp(f'{year+1}-01-01 00:00:00.000')

        cond_smaller = reservation["reservation_from_ts"] < month_ts_end
        cond_bigger = reservation["reservation_from_ts"] >= month_ts_start
        month_df = reservation[cond_smaller & cond_bigger]

        # Test: print(month_df["reservationfrom"].values)
        num_unique_vehicles = np.unique(month_df["vehicle_no"].values)
        unique_vehicles_per_month.append(len(num_unique_vehicles))
        unique_stations_per_month.append(len(np.unique(month_df["start_station_no"].values)))

        # Number of bookings and number of vehicles EV
        only_ev = month_df[month_df["energytypegroup"] == "Electro"]
        ev_bookings.append(len(only_ev))
        ev_vehicles.append(len(np.unique(only_ev["vehicle_no"])))
        # print(only_ev[["reservationfrom", "energytypegroup"])

        # cars per station
        distinct_vehicles_at_station = month_df.groupby("start_station_no").agg({"vehicle_no":distinct})
        # The suffix contains year AND month: a bare month number repeats in 2020
        # and would create duplicate column names in the merged table.
        distinct_vehicles_at_station_ever = pd.merge(
            distinct_vehicles_at_station_ever, distinct_vehicles_at_station, how="outer", left_index=True, right_index=True, suffixes=("", f"_{year}-{month_start}")
        )

        years_and_months.append(f"{year}-{month_start}")

In [None]:
# 2x2 overview of the monthly series. The tick positions are derived from
# `years_and_months` instead of a hard-coded 19, so the plot keeps working if the
# evaluated period changes.
monthly_series = [
    (unique_vehicles_per_month, "Number of vehicles used in each month"),
    (unique_stations_per_month, "Number of stations used in each month"),
    (ev_bookings, "Bookings of EVs"),
    (ev_vehicles, "Electric vehicles used in each month"),
]
month_ticks = np.arange(len(years_and_months))

plt.figure(figsize=(15,9))
for panel, (values, title) in enumerate(monthly_series, start=1):
    plt.subplot(2, 2, panel)
    plt.plot(values)
    plt.xticks(month_ticks, years_and_months, rotation=90)
    plt.title(title)
plt.tight_layout()

## First appearance of each EV in the data

In [None]:
only_ev = reservation[reservation["energytypegroup"] == "Electro"]

In [None]:
only_ev_grouped = only_ev.groupby("vehicle_no").agg({"reservationfrom":"min", "reservation_from_ts":"min"})

In [None]:
# Cumulative number of EVs whose first reservation starts before the end of each
# month (2019-01 .. 2020-07).
new_per_month = []
for year in [2019, 2020]:
    for month_int in range(12):
        if year == 2020 and month_int > 6:
            break
        month_start = "%02d" % (month_int + 1)
        month_ts_start = convert_to_timestamp(f'{year}-{month_start}-01 00:00:00.000')
        # upper bound: first instant of the following month (December rolls over
        # to January of the next year)
        if month_int >= 11:
            month_ts_end = convert_to_timestamp(f'{year+1}-01-01 00:00:00.000')
        else:
            month = "%02d" % (month_int + 2)
            month_ts_end = convert_to_timestamp(f'{year}-{month}-01 00:00:00.000')

        cond_smaller = only_ev_grouped["reservation_from_ts"] < month_ts_end
        month_df = only_ev_grouped[cond_smaller]
        new_per_month.append(len(month_df))

In [None]:
# Cumulative EV count per month; tick positions follow the number of month labels
# instead of a hard-coded 19.
plt.plot(new_per_month)
plt.xticks(np.arange(len(years_and_months)), years_and_months, rotation=90)
plt.title("EVs appearing up to this month")

## How many cars per station

In [None]:
# Count the distinct vehicles per start station over the whole dataset.
# NOTE(review): this reassigns `distinct_vehicles_at_station_ever`, which the
# monthly loop earlier in the notebook extends with one column per month. When
# the cells run top to bottom, the later "median of monthly number of vehicles"
# is therefore taken over this single column only -- confirm whether that is intended.
distinct = lambda x: len(np.unique(x))
distinct_vehicles_at_station_ever = reservation.groupby("start_station_no").agg({"vehicle_no":distinct})

In [None]:
# Share of stations from which at most 1, 2, ..., 5 distinct vehicles were booked.
# The message now says what is computed: stations (not vehicles), "at most" (<=).
for i in range(5):
    print(f"Ratio of stations with at most {i+1} cars:", sum(distinct_vehicles_at_station_ever["vehicle_no"] <= i+1) / len(distinct_vehicles_at_station_ever))
    

In [None]:
# Auswertung aus der oberen for loop
distinct_vehicles_at_station_ever

In [None]:
median_veh_per_station = distinct_vehicles_at_station_ever.median(axis=1)
plt.hist(median_veh_per_station[median_veh_per_station<20])
# plt.yscale("log")
plt.ylabel("Number of station")
plt.xlabel("Vehicle count")
plt.title("Median of monthly number of vehicles per station")

In [None]:
for i in range(5):
    print(f"Ratio of stations with {i+1} cars:", round(sum(median_veh_per_station == i+1) / len(median_veh_per_station), 2))
    

## Driven km by energytypegroup

In [None]:
# Keep reservations with a driven distance strictly between 0 and 400 km.
drive_km = reservation["drive_km"]
plausible_km = (drive_km > 0) & (np.absolute(drive_km) < 400)
res_filtered = reservation[plausible_km]
print(min(res_filtered["drive_km"]), max(res_filtered["drive_km"]))

In [None]:
plt.figure(figsize=(15, 5))
sns.kdeplot(data=res_filtered, x="drive_km", hue="energytypegroup", common_norm=False)

In [None]:
reservation[reservation["drive_km"]<0][["reservationtype", "reservationstate", "tripmode", "drive_km", "canceldate"]]

In [None]:
negative_drive_km = reservation[reservation["drive_km"]<0].reset_index()[
    ["reservation_no", "drive_km","syscreatedate","reservationfrom", "reservationto", "canceldate"]
]
negative_drive_km.to_csv("data/examples_for_questions/negative_drive_km.csv")

In [None]:
res_filtered.groupby("energytypegroup").agg({"drive_km":["mean", "median", "std"]})

### Problem: Stations have noisy lon and lat --> create example for Andreas to check

In [None]:
data_bookings = pd.read_csv("data/20211213_ethz_reservation/20211213_ethz_reservation_20190101_20190131.tsv", sep="\t")


In [None]:
start_cols = ['BASESTART_NO', 'BASESTART_NAME', 'BASESTART_LAT', 'BASESTART_LON']
end_cols = ['BASEEND_NO', 'BASEEND_NAME','BASEEND_LAT', 'BASEEND_LON']

# Stack the start- and end-station columns under common names (the part after the
# last underscore: NO, NAME, LAT, LON).
rename_dict = {c: c.split("_")[-1] for c in start_cols}
start_stations = data_bookings[start_cols].rename(columns=rename_dict)
rename_dict = {c: c.split("_")[-1] for c in end_cols}
add_end_stations = data_bookings[end_cols].rename(columns=rename_dict)
data_stations = pd.concat([start_stations, add_end_stations])

In [None]:
# A station counts as noisy if its coordinates, rounded to two decimals, take
# more than one value. Only station 1048 is exported as an example.
for no, df in data_stations.groupby("NO"):
    rounded_lat = df["LAT"].apply(lambda v: round(v, 2))
    rounded_lon = df["LON"].apply(lambda v: round(v, 2))
    n_lat = len(np.unique(rounded_lat.values))
    n_lon = len(np.unique(rounded_lon.values))
    is_noisy = n_lat > 1 or n_lon > 1
    if is_noisy and no == 1048:
        df.to_csv("data/examples_for_questions/station_lon_lat.csv")