In [None]:
# Our data set contains 181 folders full of plt files. Each plt file has information from devices or apps that are recording location information.
# Data Visualization
import os
import pandas as pd
import matplotlib.pyplot as plt

# Example of a single line in my dataset: 40.008304,116.319876,0,492,39745.0902662037,2008-10-24,02:09:59
def read_location_information_file(file_path):
    """Load one trajectory file into a DataFrame.

    The first six lines of the file are skipped and the remaining
    comma-separated rows are parsed without a header row, using a fixed
    list of seven column names.

    Args:
        file_path: Path to the file to read.

    Returns:
        A DataFrame restricted to the columns ``latitude``, ``longitude``,
        ``altitude``, ``date`` and ``time`` (in that order).
    """
    # Names for all seven fields of a row, in file order.
    all_fields = ['latitude', 'longitude', '0', 'altitude', 'num_days', 'date', 'time']
    # Subset handed back to the caller; '0' and 'num_days' are dropped.
    kept_fields = ['latitude', 'longitude', 'altitude', 'date', 'time']

    frame = pd.read_csv(
        file_path,
        sep=',',
        header=None,
        names=all_fields,
        skiprows=6,
    )
    return frame[kept_fields]


def trajectory_visualization(df_trajectory, title="Trajectory"):
    """Draw a trajectory as a connected line of points and display it.

    Args:
        df_trajectory: Object indexable by ``'longitude'`` and
            ``'latitude'``; longitude goes on the x axis and latitude on
            the y axis.
        title: Text shown above the plot.
    """
    plt.figure(figsize=(10, 6))

    x_values = df_trajectory['longitude']
    y_values = df_trajectory['latitude']
    # Blue solid line with a circle marker at every recorded point.
    plt.plot(x_values, y_values, color='b', linestyle='-', marker='o')

    plt.title(title)
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.grid(True)

    plt.show()

dataset_path = 'C:/Users/mvane/OneDrive/Desktop/IBD/Geolife Trajectories 1.3/Data'

# Walk every folder under the dataset root and load the first .plt file found
# in its 'Trajectory' subfolder.
for dataset_folder in os.listdir(dataset_path):
    data_path = os.path.join(dataset_path, dataset_folder)
    trajectory_path = os.path.join(data_path, 'Trajectory')

    # Skip stray files at the root and folders without a 'Trajectory'
    # subfolder. The file scan below must only run for a path that was
    # validated in this iteration, never for a value left over from a
    # previous one.
    if not os.path.isdir(trajectory_path):
        continue

    for trajectory_file in os.listdir(trajectory_path):
        plt_file = os.path.join(trajectory_path, trajectory_file)
        if trajectory_file.endswith('.plt') and os.path.isfile(plt_file):
            trajectory_data = read_location_information_file(plt_file)
            # Uncomment to plot the loaded trajectory:
            # trajectory_visualization(trajectory_data, "Trajectory")
            # Only the first .plt file of each folder is loaded.
            break
          

    

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt

# Data Preprocessing
# Root folder holding the raw per-folder trajectory data.
dataset_path = 'C:\\Users\\mvane\\OneDrive\\Desktop\\IBD\\Geolife Trajectories 1.3/Data'

# Root folder that receives the cleaned copies.
out_file_directory = 'C:\\Users\\mvane\\OneDrive\\Desktop\\IBD\\Geolife Trajectories 1.3/CleanedData'

# Create the output directory if it doesn't exist. exist_ok=True avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs(out_file_directory, exist_ok=True)

# Dealing with abnormal coordinates
def handle_outliers(df, alpha=1.5):
    """Drop rows whose latitude or longitude is an IQR outlier.

    For each of the two coordinate columns the interquartile range
    ``IQR = Q3 - Q1`` is computed, and a value counts as an outlier when it
    lies below ``Q1 - alpha * IQR`` or above ``Q3 + alpha * IQR``. A row is
    removed if either coordinate is an outlier.

    Rows are filtered with a row-level boolean mask, so the dtypes of the
    remaining columns are preserved and the index does not need to be
    unique.

    Args:
        df: DataFrame with numeric ``latitude`` and ``longitude`` columns.
        alpha: Multiplier applied to the IQR to build the fences. Defaults
            to 1.5.

    Returns:
        A new DataFrame without the outlier rows. Rows that contain any
        missing value are dropped as well.
    """

    def _iqr_bounds(values):
        # Lower and upper fence for one coordinate column.
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        return q1 - alpha * iqr, q3 + alpha * iqr

    latitude = df['latitude']
    longitude = df['longitude']
    lower_bound_lat, upper_bound_lat = _iqr_bounds(latitude)
    lower_bound_lon, upper_bound_lon = _iqr_bounds(longitude)

    # One boolean per row: True when either coordinate is outside its fences.
    is_outlier = (
        (latitude < lower_bound_lat) | (latitude > upper_bound_lat) |
        (longitude < lower_bound_lon) | (longitude > upper_bound_lon)
    )

    # Keep the non-outlier rows. dropna() is retained so rows with missing
    # values are still removed, as callers of the previous version expect.
    return df[~is_outlier].dropna()

    

# Column names for one row of a .plt file; the same for every file, so the
# list is defined once instead of on every loop iteration.
columns = ['latitude', 'longitude', '0', 'altitude', 'num_days', 'date', 'time']

# Clean every .plt file under <dataset_path>/<folder>/Trajectory and write the
# result to <out_file_directory>/<folder>/<name>_cleaned.plt.
for dataset_folder in os.listdir(dataset_path):
    data_path = os.path.join(dataset_path, dataset_folder)
    trajectory_path = os.path.join(data_path, 'Trajectory')

    # Skip stray files and folders that have no 'Trajectory' subfolder
    # instead of failing in os.listdir().
    if not os.path.isdir(trajectory_path):
        continue

    new_dir = os.path.join(out_file_directory, dataset_folder)

    for trajectory_file in os.listdir(trajectory_path):
        plt_file = os.path.join(trajectory_path, trajectory_file)
        if not (trajectory_file.endswith('.plt') and os.path.isfile(plt_file)):
            continue

        # Created lazily so that folders without any .plt file do not get
        # an empty output directory.
        os.makedirs(new_dir, exist_ok=True)

        # Only the trailing extension is rewritten; str.replace() would
        # also change a '.plt' occurring earlier in the file name.
        file_stem, _ = os.path.splitext(trajectory_file)
        cleaned_file_name = f"{file_stem}_cleaned.plt"
        out_dir = os.path.join(new_dir, cleaned_file_name)

        df = pd.read_csv(plt_file, skiprows=6, header=None, names=columns, delimiter=',')

        # Drop duplicates
        df = df.drop_duplicates()

        # Remove missing data
        df = df.dropna()

        cleaned_data = handle_outliers(df)
        cleaned_data.to_csv(out_dir, index=False)

