<a href="https://colab.research.google.com/github/mohammadbadi/CrimeAnalytics_Clustering/blob/main/Code%20Sections/5.0%20Loading%20Libraries%20and%20Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


### **5.0 Loading Libraries and Major Crime Indicator Dataset from TPS**

In [None]:
                                                                                  # Import necessary libraries
import itertools
import os
import pandas as pd
import time
import kagglehub
import warnings
from IPython.display import display, HTML
from kagglehub import KaggleDatasetAdapter
from google.colab import files

# Install the optional packages needed later: openpyxl (Excel support) and
# tabulate (cleaner table output). Each command runs quietly through the shell.
for install_cmd in (
    'pip install openpyxl -qqq',
    'pip install tabulate -qqq',
):
    os.system(install_cmd)

# Silence deprecation and future warnings so the notebook output stays readable.
for ignored_category in (DeprecationWarning, FutureWarning):
    warnings.filterwarnings("ignore", category=ignored_category)

# Name (with extension) of the file to pull out of the Kaggle dataset.
file_path = "major-crime-indicators.csv"

# Load the latest version of the dataset from Kaggle through the pandas adapter.
crime_df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "mohammadbadi/crimes-in-toronto",  # dataset handle on Kaggle
    file_path,
)

def format_message(message):
    """Return *message* wrapped in a bold, padded, 18px HTML ``<div>``.

    The result starts and ends with a newline; the message sits on its own
    line, indented by four spaces, between the opening and closing tags.
    """
    opening_tag = '<div style="font-size: 18px; color: #333; font-weight: bold; padding: 10px;">'
    # Empty first/last items produce the leading and trailing newlines.
    return "\n".join(["", opening_tag, f"    {message}", "</div>", ""])

# Confirm the load to the user. The file name comes from file_path so the
# message cannot drift from the file that was actually requested from Kaggle.
load_message = format_message(
    f"Dataset <span style='color: blue;'>{file_path}</span> by <span style='color: slategray;'>Mohammad Badi</span> from Kaggle website is <span style='color: green;'>Successfully</span> loaded!"
)
display(HTML(load_message))

# Keep a local CSV copy of the loaded dataset under the same file name.
crime_df.to_csv(file_path, index=False)

# Confirm that the local copy has been written.
save_message = format_message(
    "Dataset saved in <span style='color: blue;'>current workspace</span> <span style='color: green;'>Successfully!</span>"
)
display(HTML(save_message))

# Second name for the same DataFrame object (an alias, not a copy).
major_crime_df = crime_df
def save_data(data_df, filename_base):
    """Write *data_df* to ``<filename_base>.csv`` and ``<filename_base>.xlsx``.

    Both files are written without the index column. After each write an
    HTML confirmation naming the file is displayed. The Excel file is
    written with the openpyxl engine.
    """
    # CSV export, followed by its confirmation message.
    csv_path = f"{filename_base}.csv"
    data_df.to_csv(csv_path, index=False)
    display(HTML(format_message(f"Data saved as CSV: <span style='color: blue;'>{csv_path}</span>")))

    # Excel export, followed by its confirmation message.
    xlsx_path = f"{filename_base}.xlsx"
    data_df.to_excel(xlsx_path, index=False, engine='openpyxl')
    display(HTML(format_message(f"Data saved as Excel: <span style='color: blue;'>{xlsx_path}</span>")))

# Write the dataset as 'Checking_Load_Time.csv' and 'Checking_Load_Time.xlsx' for the read-time comparison
save_data(data_df=major_crime_df, filename_base="Checking_Load_Time")

def measure_read_time(file_path, file_type):
    """Return how many seconds pandas takes to read *file_path*.

    Args:
        file_path: Path of the file to read.
        file_type: ``"csv"`` to read with ``pd.read_csv`` or ``"excel"`` to
            read with ``pd.read_excel``.

    Returns:
        Elapsed time in seconds as a float. The loaded data is discarded.

    Raises:
        ValueError: If *file_type* is neither ``"csv"`` nor ``"excel"``.
            Previously an unknown type read nothing and reported a
            meaningless near-zero time.
    """
    readers = {"csv": pd.read_csv, "excel": pd.read_excel}
    try:
        reader = readers[file_type]
    except KeyError:
        raise ValueError(
            f"Unsupported file_type {file_type!r}; expected 'csv' or 'excel'."
        ) from None

    # perf_counter is monotonic and high-resolution, so the measured interval
    # is not distorted by system clock adjustments the way time.time() can be.
    start_time = time.perf_counter()
    reader(file_path)
    return time.perf_counter() - start_time

# Time how long pandas needs to read each of the two saved copies.
csv_time = measure_read_time('Checking_Load_Time.csv', "csv")
excel_time = measure_read_time('Checking_Load_Time.xlsx', "excel")

# The strictly faster format is shown in green, the other in red
# (on an exact tie both are red).
csv_time_color = "green" if csv_time < excel_time else "red"
excel_time_color = "green" if excel_time < csv_time else "red"

# Build and display the per-format timing messages.
csv_time_message = format_message(
    f"Time taken to read <span style='color: blue;'>Checking_Load_Time CSV file</span>: <span style='color: {csv_time_color};'>{csv_time:.2f} seconds</span>"
)

excel_time_message = format_message(
    f"Time taken to read <span style='color: blue;'>Checking_Load_Time Excel file</span>: <span style='color: {excel_time_color};'>{excel_time:.2f} seconds</span>"
)

display(HTML(csv_time_message))
display(HTML(excel_time_message))

# Pick the recommended format: CSV only when it is strictly faster,
# otherwise Excel (same rule as the colour coding above).
if csv_time < excel_time:
    faster_format, slower_format = "CSV", "Excel"
    faster_time, slower_time = csv_time, excel_time
else:
    faster_format, slower_format = "Excel", "CSV"
    faster_time, slower_time = excel_time, csv_time

# Guard the ratio: a timer with coarse resolution can report 0.0 seconds for
# the faster read, which would otherwise raise ZeroDivisionError.
speed_factor = slower_time / faster_time if faster_time > 0 else float("inf")

recommendation = (
    f"Recommendation: Load the data from <span style='color: green;'>{faster_format}</span> as it is approximately "
    f"<span style='color: green;'>{speed_factor:.2f} times faster</span> than loading from {slower_format}."
)

recommendation_message = format_message(recommendation)
display(HTML(recommendation_message))

completion_message = format_message("Dataset has been analyzed, and recommendation has been provided!")
display(HTML(completion_message))
