In [85]:
# M.1.1: Checking CRS (Projection) of different data formats

import geopandas as gpd

def check_projection(files):
    projections = []

    for file in files:
        gdf = gpd.read_file(file)

        projection = gdf.crs
        projections.append(projection)

    if all(proj == projections[0] for proj in projections):
        print("Same projection system:", projections[0])
    else:
        print("Data are in different projection systems.")

vector_data_files = ['GeospatialDataMangement/shapefile1.shp', 'GeospatialDataMangement/Geojson1.geojson', 'GeospatialDataMangement/geopackage1.gpkg']

check_projection(vector_data_files)


Same projection system: EPSG:4326


In [None]:
# M.1.2: Checking Corrupted files

from PIL import Image
import PyPDF2
import cv2

def is_file_corrupted(file_path):
    file_extension = file_path.split('.')[-1].lower()

    if file_extension == 'jpg' or file_extension == 'jpeg':
        try:
            img = Image.open(file_path)
            img.verify()
            return False 
        except (IOError, SyntaxError):
            return True 

    elif file_extension == 'mp4':
        try:
            cap = cv2.VideoCapture(file_path)
            frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            return frames <= 0  
        except Exception:
            return True 

    else:
        print(f"Unsupported file format: {file_extension}")
        return False  

jpeg_file_path = 'GeospatialDataMangement/1.jpg'
mp4_file_path = 'GeospatialDataMangement/7.mp4'

if is_file_corrupted(jpeg_file_path):
    print(f"The JPEG file {jpeg_file_path} is corrupted.")
else:
    print(f"The JPEG file {jpeg_file_path} is intact.")


if is_file_corrupted(mp4_file_path):
    print(f"The MP4 file {mp4_file_path} is corrupted.")
else:
    print(f"The MP4 file {mp4_file_path} is intact.")


In [100]:
!pip install scikit-image


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [87]:
# M 1.3: Data Cleaning (On-Non spatial Data i. e Attribute data)

import geopandas as gpd
import rasterio
from shapely.geometry import Point

def clean_and_process_vector_data(vector_data):
    vector_data = vector_data.dropna()

    vector_data = vector_data.to_crs("EPSG:4326")

    vector_data = vector_data.drop_duplicates()

    vector_data['geometry'] = vector_data['geometry'].apply(lambda geom: Point(geom.x, geom.y) if geom.is_empty else geom)

    return vector_data

def clean_and_process_raster_data(raster_data):
    with rasterio.open(input_file) as src:
        data = src.read(1)
        raster_data = exposure.equalize_hist(data)
    return raster_data

def pre_processing_module(input_file, subset_size=500):
    file_extension = input_file.split('.')[-1].lower()

    if file_extension == 'geojson':
        data = gpd.read_file(input_file).head(subset_size)
        data = clean_and_process_vector_data(data)

    elif file_extension == 'shp':
        data = gpd.read_file(input_file).head(subset_size)
        data = clean_and_process_vector_data(data)

    elif file_extension == 'kml':
        data = gpd.read_file(input_file, driver='KML').head(subset_size)
        data = clean_and_process_vector_data(data)

    elif file_extension == 'tif':
        with rasterio.open(input_file) as src:
            data = src.read(1)[:subset_size]
            data = clean_and_process_raster_data(data)
    else:
        raise ValueError(f"Unsupported file type: {file_extension}")

    return data

input_file_path = "GeospatialDataMangement/Geojson1.geojson" 
processed_data = pre_processing_module(input_file_path)
print(processed_data)

     OBJECTID      gid           state      lat     long  \
0           1  4233336     Maharashtra  17.0076  74.2615   
1           2  4233337      Tamil Nadu  10.7886  79.1310   
2           3  4233338       Karnataka  16.1655  74.8214   
3           4  4233339  Andhra Pradesh  16.1849  81.1353   
4           5  4233340         Tripura  23.9967  91.9957   
..        ...      ...             ...      ...      ...   
495       496  4233830  Madhya Pradesh  23.4355  75.2854   
496       497  4233831          Odisha  21.7383  86.8424   
497       498  4233832       Jharkhand  22.7758  86.1836   
498       499  4233833          Punjab  31.8197  75.3739   
499       500  4233834   Uttar Pradesh  25.5709  81.7399   

                      geometry  
0    POINT (74.26150 17.00760)  
1    POINT (79.13100 10.78860)  
2    POINT (74.82140 16.16550)  
3    POINT (81.13530 16.18490)  
4    POINT (91.99570 23.99670)  
..                         ...  
495  POINT (75.28540 23.43550)  
496  POINT (86.

In [79]:
# M 1.4: Data Cleaning (On spatial Data i. e Geometric operations and spatial relationships)
import geopandas as gpd
from shapely.geometry import Point, LineString, Polygon, MultiPolygon
from shapely.ops import unary_union
from shapely.validation import explain_validity

def clean_and_process_vector_data(vector_data):
    # Checking and fixing invalid geometries
    invalid_geoms = vector_data[~vector_data.is_valid]['geometry']
    if not invalid_geoms.empty:
        print("Invalid geometries found. Attempting to fix...")
        vector_data['geometry'] = vector_data['geometry'].apply(lambda geom: geom.buffer(0) if not geom.is_valid else geom)
    
    self_intersecting_geoms = vector_data[vector_data['geometry'].type.isin(['Polygon', 'LineString', 'Point'])][vector_data['geometry'].is_simple == False]['geometry']
    if not self_intersecting_geoms.empty:
        print("Self-intersecting geometries found. Attempting to fix...")
        vector_data['geometry'] = vector_data['geometry'].apply(lambda geom: geom.buffer(0) if not geom.is_simple else geom)

    overlapping_geoms = vector_data[vector_data['geometry'].type == 'Polygon'][vector_data['geometry'].overlaps(unary_union(vector_data['geometry']))]['geometry']
    if not overlapping_geoms.empty:
        print("Overlapping polygons found. Attempting to fix...")
        vector_data['geometry'] = gpd.GeoSeries(unary_union(vector_data['geometry']))

    invalid_multipolygons = vector_data[vector_data['geometry'].type == 'MultiPolygon'][~vector_data['geometry'].is_valid]['geometry']
    if not invalid_multipolygons.empty:
        print("Invalid MultiPolygons found. Attempting to fix...")
        vector_data['geometry'] = vector_data['geometry'].apply(lambda geom: geom.buffer(0) if not geom.is_valid else geom)

    # validation
    invalid_geoms = vector_data[~vector_data.is_valid]['geometry']
    if not invalid_geoms.empty:
        explanation = explain_validity(invalid_geoms.iloc[0])
        raise ValueError(f"Invalid geometries present : {explanation}")

    return vector_data


def pre_processing_module(input_file, subset_size=500):
    file_extension = input_file.split('.')[-1].lower()

    if file_extension == 'geojson':
        data = gpd.read_file(input_file).head(subset_size)
        data = clean_and_process_vector_data(data)

    elif file_extension == 'shp':
        data = gpd.read_file(input_file).head(subset_size)
        data = clean_and_process_vector_data(data)

    elif file_extension == 'kml':
        data = gpd.read_file(input_file, driver='KML').head(subset_size)
        data = clean_and_process_vector_data(data)

    elif file_extension == 'tif':
        with rasterio.open(input_file) as src:
            data = src.read(1)[:subset_size]
            data = clean_and_process_raster_data(data)
    else:
        raise ValueError(f"Unsupported file type: {file_extension}")

    return data

input_file_path = "GeospatialDataMangement/Geojson1.geojson" 
processed_data = pre_processing_module(input_file_path)
print(processed_data)


     OBJECTID      gid           state      lat     long  \
0           1  4233336     Maharashtra  17.0076  74.2615   
1           2  4233337      Tamil Nadu  10.7886  79.1310   
2           3  4233338       Karnataka  16.1655  74.8214   
3           4  4233339  Andhra Pradesh  16.1849  81.1353   
4           5  4233340         Tripura  23.9967  91.9957   
..        ...      ...             ...      ...      ...   
495       496  4233830  Madhya Pradesh  23.4355  75.2854   
496       497  4233831          Odisha  21.7383  86.8424   
497       498  4233832       Jharkhand  22.7758  86.1836   
498       499  4233833          Punjab  31.8197  75.3739   
499       500  4233834   Uttar Pradesh  25.5709  81.7399   

                      geometry  
0    POINT (74.26150 17.00760)  
1    POINT (79.13100 10.78860)  
2    POINT (74.82140 16.16550)  
3    POINT (81.13530 16.18490)  
4    POINT (91.99570 23.99670)  
..                         ...  
495  POINT (75.28540 23.43550)  
496  POINT (86.

  result = super().__getitem__(key)
  result = super().__getitem__(key)


In [80]:
!pip install pygeohash


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [98]:
!pip install folium


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m23.3.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [97]:
# Geohashing based pre-processing

import geopandas as gpd
import folium
from folium.plugins import MarkerCluster
from shapely.geometry import Point
import pygeohash as pgh


def geohash_based_preprocessing(data, geohash_precision=5):

    # Generate geohash values
    data['geohash'] = data['geometry'].apply(lambda geom: pgh.encode(geom.y, geom.x, precision=geohash_precision))

    # Drop duplicates based on geohash values
    data = data.drop_duplicates(subset='geohash')

    # Remove geohash column as it's no longer needed
    data = data.drop(columns=['geohash'])

    return data

# Example usage:
input_file_path = "GeospatialDataMangement/Geojson1.geojson"
processed_data = pre_processing_module(input_file_path)
geohash_processed_data = geohash_based_preprocessing(processed_data)
print(geohash_processed_data)


mean_latitude = geohash_processed_data.geometry.y.mean()
mean_longitude = geohash_processed_data.geometry.x.mean()
map_center = [mean_latitude, mean_longitude]
mymap = folium.Map(location=map_center, zoom_start=10)

marker_cluster = MarkerCluster().add_to(mymap)

for idx, row in geohash_processed_data.iterrows():
    popup_text = f"Point ID: {idx}"
    folium.Marker([row['geometry'].y, row['geometry'].x], popup=popup_text).add_to(marker_cluster)

mymap



     OBJECTID      gid           state      lat     long  \
0           1  4233336     Maharashtra  17.0076  74.2615   
1           2  4233337      Tamil Nadu  10.7886  79.1310   
2           3  4233338       Karnataka  16.1655  74.8214   
3           4  4233339  Andhra Pradesh  16.1849  81.1353   
4           5  4233340         Tripura  23.9967  91.9957   
..        ...      ...             ...      ...      ...   
495       496  4233830  Madhya Pradesh  23.4355  75.2854   
496       497  4233831          Odisha  21.7383  86.8424   
497       498  4233832       Jharkhand  22.7758  86.1836   
498       499  4233833          Punjab  31.8197  75.3739   
499       500  4233834   Uttar Pradesh  25.5709  81.7399   

                      geometry  
0    POINT (74.26150 17.00760)  
1    POINT (79.13100 10.78860)  
2    POINT (74.82140 16.16550)  
3    POINT (81.13530 16.18490)  
4    POINT (91.99570 23.99670)  
..                         ...  
495  POINT (75.28540 23.43550)  
496  POINT (86.

In [84]:
# M 1.5: Meta Data Verification 

import geopandas as gpd
import rasterio

def verify_metadata(data, is_vector=True):
    if is_vector:
        print("(Coordinate Reference System)", data.crs)
        print("Extent or ROI", data.total_bounds)
        print("Geometry Types:", data.geom_type.value_counts())
    else:
        print("Raster Data Metadata:")
        print("----------------------")
        with rasterio.open(data) as src:
            print("CRS (Coordinate Reference System):")
            print(src.crs)
            print("\nExtent:")
            print(src.bounds)
            print("\nData Type:")
            print(src.dtypes[0])

input_file_path = "GeospatialDataMangement/Geojson1.geojson" 
processed_data = pre_processing_module(input_file_path)
verify_metadata(processed_data)


(Coordinate Reference System) EPSG:4326
Extent or ROI [69.6539  8.5139 95.6173 34.5231]
Geometry Types: Point    500
dtype: int64


  result = super().__getitem__(key)
  result = super().__getitem__(key)
