In [1]:
import pandas as pd

In [6]:
# Load the source CSV; the path is relative to the notebook's working directory.
raw_csv_path = '../data/islamabad_osm_data.csv'
df = pd.read_csv(raw_csv_path)

In [7]:
# Report (rows, columns) of the freshly loaded frame.
original_shape = df.shape
print("Original dataset shape:", original_shape)

Original dataset shape: (22950, 17)


In [8]:
# Islamabad bounding box.
# Each pair holds the lower and upper bound for one coordinate axis.
min_lat, max_lat = 33.4734, 33.7480
min_lon, max_lon = 72.8397, 73.2047

In [9]:
# Keep only rows whose coordinates fall inside the Islamabad bounding box.
# Series.between is inclusive on both ends by default, so points lying exactly
# on an edge of the box are kept.
lat_in_bounds = df['latitude'].between(min_lat, max_lat)
lon_in_bounds = df['longitude'].between(min_lon, max_lon)
islamabad_filter = lat_in_bounds & lon_in_bounds

# .copy() gives an independent frame rather than a selection tied to `df`.
df_cleaned = df.loc[islamabad_filter].copy()

In [10]:
# Report the size after filtering, then how many rows were dropped
# (absolute count and share of the original row count).
print(f"After coordinate filtering: {df_cleaned.shape}")
rows_before = df.shape[0]
rows_removed = rows_before - df_cleaned.shape[0]
pct_removed = rows_removed / rows_before * 100
print(f"Removed {rows_removed} rows ({pct_removed:.1f}%)")

After coordinate filtering: (5240, 17)
Removed 17710 rows (77.2%)


In [11]:
# Data quality check: null count and null share for each key column.
key_columns = ['description', 'osm_tag_key', 'osm_tag_value', 'latitude', 'longitude']
row_count = len(df_cleaned)

print("\nMissing values in key columns:")
for column in key_columns:
    null_count = df_cleaned[column].isna().sum()
    null_pct = null_count / row_count * 100
    print(f"{column}: {null_count} ({null_pct:.1f}%)")


Missing values in key columns:
description: 0 (0.0%)
osm_tag_key: 0 (0.0%)
osm_tag_value: 0 (0.0%)
latitude: 0 (0.0%)
longitude: 0 (0.0%)


In [12]:
# Count rows that repeat an earlier row across every column.
duplicate_count = df_cleaned.duplicated().sum()
print(f"\nDuplicate rows: {duplicate_count}")


Duplicate rows: 0


In [13]:
# Frequency tables for three categorical columns.
# A limit of None prints the whole table; otherwise only the first `top_n` rows.
summaries = (
    ("\nLanguage distribution:", 'language', None),
    ("\nTop OSM tag keys:", 'osm_tag_key', 10),
    ("\nTop OSM tag values:", 'osm_tag_value', 10),
)

for heading, column, top_n in summaries:
    print(heading)
    counts = df_cleaned[column].value_counts()
    print(counts if top_n is None else counts.head(top_n))


Language distribution:
language
English    4679
Urdu        560
Mixed         1
Name: count, dtype: int64

Top OSM tag keys:
osm_tag_key
amenity    3246
shop       1086
tourism     559
leisure     349
Name: count, dtype: int64

Top OSM tag values:
osm_tag_value
place_of_worship    514
restaurant          464
bank                413
school              379
park                197
fuel                191
fast_food           140
supermarket         139
college             135
hospital            134
Name: count, dtype: int64


In [14]:
# Drop every row that has a null in any of the critical columns, then report
# the size of what is left.
required_columns = ['description', 'osm_tag_key', 'osm_tag_value', 'latitude', 'longitude']
df_final = df_cleaned.dropna(subset=required_columns).copy()

final_shape = df_final.shape
print(f"\nFinal dataset shape: {final_shape}")


Final dataset shape: (5240, 17)


In [15]:
# Persist the final frame as both an Excel workbook and a CSV file.
# index=False keeps the DataFrame index out of the written files.
excel_out = '../data/osm_islamabad_cleaned.xlsx'
csv_out = '../data/osm_islamabad_cleaned.csv'
df_final.to_excel(excel_out, index=False)
df_final.to_csv(csv_out, index=False)

print("\n✅ Cleaned datasets saved as 'osm_islamabad_cleaned.xlsx' and 'osm_islamabad_cleaned.csv' in the data directory.")


✅ Cleaned datasets saved as 'osm_islamabad_cleaned.xlsx' and 'osm_islamabad_cleaned.csv' in the data directory.


In [16]:
# Read the cleaned CSV back from disk.
# NOTE: this rebinds `df`; whatever `df` referred to before this cell is no
# longer reachable under that name, so re-running earlier cells that read `df`
# afterwards will operate on this frame instead.
cleaned_csv_path = '../data/osm_islamabad_cleaned.csv'
df = pd.read_csv(cleaned_csv_path)

In [19]:
# Preview the top of the filtered frame.
# NOTE(review): this shows `df_cleaned` (the in-memory filtered frame), not the
# `df` read from the cleaned CSV — confirm that is the intended target.
print("First 5 rows of the cleaned Islamabad data:")
preview = df_cleaned.head(5)
print(preview)

First 5 rows of the cleaned Islamabad data:
     osm_id element_type   category                description  \
0  61146664         node  amenities   عسکری بینک (Askari Bank)   
1  61146673         node  amenities    الائیڈ بینک لمیٹڈ (ABL)   
2  61146691         node  amenities            United Bank Ltd   
3  61146835         node  amenities  Total Parco , F-10 Markaz   
4  98166308         node  amenities     Caltex Filling Station   

                        name      name_en            name_ur language  \
0                 عسکری بینک  Askari Bank         عسکری بینک     Urdu   
1          الائیڈ بینک لمیٹڈ          ABL  الائیڈ بینک لمیٹڈ     Urdu   
2            United Bank Ltd          NaN                NaN  English   
3  Total Parco , F-10 Markaz          NaN                NaN  English   
4     Caltex Filling Station          NaN                NaN  English   

    location osm_tag_key osm_tag_value source  \
0  Islamabad     amenity          bank    OSM   
1  Islamabad     ameni

In [20]:
# Summarise the filtered frame: index range, per-column non-null counts and dtypes.
print("\nDataFrame Info:")
# info() writes its report to stdout itself, so it is called bare rather than
# wrapped in print().
df_cleaned.info()


DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 5240 entries, 0 to 22916
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   osm_id         5240 non-null   int64  
 1   element_type   5240 non-null   object 
 2   category       5240 non-null   object 
 3   description    5240 non-null   object 
 4   name           5205 non-null   object 
 5   name_en        853 non-null    object 
 6   name_ur        546 non-null    object 
 7   language       5240 non-null   object 
 8   location       5240 non-null   object 
 9   osm_tag_key    5240 non-null   object 
 10  osm_tag_value  5240 non-null   object 
 11  source         5240 non-null   object 
 12  coordinates    5240 non-null   object 
 13  latitude       5240 non-null   float64
 14  longitude      5240 non-null   float64
 15  all_tags       5240 non-null   object 
 16  collected_at   5240 non-null   object 
dtypes: float64(2), int64(1), object(14)
mem

In [21]:
# Summary statistics for the numeric columns of the filtered frame.
print("\nDescriptive Statistics:")
numeric_summary = df_cleaned.describe()
print(numeric_summary)

# How often each value occurs in 'language'.
print("\nUnique values in 'language' column:")
language_counts = df_cleaned['language'].value_counts()
print(language_counts)

# The ten most frequent values in 'osm_tag_key'.
print("\nUnique values in 'osm_tag_key' column (top 10):")
tag_key_counts = df_cleaned['osm_tag_key'].value_counts()
print(tag_key_counts.head(10))


Descriptive Statistics:
             osm_id     latitude    longitude
count  5.240000e+03  5240.000000  5240.000000
mean   4.275751e+09    33.648465    73.054246
std    3.227400e+09     0.056312     0.055212
min    8.058334e+06    33.474125    72.840721
25%    1.295250e+09    33.613044    73.018659
50%    4.880224e+09    33.652743    73.057927
75%    5.617898e+09    33.693306    73.082034
max    1.300255e+10    33.747926    73.204302

Unique values in 'language' column:
language
English    4679
Urdu        560
Mixed         1
Name: count, dtype: int64

Unique values in 'osm_tag_key' column (top 10):
osm_tag_key
amenity    3246
shop       1086
tourism     559
leisure     349
Name: count, dtype: int64


In [22]:
# Show the column labels of `df`; as the cell's last expression, the notebook
# displays the resulting Index.
df.columns

Index(['osm_id', 'element_type', 'category', 'description', 'name', 'name_en',
       'name_ur', 'language', 'location', 'osm_tag_key', 'osm_tag_value',
       'source', 'coordinates', 'latitude', 'longitude', 'all_tags',
       'collected_at'],
      dtype='object')

In [23]:
# Summary statistics for the numeric columns of `df`, rendered by the notebook
# as a table because it is the cell's last expression.
df.describe()

Unnamed: 0,osm_id,latitude,longitude
count,5240.0,5240.0,5240.0
mean,4275751000.0,33.648465,73.054246
std,3227400000.0,0.056312,0.055212
min,8058334.0,33.474125,72.840721
25%,1295250000.0,33.613044,73.018659
50%,4880224000.0,33.652743,73.057927
75%,5617898000.0,33.693306,73.082034
max,13002550000.0,33.747926,73.204302


In [24]:
# Print index range, per-column non-null counts and dtypes for `df`.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5240 entries, 0 to 5239
Data columns (total 17 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   osm_id         5240 non-null   int64  
 1   element_type   5240 non-null   object 
 2   category       5240 non-null   object 
 3   description    5240 non-null   object 
 4   name           5205 non-null   object 
 5   name_en        853 non-null    object 
 6   name_ur        546 non-null    object 
 7   language       5240 non-null   object 
 8   location       5240 non-null   object 
 9   osm_tag_key    5240 non-null   object 
 10  osm_tag_value  5240 non-null   object 
 11  source         5240 non-null   object 
 12  coordinates    5240 non-null   object 
 13  latitude       5240 non-null   float64
 14  longitude      5240 non-null   float64
 15  all_tags       5240 non-null   object 
 16  collected_at   5240 non-null   object 
dtypes: float64(2), int64(1), object(14)
memory usage: 69

In [25]:
# Display the first 50 rows of `df` for a visual spot check.
df.head(50)

Unnamed: 0,osm_id,element_type,category,description,name,name_en,name_ur,language,location,osm_tag_key,osm_tag_value,source,coordinates,latitude,longitude,all_tags,collected_at
0,61146664,node,amenities,عسکری بینک (Askari Bank),عسکری بینک,Askari Bank,عسکری بینک,Urdu,Islamabad,amenity,bank,OSM,"[Decimal('73.0112954'), Decimal('33.6980860')]",33.698086,73.011295,"{'addr:city': 'اسلام آباد', 'addr:city:ar': 'إ...",2025-07-29T08:08:59.744749
1,61146673,node,amenities,الائیڈ بینک لمیٹڈ (ABL),الائیڈ بینک لمیٹڈ,ABL,الائیڈ بینک لمیٹڈ,Urdu,Islamabad,amenity,bank,OSM,"[Decimal('73.0105887'), Decimal('33.6976525')]",33.697652,73.010589,"{'addr:city': 'اسلام آباد', 'addr:city:ar': 'إ...",2025-07-29T08:08:59.744749
2,61146691,node,amenities,United Bank Ltd,United Bank Ltd,,,English,Islamabad,amenity,bank,OSM,"[Decimal('73.0118772'), Decimal('33.6960411')]",33.696041,73.011877,"{'addr:city': 'اسلام آباد', 'addr:city:ar': 'إ...",2025-07-29T08:08:59.744749
3,61146835,node,amenities,"Total Parco , F-10 Markaz","Total Parco , F-10 Markaz",,,English,Islamabad,amenity,fuel,OSM,"[Decimal('73.0112495'), Decimal('33.6988403')]",33.69884,73.01125,"{'addr:city': 'اسلام آباد', 'addr:city:ar': 'إ...",2025-07-29T08:08:59.744749
4,98166308,node,amenities,Caltex Filling Station,Caltex Filling Station,,,English,Islamabad,amenity,fuel,OSM,"[Decimal('73.0944042'), Decimal('33.5846699')]",33.58467,73.094404,"{'amenity': 'fuel', 'fuel:cng': 'yes', 'name':...",2025-07-29T08:08:59.744749
5,211325636,node,amenities,Daman-e-Koh,Daman-e-Koh,,,English,Islamabad,amenity,restaurant,OSM,"[Decimal('73.0570403'), Decimal('33.7383666')]",33.738367,73.05704,"{'addr:street': 'Damn-i-koh Road', 'amenity': ...",2025-07-29T08:08:59.744749
6,212538228,node,amenities,اسٹینڈرڈ چارٹرڈ (Standard Chartered),اسٹینڈرڈ چارٹرڈ,Standard Chartered,اسٹینڈرڈ چارٹرڈ,Urdu,Islamabad,amenity,bank,OSM,"[Decimal('72.9868260'), Decimal('33.6837660')]",33.683766,72.986826,"{'addr:city': 'اسلام آباد', 'addr:city:ar': 'إ...",2025-07-29T08:08:59.744749
7,225578480,node,amenities,پاکستان اسٹیٹ آئل (Pakistan State Oil),پاکستان اسٹیٹ آئل,Pakistan State Oil,پاکستان اسٹیٹ آئل,Urdu,Islamabad,amenity,fuel,OSM,"[Decimal('73.0389002'), Decimal('33.7100027')]",33.710003,73.0389,"{'addr:city': 'اسلام آباد', 'addr:city:ar': 'إ...",2025-07-29T08:08:59.744749
8,232641948,node,amenities,PSO Pump G-9 Markez (Pakistan State Oil) [پاکس...,PSO Pump G-9 Markez,Pakistan State Oil,پاکستان اسٹیٹ آئل,Urdu,Islamabad,amenity,fuel,OSM,"[Decimal('73.0337460'), Decimal('33.6893963')]",33.689396,73.033746,"{'amenity': 'fuel', 'brand': 'پاکستان اسٹیٹ آئ...",2025-07-29T08:08:59.744749
9,234511823,node,amenities,اسٹینڈرڈ چارٹرڈ (Standard Chartered),اسٹینڈرڈ چارٹرڈ,Standard Chartered,اسٹینڈرڈ چارٹرڈ,Urdu,Islamabad,amenity,bank,OSM,"[Decimal('73.0113365'), Decimal('33.6966838')]",33.696684,73.011336,"{'amenity': 'bank', 'atm': 'yes', 'brand': 'اس...",2025-07-29T08:08:59.744749
