In [1]:
import pandas as pd
import numpy as np
import re
import string
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag

In [2]:
# Download required NLTK data (run once)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\stran\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\stran\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\stran\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\stran\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
df = pd.read_csv('../data/islamabad_extracted_phrases_cleaned.csv')

In [4]:
print(f"Original dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print("\nFirst few rows:")
print(df.head())

Original dataset shape: (418, 10)
Columns: ['description', 'sentence_context', 'phrase_type', 'language', 'location', 'osm_tag_key', 'osm_tag_value', 'source', 'coordinates', 'extracted_at']

First few rows:
         description                                   sentence_context  \
0  the northern part  Some people might want to skip the city and ju...   
1  any other capital  Islamabad is not just like any other capital c...   
2       other cities  The city is unique and it is very different fr...   
3   beautiful places  There are a lot of things to do and beautiful ...   
4      the best time      When is the best time to travel to Islamabad?   

      phrase_type language   location  osm_tag_key  osm_tag_value  \
0  adjective_noun  English  Islamabad          NaN            NaN   
1  adjective_noun  English  Islamabad          NaN            NaN   
2  adjective_noun  English  Islamabad          NaN            NaN   
3  adjective_noun  English  Islamabad          NaN            NaN

In [24]:
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\stran\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\stran\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


True

In [25]:
# Initialize preprocessing tools
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# Add custom stopwords for geospatial context
custom_stopwords = {'place', 'area', 'location', 'spot', 'site', 'near', 'close', 'around'}
stop_words.update(custom_stopwords)

def clean_text(text):
    """Clean and normalize text"""
    if pd.isna(text):
        return ""

    # Convert to lowercase
    text = str(text).lower()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove email addresses
    text = re.sub(r'\S+@\S+', '', text)

    # Remove extra whitespace and newlines
    text = re.sub(r'\s+', ' ', text)

    # Remove punctuation but keep hyphens in compound words
    text = re.sub(r'[^\w\s-]', ' ', text)

    return text.strip()

def extract_adjective_noun_patterns(text):
    """Extract adjective-noun patterns for OSM tagging"""
    if not text:
        return []

    tokens = word_tokenize(text)
    pos_tags = pos_tag(tokens)

    patterns = []
    for i in range(len(pos_tags) - 1):
        current_word, current_pos = pos_tags[i]
        next_word, next_pos = pos_tags[i + 1]

        # Extract adjective-noun patterns
        if current_pos.startswith('JJ') and next_pos.startswith('NN'):
            if current_word not in stop_words and next_word not in stop_words:
                pattern = f"{current_word}_{next_word}"
                patterns.append(pattern)

    return patterns

def preprocess_description(text):
    """Complete preprocessing pipeline for descriptions"""
    # Clean text
    cleaned = clean_text(text)

    if not cleaned:
        return "", []

    # Tokenize and remove stopwords
    tokens = word_tokenize(cleaned)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Lemmatize
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Join back to text
    processed_text = ' '.join(tokens)

    # Extract patterns
    patterns = extract_adjective_noun_patterns(cleaned)

    return processed_text, patterns

# Now you can run your preprocessing lines:
print("\nPreprocessing descriptions...")
df['cleaned_description'] = df['description'].apply(lambda x: preprocess_description(x)[0])
df['adjective_noun_patterns'] = df['description'].apply(lambda x: preprocess_description(x)[1])


Preprocessing descriptions...


In [26]:
df = df[df['cleaned_description'].str.len() > 0]

In [27]:
# Create pattern frequency analysis
all_patterns = []
for patterns in df['adjective_noun_patterns']:
    all_patterns.extend(patterns)

pattern_freq = Counter(all_patterns)
print(f"\nTop 10 most frequent adjective-noun patterns:")
for pattern, freq in pattern_freq.most_common(10):
    print(f"{pattern}: {freq}")


Top 10 most frequent adjective-noun patterns:
beautiful_places: 5
largest_mosque: 3
famous_places: 3
beautiful_view: 3
national_park: 3
local_sim: 3
northern_part: 2
best_time: 2
good_restaurants: 2
popular_picnic: 2


In [29]:
# Standardize OSM tags (if columns exist)
if 'osm_tag_key' in df.columns:
    df['osm_tag_key'] = df['osm_tag_key'].astype(str).str.lower().str.strip()

if 'osm_tag_value' in df.columns:
    df['osm_tag_value'] = df['osm_tag_value'].astype(str).str.lower().str.strip()

In [30]:
# Handle missing values
print(f"\nMissing values before cleaning:")
print(df.isnull().sum())


Missing values before cleaning:
description                  0
sentence_context             0
phrase_type                  0
language                     0
location                     0
osm_tag_key                  0
osm_tag_value                0
source                       0
coordinates                371
extracted_at                 0
cleaned_description          0
adjective_noun_patterns      0
dtype: int64


In [31]:
if 'coordinates' in df.columns:
    df['coordinates'] = df['coordinates'].fillna('[73.0479, 33.6844]')  # Islamabad center

In [32]:
# Remove duplicates based on cleaned description
df_deduplicated = df.drop_duplicates(subset=['cleaned_description'], keep='first')
print(f"\nRemoved {len(df) - len(df_deduplicated)} duplicate descriptions")


Removed 54 duplicate descriptions


In [33]:
# Create additional features for analysis
df_deduplicated['description_length'] = df_deduplicated['cleaned_description'].str.len()
df_deduplicated['word_count'] = df_deduplicated['cleaned_description'].str.split().str.len()
df_deduplicated['pattern_count'] = df_deduplicated['adjective_noun_patterns'].str.len()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_deduplicated['description_length'] = df_deduplicated['cleaned_description'].str.len()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_deduplicated['word_count'] = df_deduplicated['cleaned_description'].str.split().str.len()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_deduplicated['patter

In [34]:
# Summary statistics
print(f"\nFinal dataset shape: {df_deduplicated.shape}")
print(f"\nDescription statistics:")
print(df_deduplicated[['description_length', 'word_count', 'pattern_count']].describe())


Final dataset shape: (317, 15)

Description statistics:
       description_length  word_count  pattern_count
count          317.000000  317.000000     317.000000
mean            12.567823    1.895899       0.577287
std              6.941389    0.874326       0.513601
min              3.000000    1.000000       0.000000
25%              9.000000    2.000000       0.000000
50%             12.000000    2.000000       1.000000
75%             15.000000    2.000000       1.000000
max             79.000000   11.000000       2.000000


In [35]:
df.head(10)

Unnamed: 0,description,sentence_context,phrase_type,language,location,osm_tag_key,osm_tag_value,source,coordinates,extracted_at,cleaned_description,adjective_noun_patterns
0,the northern part,Some people might want to skip the city and ju...,adjective_noun,English,Islamabad,,,https://travelwithmansoureh.com/blog/travel-gu...,"[73.0479, 33.6844]",2025-07-30T07:56:06.093655,northern part,[northern_part]
1,any other capital,Islamabad is not just like any other capital c...,adjective_noun,English,Islamabad,,,https://travelwithmansoureh.com/blog/travel-gu...,"[73.0479, 33.6844]",2025-07-30T07:56:06.093655,capital,[]
2,other cities,The city is unique and it is very different fr...,adjective_noun,English,Islamabad,,,https://travelwithmansoureh.com/blog/travel-gu...,"[73.0479, 33.6844]",2025-07-30T07:56:06.093655,city,[]
3,beautiful places,There are a lot of things to do and beautiful ...,adjective_noun,English,Islamabad,,,https://travelwithmansoureh.com/blog/travel-gu...,"[73.0479, 33.6844]",2025-07-30T07:56:06.093655,beautiful place,[beautiful_places]
4,the best time,When is the best time to travel to Islamabad?,adjective_noun,English,Islamabad,,,https://travelwithmansoureh.com/blog/travel-gu...,"[73.0479, 33.6844]",2025-07-30T07:56:06.093655,best time,[best_time]
5,the hottest time,Mid-July is considered the hottest time of the...,adjective_noun,English,Islamabad,,,https://travelwithmansoureh.com/blog/travel-gu...,"[73.0479, 33.6844]",2025-07-30T07:56:06.093655,hottest time,[hottest_time]
6,the hot seasons,It is best to avoid travelling to Pakistan dur...,adjective_noun,English,Islamabad,,,https://travelwithmansoureh.com/blog/travel-gu...,"[73.0479, 33.6844]",2025-07-30T07:56:06.093655,hot season,[hot_seasons]
7,Many tourists,Many tourists prefer to travel to Islamabad be...,adjective_noun,English,Islamabad,,,https://travelwithmansoureh.com/blog/travel-gu...,"[73.0479, 33.6844]",2025-07-30T07:56:06.093655,many tourist,[many_tourists]
8,the best months,"Moreover, these months are considered the best...",adjective_noun,English,Islamabad,,,https://travelwithmansoureh.com/blog/travel-gu...,"[73.0479, 33.6844]",2025-07-30T07:56:06.093655,best month,[best_months]
9,green city,Where to stay in Islamabad Islamabad is a lush...,adjective_noun,English,Islamabad,,,https://travelwithmansoureh.com/blog/travel-gu...,"[73.0479, 33.6844]",2025-07-30T07:56:06.093655,green city,[green_city]


In [36]:
# Save preprocessed data
output_file = '../data/islamabad_preprocessed.csv'
df_deduplicated.to_csv(output_file, index=False)
print(f"\nPreprocessed data saved to: {output_file}")


Preprocessed data saved to: ../data/islamabad_preprocessed.csv


In [38]:
# Display sample of preprocessed data
print(f"\nSample of preprocessed data:")
sample_cols = ['description', 'cleaned_description', 'adjective_noun_patterns']
if 'osm_tag_key' in df_deduplicated.columns:
    sample_cols.extend(['osm_tag_key', 'osm_tag_value'])
print(df_deduplicated[sample_cols].head())


Sample of preprocessed data:
         description cleaned_description adjective_noun_patterns osm_tag_key  \
0  the northern part       northern part         [northern_part]         nan   
1  any other capital             capital                      []         nan   
2       other cities                city                      []         nan   
3   beautiful places     beautiful place      [beautiful_places]         nan   
4      the best time           best time             [best_time]         nan   

  osm_tag_value  
0           nan  
1           nan  
2           nan  
3           nan  
4           nan  


In [39]:
# Create a pattern frequency DataFrame for further analysis
pattern_df = pd.DataFrame(pattern_freq.most_common(), columns=['pattern', 'frequency'])
pattern_df.to_csv('../data/islamabad_patterns_frequency.csv', index=False)
print(f"\nPattern frequency data saved to: ../data/islamabad_patterns_frequency.csv")


Pattern frequency data saved to: ../data/islamabad_patterns_frequency.csv
