In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Preference dataset



In [None]:
# Load the raw survey export; the next cell renames its short column codes.
df = pd.read_csv('Preference_Data.csv')

  df = pd.read_csv('Preference_Data.csv')


In [None]:
# Replace the short survey column codes with descriptive names.
# Of the names introduced here, 'region_interest', 'preferred_regions' and
# 'destination_group' are not listed in columns_to_keep, so they do not
# reach df_cleaned.
df = df.rename(columns={
    'a': 'age_group',
    'b': 'budget_per_night',
    'c': 'travel_season',
    'f': 'preferred_experiences',
    'g': 'scenery_preferences',
    'h': 'activity_level',
    'i': 'safety_preference',
    'j': 'popularity_preference',
    'r': 'region_interest',
    'rr': 'preferred_regions',
    'yes_swipes': 'liked_destinations',
    'no_swipes': 'disliked_destinations',
    'maybe_swipes': 'maybe_destinations',
    'group_name': 'destination_group'
})

In [None]:
# Columns carried forward into the working frame; every other column of df
# is left behind.
columns_to_keep = [
    'age_group', 'budget_per_night', 'travel_season',
    'preferred_experiences', 'scenery_preferences',
    'activity_level', 'safety_preference', 'popularity_preference',
    'liked_destinations', 'disliked_destinations', 'maybe_destinations'
]

# .copy() so later column assignments modify df_cleaned, not a view of df.
df_cleaned = df[columns_to_keep].copy()

In [None]:
# Lookup tables from the integer answer codes stored in the survey to the
# human-readable labels used in the cleaned data.
age_map = {0: '0-19', 1: '20-39', 2: '40-59', 3: '60+'}
# NOTE(review): the labels jump from '$100-$249' to '$300+', leaving
# $250-$299 uncovered — confirm against the survey wording.
budget_map = {0: '$0-$49', 1: '$50-$99', 2: '$100-$249', 3: '$300+'}
season_map = {0: 'Winter', 1: 'Spring', 2: 'Summer', 3: 'Fall'}
# Decoded with decode_list_field, i.e. treated as a multi-select answer.
experience_map = {
    0: 'Beach', 1: 'Adventure', 2: 'Nature', 3: 'Culture',
    4: 'Nightlife', 5: 'History', 6: 'Shopping', 7: 'Cuisine'
}
# Decoded with decode_list_field, i.e. treated as a multi-select answer.
scenery_map = {
    0: 'Urban', 1: 'Rural', 2: 'Sea', 3: 'Mountain',
    4: 'Lake', 5: 'Desert', 6: 'Plains', 7: 'Jungle'
}
activity_map = {0: 'Chill & Relaxed', 1: 'Balanced', 2: 'Active'}
safety_map = {0: 'Very Safety Conscious', 1: 'Balanced', 2: 'Ready for Anything'}
popularity_map = {0: 'Off the Beaten Path', 1: 'Classic Spot', 2: 'Mainstream & Trendy'}

In [None]:
import ast

def decode_list_field(field, mapping):
    """Decode a multi-select survey answer into a list of labels.

    Args:
        field: A list-like of integer codes, a string holding a Python
            literal of one (e.g. ``"[0, 2]"``), a single scalar code, or a
            missing value (``None``/``NaN``).
        mapping: Dict from integer code to label.

    Returns:
        A list with one entry per code. Codes that are not keys of
        ``mapping`` are passed through unchanged. Missing or unparseable
        input yields ``[]``.
    """
    # pd.isna() returns an array for list-likes, which cannot be used as an
    # ``if`` condition, so only scalars are tested for missingness.
    if pd.api.types.is_scalar(field) and pd.isna(field):
        return []
    try:
        parsed = ast.literal_eval(field) if isinstance(field, str) else field
        # A lone code such as "1" is treated as a one-element selection
        # instead of being discarded.
        if pd.api.types.is_scalar(parsed):
            parsed = [parsed]
        return [mapping.get(int(val), val) for val in parsed]
    except (ValueError, SyntaxError, TypeError):
        # Best effort: malformed text or non-numeric codes decode to nothing.
        return []

def decode_single_field(field, mapping):
    """Decode a single-choice survey answer into its label.

    Args:
        field: An integer-like code, a list/tuple whose first element is the
            code, a string holding a Python literal of either (e.g. ``"[1]"``
            or ``"['1']"``), or a missing value.
        mapping: Dict from integer code to label.

    Returns:
        The label for the code; the code itself when it is not a key of
        ``mapping``; ``None`` for missing values and empty lists. If decoding
        fails, the error is printed and the value as far as it was decoded
        is returned.
    """
    try:
        # Text such as "['1']" or "[1]" is parsed into a Python value first.
        if isinstance(field, str):
            field = ast.literal_eval(field)

        # Single-choice answers may arrive wrapped in a list; an empty list
        # means "no answer" and must not leak through as an unhashable [].
        if isinstance(field, (list, tuple)):
            if len(field) == 0:
                return None
            field = field[0]

        # Covers None, NaN and pd.NA, not only float NaN.
        if pd.api.types.is_scalar(field) and pd.isna(field):
            return None

        return mapping.get(int(field), field)
    except Exception as e:
        print(f"Error decoding single field {field}: {e}")

        return field  # fallback

def decode_region_codes(field):
    """Decode a region answer one character at a time.

    The value is converted to ``str`` and every character is looked up in
    ``specific_region_map``; characters without an entry are kept as-is.
    Returns ``None`` when ``field`` is missing.

    NOTE(review): ``specific_region_map`` is not defined in the part of the
    notebook visible here — confirm it exists before calling this.
    """
    if pd.isna(field):
        return None

    decoded = []
    for code in str(field):
        decoded.append(specific_region_map.get(code, code))
    return decoded


In [None]:
# Decode every coded survey column in place. Each entry pairs a column with
# the decoder matching its answer format and the lookup table for its codes.
column_decoders = [
    ('age_group', decode_list_field, age_map),
    ('budget_per_night', decode_single_field, budget_map),
    ('travel_season', decode_single_field, season_map),
    ('preferred_experiences', decode_list_field, experience_map),
    ('scenery_preferences', decode_list_field, scenery_map),
    ('activity_level', decode_single_field, activity_map),
    ('safety_preference', decode_single_field, safety_map),
    ('popularity_preference', decode_single_field, popularity_map),
]

for column_name, decoder, code_map in column_decoders:
    # Defaults bind the current decoder/table to the lambda for this column.
    df_cleaned[column_name] = df_cleaned[column_name].apply(
        lambda x, decoder=decoder, code_map=code_map: decoder(x, code_map)
    )


# Formatting destinations


In [None]:
# Map each destination's 0-based line position in the file to the
# whitespace-stripped text of that line.
with open('destination_ids.txt', 'r') as id_file:
    dest_id_map = {position: entry.strip() for position, entry in enumerate(id_file)}

In [None]:
import ast

def decode_swipes(swipe_str, id_map):
    """Translate a swipe list of destination ids into destination names.

    Args:
        swipe_str: A string holding a Python literal list of ids (e.g.
            ``"[0, 4]"``), an already-parsed list/tuple, a single scalar id,
            or a missing value.
        id_map: Dict from destination id to destination name.

    Returns:
        A list of names, one per id. Ids that are not keys of ``id_map`` are
        rendered as ``"Unknown(<id>)"``. Missing or unparseable input yields
        ``[]``.
    """
    try:
        # Only text needs parsing; a value that is already a list is used
        # directly instead of being rejected by literal_eval.
        ids = ast.literal_eval(swipe_str) if isinstance(swipe_str, str) else swipe_str
        if pd.api.types.is_scalar(ids):
            if pd.isna(ids):
                return []
            # A lone id counts as a one-element swipe list.
            ids = [ids]
        return [id_map.get(i, f"Unknown({i})") for i in ids]
    except (ValueError, SyntaxError, TypeError):
        # Best effort: malformed text or unhashable ids decode to nothing.
        return []

# Replace the id lists in all three swipe columns with destination names.
for swipe_column in ('liked_destinations', 'disliked_destinations', 'maybe_destinations'):
    df_cleaned[swipe_column] = df_cleaned[swipe_column].apply(
        lambda ids: decode_swipes(ids, dest_id_map)
    )

# Step 1: Parse list-like strings to actual lists
def safe_parse_list(val):
    """Coerce a cell value to a list without raising.

    Args:
        val: A list, a string (optionally a Python literal such as
            ``"[1, 2]"``), a missing value, or any other object.

    Returns:
        * ``val`` itself when it is already a list;
        * the parsed literal when a string parses to a list, otherwise the
          parsed value wrapped in a one-element list;
        * ``[val]`` when a string is not a valid Python literal;
        * ``[]`` for ``None``/``NaN``;
        * ``[val]`` for anything else.
    """
    if isinstance(val, list):
        return val
    if isinstance(val, str):
        try:
            parsed = ast.literal_eval(val)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal (e.g. a plain name): keep the raw text.
            return [val]
        return parsed if isinstance(parsed, list) else [parsed]
    # pd.isna() returns an array for tuples/arrays, which cannot be used as
    # an ``if`` condition, so only scalars are tested for missingness.
    if pd.api.types.is_scalar(val) and pd.isna(val):
        return []
    return [val]

# Ensure the swipe columns hold real lists (values that are already lists
# pass through safe_parse_list unchanged).
df_cleaned['liked_destinations'] = df_cleaned['liked_destinations'].apply(safe_parse_list)
df_cleaned['disliked_destinations'] = df_cleaned['disliked_destinations'].apply(safe_parse_list)

# Keep only rows with at least one liked or one disliked destination.
has_feedback = (
    (df_cleaned['liked_destinations'].apply(lambda x: len(x) > 0)) |
    (df_cleaned['disliked_destinations'].apply(lambda x: len(x) > 0))
)
# Renumber the surviving rows 0..n-1. Without this the filtered frame keeps
# gaps in its index, and frames built positionally later on (default
# RangeIndex) would be joined to the wrong rows.
df_cleaned = df_cleaned[has_feedback].reset_index(drop=True)


# Write the decoded, filtered frame to disk and hand it to the browser.
# NOTE: google.colab is only importable inside Google Colab.
from google.colab import files
df_cleaned.to_csv('cleaned.csv', index=False)
files.download('cleaned.csv')




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Cleaning for Model

In [None]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Example: one-hot encode single-label columns into a separate frame.
# NOTE(review): df_encoded is rebuilt from df_cleaned by a later
# pd.get_dummies call with a different column list, so this result looks
# unused — confirm whether this cell is still needed.
one_hot_columns = ['budget_per_night', 'travel_season', 'activity_level']
df_encoded = pd.get_dummies(df_cleaned, columns=one_hot_columns)


In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Binarize the multi-select experiences: one 0/1 column per experience
# label, row-aligned with df_cleaned through the shared index.
mlb = MultiLabelBinarizer()
experience_matrix = mlb.fit_transform(df_cleaned['preferred_experiences'])
experience_encoded = pd.DataFrame(
    data=experience_matrix,
    index=df_cleaned.index,
    columns=mlb.classes_,
)

# Swap the raw list column for its indicator columns.
df_without_experiences = df_cleaned.drop(columns=['preferred_experiences'])
df_cleaned = pd.concat([df_without_experiences, experience_encoded], axis=1)


In [None]:
# Replace every remaining missing cell with the label 'Unknown'.
# NOTE: this writes the string into any column that has gaps, numeric ones
# included.
df_cleaned = df_cleaned.fillna('Unknown')  # alternatives: imputation or dropna()


In [None]:
multi_label_columns = [
    'liked_destinations',
    'disliked_destinations',
    'maybe_destinations'
]


from sklearn.preprocessing import MultiLabelBinarizer

# Turn each destination list column into indicator columns named
# "<source column>_<destination>".
for col in multi_label_columns:
    mlb = MultiLabelBinarizer()
    # Reuse df_cleaned's index: rows were filtered earlier, so a default
    # 0..n-1 index on the new frame would make .join() pair indicators with
    # the wrong rows and fill the unmatched ones with NaN.
    binarized = pd.DataFrame(mlb.fit_transform(df_cleaned[col]),
                             columns=[f"{col}_{cls}" for cls in mlb.classes_],
                             index=df_cleaned.index)

    # Drop the original column and join the new one
    df_cleaned = df_cleaned.drop(columns=[col]).join(binarized)


In [None]:
def fix_single_category(val):
    """Reduce a cell to a single category label.

    A list contributes its first element, or 'Unknown' when it is empty.
    Any other value is returned unchanged unless it is missing, in which
    case 'Unknown' is returned.
    """
    if not isinstance(val, list):
        if pd.isna(val):
            return 'Unknown'
        return val
    if val:
        return val[0]
    return 'Unknown'

# age_group was decoded as a list; collapse it to one label per row so it
# can be one-hot encoded like the other single-choice columns.
df_cleaned['age_group'] = df_cleaned['age_group'].apply(fix_single_category)

In [None]:
import ast
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, MultiLabelBinarizer
from sklearn.model_selection import train_test_split

# Assuming df_cleaned is your DataFrame

# Step 1: Parse list-like strings to actual lists
def safe_parse_list(val):
    """Coerce a cell value to a list without raising.

    Args:
        val: A list, a string (optionally a Python literal such as
            ``"[1, 2]"``), a missing value, or any other object.

    Returns:
        * ``val`` itself when it is already a list;
        * the parsed literal when a string parses to a list, otherwise the
          parsed value wrapped in a one-element list;
        * ``[val]`` when a string is not a valid Python literal;
        * ``[]`` for ``None``/``NaN``;
        * ``[val]`` for anything else.
    """
    if isinstance(val, list):
        return val
    if isinstance(val, str):
        try:
            parsed = ast.literal_eval(val)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a Python literal (e.g. a plain name): keep the raw text.
            return [val]
        return parsed if isinstance(parsed, list) else [parsed]
    # pd.isna() returns an array for tuples/arrays, which cannot be used as
    # an ``if`` condition, so only scalars are tested for missingness.
    if pd.api.types.is_scalar(val) and pd.isna(val):
        return []
    return [val]

# Columns whose cells are expected to hold lists of labels.
# NOTE(review): 'preferred_regions' is not in columns_to_keep, and the three
# destination columns are dropped by the earlier binarizing cell, so on a
# top-to-bottom run only 'scenery_preferences' should still be present
# here — confirm.
multi_label_columns = [
    'scenery_preferences',
    'preferred_regions',
    'liked_destinations',
    'disliked_destinations',
    'maybe_destinations'
]

# Parse each listed column that still exists; absent columns are skipped.
for col in multi_label_columns:
    if col in df_cleaned.columns:
        df_cleaned[col] = df_cleaned[col].apply(safe_parse_list)

# Step 2: Clean numeric columns
numeric_cols = ['budget_per_night']  # columns that are mean-imputed and min-max scaled below

def _leading_number(value):
    """Return the first number found in a string, or non-strings unchanged."""
    import re  # local import so this cell does not depend on another cell

    if not isinstance(value, str):
        return value
    # Drop thousands separators, then take the first numeric token.
    match = re.search(r'\d*\.?\d+', value.replace(',', ''))
    return match.group(0) if match else None


def clean_numeric_column(col):
    """Convert a currency/range text column of ``df_cleaned`` to numbers.

    Each string is reduced to its first number, so a range label such as
    ``'$50-$99'`` becomes its lower bound ``50`` and ``'$300+'`` becomes
    ``300``. Simply stripping every non-digit would instead glue both ends
    of the range together (``'$50-$99'`` -> ``5099``) and destroy the
    ordering of the budget bands.

    Args:
        col: Name of the column to convert. Ignored when the column is not
            present in ``df_cleaned``.

    Returns:
        The module-level ``df_cleaned`` frame, modified in place. Values
        without any digits become ``NaN``.
    """
    if col in df_cleaned.columns:
        leading_numbers = df_cleaned[col].map(_leading_number)
        df_cleaned[col] = pd.to_numeric(leading_numbers, errors='coerce')
    return df_cleaned

# Clean every column declared in numeric_cols, not just one hard-coded name,
# so the imputation and scaling below never receive raw strings when the
# list is extended.
for numeric_col in numeric_cols:
    df_cleaned = clean_numeric_column(numeric_col)

# Fill values that could not be parsed with the column mean
df_cleaned[numeric_cols] = df_cleaned[numeric_cols].fillna(df_cleaned[numeric_cols].mean())

# Step 3: Scale numeric columns
scaler = MinMaxScaler()
df_cleaned[numeric_cols] = scaler.fit_transform(df_cleaned[numeric_cols])

# Step 4: One-hot encode the single-choice categorical columns.
# Every column listed here is replaced by one indicator column per label.
categorical_cols = [
    'age_group', 'travel_season', 'activity_level',
    'safety_preference', 'popularity_preference',
]

df_encoded = pd.get_dummies(data=df_cleaned, columns=categorical_cols)

# Step 5: Encode multi-label columns using MultiLabelBinarizer
encoded_parts = [df_encoded]
for col in multi_label_columns:
    if col in df_cleaned.columns:
        mlb = MultiLabelBinarizer()
        mlb_result = mlb.fit_transform(df_cleaned[col])
        # Reuse df_cleaned's index so the indicator rows stay attached to
        # the rows they were computed from; a default 0..n-1 index would
        # make concat(axis=1) misalign them and pad both sides with NaN.
        mlb_df = pd.DataFrame(
            mlb_result,
            columns=[f"{col}_{cls}" for cls in mlb.classes_],
            index=df_cleaned.index,
        )
        encoded_parts.append(mlb_df)

# Concatenate once instead of growing df_encoded inside the loop.
df_encoded = pd.concat(encoded_parts, axis=1)

# Step 6: Drop original multi-label columns
df_encoded = df_encoded.drop(
    columns=[name for name in multi_label_columns if name in df_encoded.columns],
    errors='ignore',
)



In [None]:
# Step 7: Define target and features
# The target is one binarized "liked" indicator column; change the name
# below to model a different destination.
target_column = 'liked_destinations_Kingston, Jamaica'
if target_column not in df_encoded.columns:
    raise ValueError(f"Target column '{target_column}' not found in df_encoded")

# NOTE(review): X keeps every other column of df_encoded, including any
# other destination indicator columns — check for target leakage.
X = df_encoded.drop(columns=[target_column])
y = df_encoded[target_column]

# Step 8: Train-test split (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Show the first rows of the fully encoded frame
print(df_encoded.head())


   budget_per_night  Adventure  Beach  Cuisine  Culture  History  Nature  \
0          0.050399        1.0    1.0      1.0      0.0      0.0     1.0   
1          0.002505        0.0    1.0      1.0      1.0      1.0     0.0   
2          0.050399        1.0    1.0      1.0      1.0      0.0     0.0   
3          0.050399        0.0    0.0      1.0      0.0      0.0     0.0   
4          0.000000        0.0    0.0      0.0      0.0      0.0     1.0   

   Nightlife  Shopping  liked_destinations_Aachen, Germany  ...  \
0        0.0       0.0                                 0.0  ...   
1        1.0       1.0                                 0.0  ...   
2        1.0       0.0                                 0.0  ...   
3        0.0       0.0                                 0.0  ...   
4        0.0       0.0                                 0.0  ...   

   popularity_preference_Off the Beaten Path  popularity_preference_Unknown  \
0                                      False                 

In [None]:
# Write the working frame to disk and hand it to the browser.
# NOTE(review): this exports df_cleaned, not the fully encoded df_encoded
# used for modelling — confirm which frame is intended.
# NOTE: google.colab is only importable inside Google Colab.
from google.colab import files
df_cleaned.to_csv('cleaned_preference_data.csv', index=False)
files.download('cleaned_preference_data.csv')

