In [1]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer, OneHotEncoder
# Show every column when a wide DataFrame is rendered (None disables the column limit).
pd.options.display.max_columns = None


In [31]:
# Load the raw city table; the relative path is resolved against the current working directory.
df = pd.read_csv(filepath_or_buffer="City.csv")
# Preview the first five rows to check how the columns were parsed.
df.head(n=5)

Unnamed: 0,City,Region,State,Tourist Attractions,Best Time to Visit,Tourist Season,Accommodation Options,Cultural Influence,Cuisine,Special Events,Type of Destination,Budget,Ratings,City Description
0,Manali,North,Himachal Pradesh,"Solang Valley, Rohtang Pass, Hadimba Temple",Oct - June,"Spring, Summer, Fall, Winter","Hotels, Resorts, Guesthouses","Himachali, Tibetan","Himachali, Tibetan",Winter Carnival,Hill Station,Medium,4.5,[' One of the most popular hill stations in Hi...
1,Leh Ladakh,North,Ladakh,"Pangong Lake, Nubra Valley, Thiksey Monastery",May - Sept,Summer,"Hotels, Guesthouses, Homestays","Ladakhi, Tibetan","Ladakhi, Tibetan",Hemis Festival,"Adventure, Scenic",High,4.6,"["" Ladakh is a union territory in the Kashmir ..."
2,Coorg,South,Karnataka,"Abbey Falls, Talakaveri, Coffee Plantations",Oct - Mar,"Fall, Winter, Spring","Resorts, Homestays, Guesthouses",Kodava,"Coorgi, South Indian",Puthari Festival,Hill Station,Medium,4.2,[' Located amidst imposing mountains in Karnat...
3,Andaman,East,Andaman & Nicobar,"Radhanagar Beach, Cellular Jail, Scuba Diving",Oct - May,"Winter, Spring","Resorts, Hotels, Guesthouses","Tribal, Colonial","Seafood, Bengali",Island Tourism Festival,"Beach, Island",High,4.5,[' Replete with turquoise blue water beaches a...
4,Lakshadweep,South,Lakshadweep,"Agatti Island, Bangaram Island, Marine Museum",Oct - May,"Winter, Spring","Resorts, Hotels, Guesthouses","Tribal, Islamic","Seafood, South Indian",Marine Awareness Programs,"Beach, Island",High,4.0,"["" Formerly known as Laccadive Islands, Laksha..."


In [32]:
# List the distinct cuisine labels: each cell is a ', '-separated string, so
# split it into a list, flatten to one label per row, then de-duplicate.
cuisine_labels = df['Cuisine'].str.split(', ')
cuisine_labels.explode().unique()

array(['Himachali', 'Tibetan', 'Ladakhi', 'Coorgi', 'South Indian',
       'Seafood', 'Bengali', 'Goan', 'Rajasthani', 'Kashmiri',
       'Sikkimese', 'Kerala Cuisine', 'North Indian', 'Vegetarian',
       'Nepali', 'Maharashtrian', 'Madhya Pradeshi', 'Diverse', 'Mughlai',
       'Cosmopolitan', 'Punjabi', 'Hyderabadi', 'French', 'Tamil',
       'Gujarati', 'Bihari', 'Andhra Cuisine', 'Israeli', 'Awadhi',
       'Assamese', 'Tripuri', 'Odia', 'Dogri', 'North Eastern'],
      dtype=object)

In [33]:
# Remove the columns that none of the encoding cells below read.
# Each label appears exactly once in the list.
columns_to_drop = [
    'State',
    'Best Time to Visit',
    'Cuisine',
    'Special Events',
    'Cultural Influence',
    'Tourist Attractions',
    'City Description',
]
df = df.drop(columns=columns_to_drop)

In [34]:
# Preview the first five rows of the reduced frame.
df.head(n=5)

Unnamed: 0,City,Region,Tourist Season,Accommodation Options,Type of Destination,Budget,Ratings
0,Manali,North,"Spring, Summer, Fall, Winter","Hotels, Resorts, Guesthouses",Hill Station,Medium,4.5
1,Leh Ladakh,North,Summer,"Hotels, Guesthouses, Homestays","Adventure, Scenic",High,4.6
2,Coorg,South,"Fall, Winter, Spring","Resorts, Homestays, Guesthouses",Hill Station,Medium,4.2
3,Andaman,East,"Winter, Spring","Resorts, Hotels, Guesthouses","Beach, Island",High,4.5
4,Lakshadweep,South,"Winter, Spring","Resorts, Hotels, Guesthouses","Beach, Island",High,4.0


In [35]:
# Split the multi-value columns into lists.
# Only string cells are split; anything else (for example the lists left behind
# by an earlier run of this cell) is kept unchanged, so re-executing the cell
# does not turn the column into NaN the way a second `.str.split` would.
for multi_value_column in ['Tourist Season', 'Accommodation Options', 'Type of Destination']:
    df[multi_value_column] = df[multi_value_column].apply(
        lambda value: value.split(', ') if isinstance(value, str) else value
    )

# Apply one-hot encoding using MultiLabelBinarizer
mlb = MultiLabelBinarizer()


def encode_multi_label(column, prefix):
    """Return a 0/1 indicator DataFrame for a list-valued column of `df`.

    Parameters
    ----------
    column : str
        Name of the column in `df` whose cells are lists of labels.
    prefix : str
        Text prepended to every label to form the indicator column names.

    Returns
    -------
    pandas.DataFrame
        One integer column per distinct label, indexed like `df` so that a
        later column-wise concat lines the rows up by label, not by position.

    Notes
    -----
    The shared `mlb` instance is re-fitted on every call, so after the call
    it holds the classes of the column encoded last.
    """
    indicators = pd.DataFrame(
        mlb.fit_transform(df[column]),
        columns=mlb.classes_,
        index=df.index,
    )
    return indicators.add_prefix(prefix).astype(int)


df_tourist_season = encode_multi_label('Tourist Season', 'TouristSeason_')
df_accommodation_options = encode_multi_label('Accommodation Options', 'Accommodation_')
df_type_of_destination = encode_multi_label('Type of Destination', 'DestinationType_')



In [36]:
def make_dense_one_hot_encoder():
    """Build a OneHotEncoder that returns dense arrays on old and new scikit-learn.

    The `sparse` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
    the old spelling was removed in 1.4, where passing it raises TypeError.
    Try the current keyword first and fall back to the legacy one for releases
    that predate the rename.
    """
    try:
        return OneHotEncoder(sparse_output=False)
    except TypeError:
        # Older scikit-learn: `sparse_output` is not a known constructor argument.
        return OneHotEncoder(sparse=False)


# OneHotEncoder for single-valued categorical columns
ohe_region = make_dense_one_hot_encoder()
ohe_budget = make_dense_one_hot_encoder()

# Double brackets keep the input two-dimensional (a one-column DataFrame),
# which is the shape the encoder expects; the float output is cast to int.
region_encoded = ohe_region.fit_transform(df[['Region']]).astype(int)
budget_encoded = ohe_budget.fit_transform(df[['Budget']]).astype(int)

# Create DataFrames from the encoded arrays, indexed like `df` so the
# column-wise concat in the next cell aligns rows correctly.
df_region_encoded = pd.DataFrame(region_encoded, columns=ohe_region.get_feature_names_out(), index=df.index)
df_budget_encoded = pd.DataFrame(budget_encoded, columns=ohe_budget.get_feature_names_out(), index=df.index)



In [37]:
# Categorical source columns that are replaced by their indicator columns.
encoded_source_columns = [
    'Region',
    'Tourist Season',
    'Accommodation Options',
    'Type of Destination',
    'Budget',
]

# Join side by side: the remaining original columns first, then each block of
# indicator columns in a fixed order.
frames_to_join = [
    df.drop(columns=encoded_source_columns),
    df_tourist_season,
    df_accommodation_options,
    df_type_of_destination,
    df_region_encoded,
    df_budget_encoded,
]
df_encoded = pd.concat(frames_to_join, axis='columns')

In [39]:
# Preview the first five rows of the fully encoded frame.
df_encoded.head(n=5)

Unnamed: 0,City,Ratings,TouristSeason_Fall,TouristSeason_Spring,TouristSeason_Summer,TouristSeason_Winter,Accommodation_Ashrams,Accommodation_Camps,Accommodation_Guesthouses,Accommodation_Heritage Hotels,Accommodation_Homestays,Accommodation_Hotels,Accommodation_Houseboats,Accommodation_Monasteries,Accommodation_Resorts,DestinationType_Adventure,DestinationType_Backwaters,DestinationType_Beach,DestinationType_Coastal,DestinationType_Cultural,DestinationType_Desert,DestinationType_Heritage,DestinationType_Hill Station,DestinationType_Island,DestinationType_Nightlife,DestinationType_Pilgrimage,DestinationType_Romantic,DestinationType_Scenic,DestinationType_Skiing,DestinationType_Trekking,DestinationType_Urban,DestinationType_Wildlife,Region_Center,Region_East,Region_North,Region_South,Region_West,Budget_High,Budget_Low,Budget_Medium
0,Manali,4.5,1,1,1,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
1,Leh Ladakh,4.6,0,0,1,0,0,0,1,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0
2,Coorg,4.2,1,1,0,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
3,Andaman,4.5,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0
4,Lakshadweep,4.0,0,1,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0


In [41]:
# Write the encoded table as a comma-separated UTF-8 file; the row index is left out.
df_encoded.to_csv(
    "City_encoded.csv",
    index=False,
    encoding='utf-8',
    sep=',',
)