### Libraries

In [17]:
import pandas as pd
import numpy as np
import json

### Helper functions

In [18]:
def get_file_path(base_folder_path, file_name):
    """
    Build the path of a file located inside a base folder.

    parameters:
        base_folder_path: folder that contains the file, with or without a
            trailing path separator (string or path-like object)
        file_name: name of the file inside that folder
    return: the combined path as a string. A "/" is inserted between the two
        parts only when the folder does not already end with a separator, so
        "../source" and "../source/" give the same result. An empty folder
        returns file_name unchanged.
    """
    import os

    folder = os.fspath(base_folder_path)
    name = os.fspath(file_name)
    # Plain concatenation produced "../sourcefile.csv" when the caller left
    # out the trailing slash, so add the separator whenever it is missing.
    if folder and not folder.endswith(("/", "\\")):
        folder += "/"
    return folder + name

def get_data_frame_from_csv(file_path):
    """
    Load a csv file into a pandas data frame.
    parameters: file_path - location of the csv file, passed straight to pd.read_csv
    return: pandas data frame produced by pd.read_csv with its default options
    """
    data_frame = pd.read_csv(file_path)
    return data_frame


### Data load

In [19]:
# Folder that holds the source csv files
base_folder = "../source/"
# Full path of the exhibitors file
exhibitors_path = get_file_path(base_folder_path=base_folder, file_name="exhibitors.csv")
# Full path of the exhibitor categories file
exhibitors_categories_path = get_file_path(
    base_folder_path=base_folder,
    file_name="exhibitor_categories.csv",
)

In [20]:
# Exhibitors data frame
exhibitors = get_data_frame_from_csv(file_path=exhibitors_path)
# Exhibitor categories data frame
exhibitors_categories = get_data_frame_from_csv(file_path=exhibitors_categories_path)

#### Data cleaning and processing

In [21]:
# Preview the first five exhibitor rows
exhibitors.head(n=5)

Unnamed: 0,exhibitorid,Name,MainCategories
0,90556,Turkey Travels,52276|52280|52281
1,92462,Russian Travel Company,52273|52283|52289|52291|52298|52302
2,92491,Indian Travel Company,52273|52274|52281
3,92492,Asia Tourism,52272|52276|52352|52358
4,92493,SriLanka Adventures,52296|52327|52352


#### Observations
- There are three features
- exhibitorid:- It is the unique identifier for an exhibitor
- Name:- Exhibitor name
- MainCategories:- This is a string in which the category ids are separated by |

In [22]:
# Count the missing values in each exhibitors column
exhibitors.isnull().sum()

exhibitorid       0
Name              0
MainCategories    0
dtype: int64

#### Observations
- No column has null values

In [23]:
# Preview the first five exhibitor category rows
exhibitors_categories.head(n=5)

Unnamed: 0,categoryId,categoryName
0,52271,1. Accomodation providers
1,52272,1.1 Hotel / Hotel chain / Inn
2,52273,1.2 Apartments / Residential hotel
3,52274,1.3 Hostel / Motel
4,52275,1.4 Boarding house


#### Observations
- Exhibitor categories has two features
- categoryId:- Uniquely identifies the category
- categoryName:- Some category names contain a forward slash (/), which we need to remove

In [24]:
# Count the missing values in each exhibitor categories column
exhibitors_categories.isnull().sum()

categoryId      0
categoryName    0
dtype: int64

In [25]:
# Split the pipe(|) separated category ids into lists.
# Only values that are still raw strings are split: the column is overwritten
# in place, so on a second run of this cell the values are already lists and
# `.str.split` would turn them into NaN, making the int cast below fail.
exhibitors["MainCategories"] = exhibitors["MainCategories"].map(
    lambda value: value.split("|") if isinstance(value, str) else value
)
# Explode the lists into one row per (exhibitor, category id) and cast the
# exploded ids to int. The source row's index label is repeated on each row.
exhibitors_explode = exhibitors.explode("MainCategories").astype({"MainCategories": int})
exhibitors_explode.head()

Unnamed: 0,exhibitorid,Name,MainCategories
0,90556,Turkey Travels,52276
0,90556,Turkey Travels,52280
0,90556,Turkey Travels,52281
1,92462,Russian Travel Company,52273
1,92462,Russian Travel Company,52283


In [26]:
# Attach the category details to every exploded exhibitor row.
# A left join keeps all exhibitor rows, matching MainCategories to categoryId.
exhibitors_final = pd.merge(
    exhibitors_explode,
    exhibitors_categories,
    how="left",
    left_on="MainCategories",
    right_on="categoryId",
)
exhibitors_final.head()

Unnamed: 0,exhibitorid,Name,MainCategories,categoryId,categoryName
0,90556,Turkey Travels,52276,52276,1.5 Resort hotel
1,90556,Turkey Travels,52280,52280,2.1 Inbound tour operator
2,90556,Turkey Travels,52281,52281,2.2 Outbound tour operator
3,92462,Russian Travel Company,52273,52273,1.2 Apartments / Residential hotel
4,92462,Russian Travel Company,52283,52283,2.4 Mass market tour operators


In [27]:
# Replace the forward slash (/) in category names with a space.
# The vectorised .str accessor leaves missing values as NaN, whereas calling
# x.replace on every element raises AttributeError as soon as a name is NaN
# (which a left merge produces for category ids without a match).
# regex=False makes "/" a literal string rather than a pattern.
exhibitors_final["categoryName"] = exhibitors_final["categoryName"].str.replace("/", " ", regex=False)
exhibitors_final.head()

Unnamed: 0,exhibitorid,Name,MainCategories,categoryId,categoryName
0,90556,Turkey Travels,52276,52276,1.5 Resort hotel
1,90556,Turkey Travels,52280,52280,2.1 Inbound tour operator
2,90556,Turkey Travels,52281,52281,2.2 Outbound tour operator
3,92462,Russian Travel Company,52273,52273,1.2 Apartments Residential hotel
4,92462,Russian Travel Company,52283,52283,2.4 Mass market tour operators


In [28]:
# Drop the MainCategories column (the left-hand merge key).
# The frame is rebound instead of mutated in place, and errors="ignore" keeps
# the cell re-runnable: an in-place drop raises KeyError on a second run
# because the column is already gone.
exhibitors_final = exhibitors_final.drop(columns=["MainCategories"], errors="ignore")
exhibitors_final.head()

Unnamed: 0,exhibitorid,Name,categoryId,categoryName
0,90556,Turkey Travels,52276,1.5 Resort hotel
1,90556,Turkey Travels,52280,2.1 Inbound tour operator
2,90556,Turkey Travels,52281,2.2 Outbound tour operator
3,92462,Russian Travel Company,52273,1.2 Apartments Residential hotel
4,92462,Russian Travel Company,52283,2.4 Mass market tour operators


In [29]:
# Count the missing values in each column of the final data frame
exhibitors_final.isnull().sum()

exhibitorid     0
Name            0
categoryId      0
categoryName    0
dtype: int64