# Intorduction

For this particular notebook, I'll be using Scikit Learn package for modelling the regression task.

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error

### 

# Loading

In [2]:
train_data = pd.read_csv("Train.csv")
test_data = pd.read_csv("Test.csv")
variables = pd.read_csv("VariableDefinitions.csv")

### 

# Visualization and Cleaning

In [3]:
variables

Unnamed: 0,Column Name,Definition
0,id,Unique identifier for each tourist
1,country,The country a tourist coming from.
2,age_group,The age group of a tourist.
3,travel_with,The relation of people a tourist travel with t...
4,total_female,Total number of females
5,total_male,Total number of males
6,purpose,The purpose of visiting Tanzania
7,main_activity,The main activity of tourism in Tanzania
8,infor_source,The source of information about tourism in Tan...
9,tour_arrangment,The arrangment of visiting Tanzania


In [4]:
train_data.head(10)

Unnamed: 0,ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,...,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,payment_mode,first_trip_tz,most_impressing,total_cost
0,tour_0,SWIZERLAND,45-64,Friends/Relatives,1.0,1.0,Leisure and Holidays,Wildlife tourism,"Friends, relatives",Independent,...,No,No,No,No,13.0,0.0,Cash,No,Friendly People,674602.5
1,tour_10,UNITED KINGDOM,25-44,,1.0,0.0,Leisure and Holidays,Cultural tourism,others,Independent,...,No,No,No,No,14.0,7.0,Cash,Yes,"Wonderful Country, Landscape, Nature",3214906.5
2,tour_1000,UNITED KINGDOM,25-44,Alone,0.0,1.0,Visiting Friends and Relatives,Cultural tourism,"Friends, relatives",Independent,...,No,No,No,No,1.0,31.0,Cash,No,Excellent Experience,3315000.0
3,tour_1002,UNITED KINGDOM,25-44,Spouse,1.0,1.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,...,Yes,Yes,Yes,No,11.0,0.0,Cash,Yes,Friendly People,7790250.0
4,tour_1004,CHINA,1-24,,1.0,0.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Independent,...,No,No,No,No,7.0,4.0,Cash,Yes,No comments,1657500.0
5,tour_1005,UNITED KINGDOM,25-44,,0.0,1.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,...,No,Yes,Yes,No,9.0,3.0,Cash,Yes,Wildlife,120950.0
6,tour_1007,SOUTH AFRICA,45-64,Alone,0.0,1.0,Business,Mountain climbing,"Friends, relatives",Independent,...,No,No,No,No,9.0,0.0,Cash,Yes,Friendly People,466140.0
7,tour_1008,UNITED STATES OF AMERICA,45-64,Friends/Relatives,1.0,1.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,...,Yes,Yes,Yes,Yes,10.0,3.0,Cash,Yes,Friendly People,3480750.0
8,tour_101,NIGERIA,25-44,Alone,0.0,1.0,Leisure and Holidays,Cultural tourism,"Travel, agent, tour operator",Independent,...,No,No,No,No,4.0,0.0,Cash,Yes,,994500.0
9,tour_1011,INDIA,25-44,Alone,1.0,0.0,Business,Wildlife tourism,"Travel, agent, tour operator",Independent,...,No,No,No,No,5.0,0.0,Credit Card,Yes,Friendly People,2486250.0


In [5]:
test_data.head(2)

Unnamed: 0,ID,country,age_group,travel_with,total_female,total_male,purpose,main_activity,info_source,tour_arrangement,...,package_food,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,night_mainland,night_zanzibar,payment_mode,first_trip_tz,most_impressing
0,tour_1,AUSTRALIA,45-64,Spouse,1.0,1.0,Leisure and Holidays,Wildlife tourism,"Travel, agent, tour operator",Package Tour,...,Yes,Yes,Yes,Yes,Yes,10,3,Cash,Yes,Wildlife
1,tour_100,SOUTH AFRICA,25-44,Friends/Relatives,0.0,4.0,Business,Wildlife tourism,Tanzania Mission Abroad,Package Tour,...,No,No,No,No,No,13,0,Cash,No,"Wonderful Country, Landscape, Nature"


In [6]:
# data sizes
train_data.shape, test_data.shape

((4809, 23), (1601, 22))

In [7]:
# determine null values
data = pd.concat([train_data, test_data], sort=False).reset_index(drop=True)
data.isnull().sum()

ID                          0
country                     0
age_group                   0
travel_with              1441
total_female                4
total_male                  7
purpose                     0
main_activity               0
info_source                 0
tour_arrangement            0
package_transport_int       0
package_accomodation        0
package_food                0
package_transport_tz        0
package_sightseeing         0
package_guided_tour         0
package_insurance           0
night_mainland              0
night_zanzibar              0
payment_mode                0
first_trip_tz               0
most_impressing           424
total_cost               1601
dtype: int64

In [8]:
data["travel_with"].fillna("None", inplace=True)
data["total_female"].fillna(data["total_female"].median(), inplace=True)
data["total_male"].fillna(data["total_male"].median(), inplace=True)

# drop_list = ["info_source", "most_impressing", "package_transport_int", "package_accomodation", "package_food",
#             "package_transport_tz", "package_sightseeing", "package_guided_tour", "package_guided_tour", "package_sightseeing",
#             "package_insurance"]

# data = data.drop(labels=drop_list, axis=1)

In [9]:
data.isnull().sum()

ID                          0
country                     0
age_group                   0
travel_with                 0
total_female                0
total_male                  0
purpose                     0
main_activity               0
info_source                 0
tour_arrangement            0
package_transport_int       0
package_accomodation        0
package_food                0
package_transport_tz        0
package_sightseeing         0
package_guided_tour         0
package_insurance           0
night_mainland              0
night_zanzibar              0
payment_mode                0
first_trip_tz               0
most_impressing           424
total_cost               1601
dtype: int64

In [10]:
le = LabelEncoder()

data['age_group'] = le.fit_transform(data['age_group'])
data['package_transport_int'] = le.fit_transform(data['package_transport_int'])
data['package_accomodation'] = le.fit_transform(data['package_accomodation'])
data['package_food'] = le.fit_transform(data['package_food'])
data['package_transport_tz'] = le.fit_transform(data['package_transport_tz'])
data['package_sightseeing'] = le.fit_transform(data['package_sightseeing'])
data['package_guided_tour'] = le.fit_transform(data['package_guided_tour'])
data['package_insurance'] = le.fit_transform(data['package_insurance'])
data['first_trip_tz'] = le.fit_transform(data['first_trip_tz'])
data['country'] = le.fit_transform(data['country'])

In [11]:
columns_to_transform =  ['tour_arrangement','travel_with','purpose','main_activity','info_source','most_impressing','payment_mode']
data = pd.get_dummies( data, columns = columns_to_transform, drop_first = True)

In [12]:
## convert float dtypes to int
data["total_female"] = data['total_female'].astype('int')
data["total_male"] = data['total_male'].astype('int')
data["night_mainland"] = data['night_mainland'].astype('int')
data["night_zanzibar"] = data['night_zanzibar'].astype('int')

In [13]:
# feature engineering
data["total_persons"] = data["total_female"] + data["total_male"]
data["total_nights_spent"] = data["night_mainland"] + data["night_zanzibar"]

In [14]:
drop_list = ["total_female", "total_male", "night_mainland", "night_zanzibar"]
data = data.drop(drop_list, axis=1)

In [15]:
data.corr()

Unnamed: 0,country,age_group,package_transport_int,package_accomodation,package_food,package_transport_tz,package_sightseeing,package_guided_tour,package_insurance,first_trip_tz,...,most_impressing_Friendly People,most_impressing_Good service,most_impressing_No comments,most_impressing_Satisfies and Hope Come Back,"most_impressing_Wonderful Country, Landscape, Nature",payment_mode_Credit Card,payment_mode_Other,payment_mode_Travellers Cheque,total_persons,total_nights_spent
country,1.0,-0.00119,-0.13743,-0.122153,-0.11128,-0.095933,-0.059152,-0.087682,-0.08631,-0.107554,...,0.054493,0.001182,-0.006399,0.0058,-0.05142,-0.014538,-0.013994,-0.002723,-0.048549,-0.059733
age_group,-0.00119,1.0,0.177475,0.170416,0.161986,0.182584,0.190039,0.174089,0.105873,-0.044355,...,0.043239,-0.066067,-0.027234,0.028993,-0.045919,0.143437,0.020322,0.011682,0.068387,-0.124487
package_transport_int,-0.13743,0.177475,1.0,0.673831,0.624054,0.59127,0.416871,0.42423,0.4727,0.265946,...,-0.053244,-0.004379,-0.027798,0.000485,-0.001948,0.055507,0.007813,0.005388,0.163646,-0.026139
package_accomodation,-0.122153,0.170416,0.673831,1.0,0.923149,0.851816,0.711443,0.725059,0.44652,0.38611,...,-0.06377,0.005641,-0.092282,-0.003281,0.026191,0.026702,0.007391,-0.005804,0.1451,-0.029605
package_food,-0.11128,0.161986,0.624054,0.923149,1.0,0.824724,0.672716,0.700402,0.446935,0.371156,...,-0.065679,0.004603,-0.084786,0.001137,0.023235,0.024596,0.002991,-0.012765,0.125753,-0.03816
package_transport_tz,-0.095933,0.182584,0.59127,0.851816,0.824724,1.0,0.695979,0.697685,0.438216,0.355017,...,-0.047394,0.000344,-0.102809,-0.013398,0.029053,0.019647,0.006491,-0.010232,0.12713,-0.02783
package_sightseeing,-0.059152,0.190039,0.416871,0.711443,0.672716,0.695979,1.0,0.790568,0.378617,0.316334,...,-0.05507,0.004045,-0.120742,-0.008761,0.013236,0.017525,-4.1e-05,0.005121,0.116173,-0.00739
package_guided_tour,-0.087682,0.174089,0.42423,0.725059,0.700402,0.697685,0.790568,1.0,0.431917,0.325755,...,-0.060577,0.015182,-0.098899,-0.014889,0.017808,0.019779,-0.023452,0.013684,0.108819,-0.022938
package_insurance,-0.08631,0.105873,0.4727,0.44652,0.446935,0.438216,0.378617,0.431917,1.0,0.204417,...,-0.022841,-0.02759,-0.014795,-0.010557,0.020123,0.039326,0.000376,0.022138,0.082127,-0.017504
first_trip_tz,-0.107554,-0.044355,0.265946,0.38611,0.371156,0.355017,0.316334,0.325755,0.204417,1.0,...,-0.050768,0.004147,-0.078007,0.010933,0.074137,-0.020899,-0.006073,-0.004035,0.110222,0.048642


### 

# Splitting 

In [16]:
train_data = data[data["total_cost"].notnull()].reset_index(drop=True)
test_data = data[data["total_cost"].isna()].reset_index(drop=True)

In [17]:
train_data.shape, test_data.shape

((4809, 50), (1601, 50))

In [18]:
X = train_data.drop(["ID", "total_cost"], axis=1)
y = train_data["total_cost"]

In [19]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.25)
print(Xtrain.shape, ytrain.shape)
print(Xtest.shape, ytest.shape)

(3606, 48) (3606,)
(1203, 48) (1203,)


### 

# Modelling 

Model selection, training, and evaluation.

In [20]:
from sklearn.neighbors import KNeighborsRegressor

In [21]:
model = KNeighborsRegressor(n_neighbors=35, algorithm='ball_tree', leaf_size=50, p=1)
model.fit(Xtrain, ytrain)

KNeighborsRegressor(algorithm='ball_tree', leaf_size=50, n_neighbors=35, p=1)

In [22]:
predictions = model.predict(Xtest)

In [23]:
print(f"Mean Absolute Error {mean_absolute_error(ytest, predictions)}")

Mean Absolute Error 4906402.836385418


In [26]:
from xgboost import XGBRegressor

In [27]:
model = XGBRegressor(n_estimators=200, learning_rate=0.02, max_depth=5)
model.fit(Xtrain, ytrain)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.02, max_delta_step=0, max_depth=5,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=200, n_jobs=4, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [28]:
predictions = model.predict(Xtest)

In [29]:
print(f"Mean Absolute Error {mean_absolute_error(ytest, predictions)}")

Mean Absolute Error 4673251.295845802


### 

# Submission 

In [30]:
test = test_data.drop(["ID", "total_cost"], axis=1)
preds = model.predict(test)
submission = pd.DataFrame({"ID": test_data["ID"], "total_cost": preds})

In [31]:
submission.to_csv("Submission.csv", index=False)