In [1]:
# Load libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the training features, training labels and test features from the
# notebook's working directory (one CSV per frame, in that order).
train_x, train_y, test_x = (
    pd.read_csv(csv_path)
    for csv_path in (
        './airbnb_train_x.csv',
        './airbnb_train_y.csv',
        './airbnb_test_x.csv',
    )
)

In [3]:
# Join the training y to the training x file.
# pd.concat(axis=1) places the label columns next to the feature columns,
# aligning rows on the index of the two frames.
train = pd.concat([train_x, train_y], axis=1)

# Also turn the target variables into categorical
train = train.astype({
    'perfect_rating_score': 'category',
    'high_booking_rate': 'category',
})

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would only append a stray "None" line to the output.
train.info()

# Last expression of the cell: rendered as the summary table
train.describe(include='all')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92067 entries, 0 to 92066
Data columns (total 63 columns):
 #   Column                     Non-Null Count  Dtype   
---  ------                     --------------  -----   
 0   name                       92048 non-null  object  
 1   summary                    88816 non-null  object  
 2   space                      72442 non-null  object  
 3   description                92048 non-null  object  
 4   experiences_offered        92067 non-null  object  
 5   neighborhood_overview      62428 non-null  object  
 6   notes                      45747 non-null  object  
 7   transit                    64335 non-null  object  
 8   access                     58404 non-null  object  
 9   interaction                56661 non-null  object  
 10  house_rules                64234 non-null  object  
 11  host_name                  91898 non-null  object  
 12  host_since                 91898 non-null  object  
 13  host_location              9165

Unnamed: 0,name,summary,space,description,experiences_offered,neighborhood_overview,notes,transit,access,interaction,...,availability_60,availability_90,availability_365,first_review,license,jurisdiction_names,cancellation_policy,features,high_booking_rate,perfect_rating_score
count,92048,88816,72442,92048,92067,62428,45747,64335,58404,56661,...,92067.0,92067.0,92067.0,92067,8148,45081,92067,92041,92067,92067
unique,90781,85231,69691,90398,1,56360,41178,58282,52677,50267,...,,,,2566,2877,25,6,119,2,2
top,Private room,"My place is good for couples, solo adventurers...",Every booking is instantly confirmed. Every ca...,"My place is good for couples, solo adventurers...",none,La Jolla California is an enchanting seaside c...,All of our rentals are fully licensed and regu...,Convenient public transportation. The location...,All of our properties utilize our streamlined ...,"Our concierge is available by phone, email, or...",...,,,,2017-01-01,City registration pending,"City of Los Angeles, CA",strict,"Host Has Profile Pic,Host Identity Verified,Is...",NO,NO
freq,12,64,68,36,92067,42,149,134,94,72,...,,,,449,4621,14356,43312,29807,73405,65620
mean,,,,,,,,,,,...,24.783516,41.474405,179.69141,,,,,,,
std,,,,,,,,,,,...,21.320198,32.314087,137.535477,,,,,,,
min,,,,,,,,,,,...,0.0,0.0,0.0,,,,,,,
25%,,,,,,,,,,,...,2.0,7.0,40.0,,,,,,,
50%,,,,,,,,,,,...,22.0,42.0,169.0,,,,,,,
75%,,,,,,,,,,,...,44.0,71.0,322.0,,,,,,,


In [4]:
# EXAMPLE PREDICTIONS FOR CONTEST 1

# Drop training data with missing label
orig_size = train.shape[0]
train_perfect = train[train['perfect_rating_score'].notna()]
print(f'Dropped {orig_size - train_perfect.shape[0]} rows with missing label.')

# Create a simple model to predict perfect_rating_score and generate predictions in the test data.
# .copy() makes X_perfect an independent frame, so the fillna assignment below
# is a plain column update rather than a write into a selection of `train`
# (which raises SettingWithCopyWarning on pandas versions without copy-on-write).
X_perfect = train_perfect[['accommodates']].copy()

# Convert YES/NO to 1/0 (integer dtype instead of a categorical holding 0/1)
y_perfect = train_perfect['perfect_rating_score'].map({'NO': 0, 'YES': 1}).astype(int)

# Fill missing values with the average accommodates of the training rows
mean_accom = X_perfect['accommodates'].mean()
X_perfect['accommodates'] = X_perfect['accommodates'].fillna(mean_accom)

# Impute the test feature on a separate frame: test_x itself is left untouched,
# so this cell can be re-run, or run in any order relative to the other
# contest cell, without changing what that cell sees.
X_test_perfect = test_x[['accommodates']].fillna(mean_accom)

model_perfect = LogisticRegression()
model_perfect.fit(X_perfect, y_perfect)
# Column 1 of predict_proba is the probability of class 1, i.e. 'YES'
probs_perfect = model_perfect.predict_proba(X_test_perfect)[:, 1]

# Make binary classifications (make sure to check for NAs!)
# Probability cutoff above which a listing is labelled "YES".
# NOTE(review): 0.29 is the value used by the example; presumably it should be
# tuned against the contest's TPR/FPR requirement.
PERFECT_THRESHOLD = 0.29
classifications_perfect = np.where(probs_perfect > PERFECT_THRESHOLD, "YES", "NO")
classifications_perfect = pd.Series(classifications_perfect).rename('x')
assert not pd.isnull(classifications_perfect).any()
# Make sure you have prediction for all rows in the test data
assert classifications_perfect.shape[0] == test_x.shape[0]
print(classifications_perfect.value_counts())
print('*'*80)
classifications_perfect

Dropped 0 rows with missing label.
x
NO     5003
YES    4997
Name: count, dtype: int64
********************************************************************************


0        NO
1        NO
2       YES
3       YES
4       YES
       ... 
9995    YES
9996    YES
9997    YES
9998     NO
9999    YES
Name: x, Length: 10000, dtype: object

In [7]:
# EXAMPLE PREDICTIONS FOR CONTEST 2

# Drop training data with missing label
orig_size = train.shape[0]
train_rate = train[train['high_booking_rate'].notna()]
print(f'Dropped {orig_size - train_rate.shape[0]} rows with missing label.')

# Create a simple model to predict high_booking_rate and generate predictions in the test data.
# .copy() makes X_rate an independent frame, so the fillna assignment below
# is a plain column update rather than a write into a selection of `train`
# (which raises SettingWithCopyWarning on pandas versions without copy-on-write).
X_rate = train_rate[['accommodates']].copy()

# Convert YES/NO to 1/0 (integer dtype instead of a categorical holding 0/1)
y_rate = train_rate['high_booking_rate'].map({'NO': 0, 'YES': 1}).astype(int)

# Fill missing values with the average accommodates of the training rows
mean_accom = X_rate['accommodates'].mean()
X_rate['accommodates'] = X_rate['accommodates'].fillna(mean_accom)

# Impute the test feature on a separate frame instead of writing into test_x.
# If test_x had already been filled in place by another cell, a fillna here
# would silently do nothing and this model would inherit that cell's mean.
X_test_rate = test_x[['accommodates']].fillna(mean_accom)

model_rate = LogisticRegression()
model_rate.fit(X_rate, y_rate)
# Column 1 of predict_proba is the probability of class 1, i.e. 'YES'
probs_rate = model_rate.predict_proba(X_test_rate)[:, 1]

# The submission column must be named x
probs_rate = pd.Series(probs_rate).rename('x')
assert not pd.isnull(probs_rate).any()
# Make sure there is a prediction for every row in the test data
assert probs_rate.shape[0] == test_x.shape[0]
print('*'*80)
probs_rate

Dropped 0 rows with missing label.
********************************************************************************


0       0.207281
1       0.201856
2       0.200072
3       0.198299
4       0.200072
          ...   
9995    0.198299
9996    0.198299
9997    0.200072
9998    0.210957
9999    0.200072
Name: x, Length: 10000, dtype: float64

In [8]:
# Write the predictions to disk.
# Required submission format:
#   * a .csv file named targetvariable_groupAAA.csv, where targetvariable is
#     the chosen target and AAA is the group name
#   * rows in exactly the same order as the test_x file
#   * the first row is the header, with the single column name x
#
# perfect_rating_score: each row is a binary YES (is perfect) or NO (not perfect)
# high_booking_rate:    each row is the probability of high_booking_rate = YES

classifications_perfect.to_csv(
    "./perfect_rating_score_group0.csv",
    header=True,
    index=False,
)
probs_rate.to_csv(
    "high_booking_rate_group0.csv",
    header=True,
    index=False,
)

# Reference results for these sample predictions, evaluated against the test set:
#   perfect_rating_score: TPR = 0.5244 and FPR = 0.4894 (so they would be disqualified!)
#   high_booking_rate:    AUC = 0.525