#                                         Negassi Tesfay
##                                                 Airbnb Data Challenge


## Predictive Modeling - Classification

In [1]:
#!pip install vaderSentiment
#!pip install textblob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import GridSearchCV
# from sklearn.metrics import make_scorer
# from sklearn.metrics import mean_squared_error
# from math import sqrt
import warnings
warnings.filterwarnings('ignore')

In [2]:
# List the working directory to check which notebooks and CSV files are present.
!ls

airbnb_data_wrangling.html
airbnb_data_wrangling.ipynb
airbnb_eda.html
airbnb_eda.ipynb
aribnb_model_Regreassion.ipynb
aribnb_model_classification.ipynb
clean_listings.csv
listings_ab.csv
requirements.yaml


### Load Data <br>

- The cleaned data produced by the data-wrangling step will be loaded. <br>
- The raw data will also be loaded to bring back the text (`description`) column that was removed during cleaning. It will be used for NLP processing / sentiment analysis.
- Convert `availability_30` to binary (1 = available, 0 = not available).



In [3]:
# Cleaned listings produced by the data-wrangling notebook.
df_clean = pd.read_csv('clean_listings.csv', low_memory=False)

# Raw listings: only the id (join key) and the free-text description are
# needed for sentiment analysis, so ask the parser for just those two columns
# instead of materialising the whole file and discarding the rest.
# The trailing selection pins the column order regardless of the file layout.
df_senti = pd.read_csv(
    'listings_ab.csv',
    usecols=['id', 'description'],
    low_memory=False,
)[['id', 'description']]

In [4]:
# Binarise the response: 0 stays 0, every other value becomes 1.
df_clean['availability_30'] = df_clean['availability_30'].ne(0).astype('int64')

### Merge the two data sets with a left join (keep only the rows present in the cleaned data)

In [5]:
# Left join on the listing id so every cleaned row is kept and gains its
# description from the raw file.
df_all = df_clean.merge(df_senti, how='left', on='id')
df_all.shape

(12679, 47)

### Convert the description column into a sentiment score

In [6]:
analyser = SentimentIntensityAnalyzer()


def sentiment_analyzer_scores(sentence):
    """Return the VADER 'compound' score for ``sentence``.

    Parameters
    ----------
    sentence : str or any
        Text to score. Non-string values (for example the NaN pandas uses
        for a missing description) cannot be scored.

    Returns
    -------
    float or None
        The 'compound' entry of ``analyser.polarity_scores(sentence)``, or
        ``None`` when ``sentence`` is not a string.
    """
    # Explicit guard instead of a bare ``except``: missing descriptions still
    # map to None, but genuine errors (typos, a broken analyser, Ctrl-C) are
    # no longer swallowed and turned into a silently empty feature.
    if not isinstance(sentence, str):
        return None
    return analyser.polarity_scores(sentence)['compound']


sentiment = df_all['description'].apply(sentiment_analyzer_scores)

df_all['sentiment'] = sentiment

### Split Columns <br>

Columns were split into:

- Categorical and numeric columns, for imputation purposes (if there are any missing values) and to be used as feature variables for modeling.
- Id column
- Description (to be removed)
- Response variable ('availability_30')


In [7]:
response = 'availability_30'
id_var = 'id'
sent_var = 'description'

# Everything except the id, the raw text and the response is a model feature.
_excluded = (id_var, sent_var, response)
_features = [name for name in df_all.columns if name not in _excluded]

# Object-typed features are treated as categorical, all others as numeric;
# both lists keep the column order of df_all.
categoricalVariables = [name for name in _features if df_all[name].dtypes == 'object']
numericVariables = [name for name in _features if not (df_all[name].dtypes == 'object')]


### Pipeline to impute missing values and encode categorical variables

In [8]:
# Numeric features: fill missing values with the column median.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),
])

# Categorical features: fill missing values with the most frequent level,
# then one-hot encode. The encoder keeps its default output format; the
# conversion to a dense array is requested on the ColumnTransformer below,
# which avoids the `sparse=` keyword that newer scikit-learn releases no
# longer accept.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ("ohe", OneHotEncoder()),
])

# sparse_threshold=0 makes the combined output always dense.
full_pipeline = ColumnTransformer(
    [
        ("num", num_pipeline, numericVariables),
        ("cat", cat_pipeline, categoricalVariables),
    ],
    sparse_threshold=0,
)

# NOTE(review): the transformer is fitted on every row, before the train/test
# split, so medians, modes and category levels are learned from rows that
# later end up in the test set. Consider fitting on the training rows only.
df_model = full_pipeline.fit_transform(df_all[numericVariables + categoricalVariables])


In [9]:
# Shapes before and after preprocessing: the row count should be unchanged.
for _frame in (df_all, df_model):
    print(_frame.shape)

(12679, 48)
(12679, 156)


### Split Train and Test datasets 

In [10]:
# Hold out 20% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df_model,
    df_all[response],
    random_state=1234,
    test_size=0.2,
)

In [11]:
# Shapes of the train and test partitions, features first, then labels.
for _part in (X_train, y_train, X_test, y_test):
    print(_part.shape)

(10143, 156)
(10143,)
(2536, 156)
(2536,)


### Candidate models <br>

The following models will be tried for prediction.<br>

- Decision Tree Classifier
- Random Forest Classifier
- Gradient Boost Classifier
- Adaboost Classifier



### GridSearch / Hyperparameter tuning<br>
To fine-tune our models, we will iterate through different values of the parameters of each model. The following is the parameter grid for the different models, stored in a dictionary.



In [12]:
# Hyper-parameter search space for each candidate model, keyed by the same
# names used in the `models` dictionary.
#
# "auto" is deliberately not listed under max_features: for these classifiers
# it was an alias of "sqrt", so searching both only repeated the same
# configuration, and recent scikit-learn releases reject the value outright.
param_grid = {
    'tree_clf': {
        'criterion': ["gini", "entropy"],
        'max_features': ["sqrt", "log2"],
    },

    'forest_clf': {
        'n_estimators': [3, 10, 30],
        'max_features': ["sqrt", "log2"],
        'bootstrap': [False, True],
    },

    'gbm': {
        'n_estimators': [3, 10, 30],
        'max_features': [2, 3, 4],
    },

    'ada': {
        'n_estimators': [10, 50, 100],
    },
}



In [13]:
# Candidate learners. Each one is seeded (same value as the train/test split)
# so that the grid search and the reported scores are repeatable across
# kernel restarts.
tree_clf = DecisionTreeClassifier(random_state=1234)
forest_clf = RandomForestClassifier(random_state=1234)
gbm = GradientBoostingClassifier(random_state=1234)
adaBoost = AdaBoostClassifier(random_state=1234)

In [16]:
# Map each grid name to its untuned estimator.
models = {
    "gbm": gbm,
    "forest_clf": forest_clf,
    "tree_clf": tree_clf,
    "ada": adaBoost,
}

# Run a 5-fold grid search per model and keep the fitted search object.
best_models = {}
for name, estimator in models.items():
    print('%s loading ...' % name)
    search = GridSearchCV(estimator, param_grid[name], cv=5)
    best_models[name] = search.fit(X_train, y_train)
    

gbm loading ...
forest_clf loading ...
tree_clf loading ...
ada loading ...


The above procedure tunes the hyperparameters of each learner and stores the resulting models in a dictionary. What follows is a comparison of the different learners to choose a champion model. For this purpose, I will use accuracy as the metric.

In [18]:
# Compare the tuned models on train and test accuracy.
# Every value in best_models is a GridSearchCV that was already fitted in the
# tuning cell, so it is scored directly; fitting it again here would repeat
# the entire cross-validated search for each model.
print ('%20s %20s %20s ' % ('Model','Train','Test'))
for m, m_ in best_models.items():
    score_train = m_.score(X_train, y_train)
    score_test = m_.score(X_test, y_test)
    print ('%20s %20s %20s' % (m, round(score_train,3),round(score_test,3)))

               Model                Train                 Test 
                 gbm                0.711                0.705
          forest_clf                  1.0                0.772
            tree_clf                  1.0                0.689
                 ada                0.779                0.777


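Although forest_clf has one of the highest test accuracies (0.772), its perfect training accuracy (1.0) shows that it overfits, so it may not generalise well. Consequently, I will choose AdaBoost, which has the highest test accuracy (0.777) and nearly identical train and test scores, as the champion model.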