#                                         Negassi Tesfay
##                                                 Airbnb Data Challenge


## Predictive Modeling

In [1]:
#!pip install vaderSentiment
#!pip install textblob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.model_selection import train_test_split


from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
from math import sqrt

import warnings
warnings.filterwarnings('ignore')

In [2]:
!ls

airbnb_data_wrangling.html
airbnb_data_wrangling.ipynb
airbnb_eda.html
airbnb_eda.ipynb
aribnb_model_Regreassion.ipynb
aribnb_model_classification.html
aribnb_model_classification.ipynb
clean_listings.csv
listings_ab.csv
requirements.yaml


### Load Data <br>

- Cleaned data from data wrangling will be loaded <br>
- The raw data will also be loaded to bring back the text (`description`) column that was removed during cleaning. It will be used for NLP processing / sentiment analysis.



In [3]:
# Cleaned listings produced by the data-wrangling step.
df_clean = pd.read_csv('clean_listings.csv', low_memory=False)

# Only `id` and `description` are needed from the raw listings for sentiment
# analysis, so restrict parsing to those two columns instead of loading the
# whole file and discarding the rest. The trailing selection pins column order.
senti_cols = ['id', 'description']
df_senti = pd.read_csv('listings_ab.csv', usecols=senti_cols, low_memory=False)[senti_cols]

### Merge the two data sets with a left join (keep only the rows present in the cleaned data)

In [4]:
# Left join: keep every cleaned listing and attach its raw description by id.
df_all = df_clean.merge(df_senti, how='left', on='id')
df_all.shape

(12679, 47)

### Convert the description column into a sentiment score

In [5]:
# VADER analyzer instance used by sentiment_analyzer_scores below.
analyser = SentimentIntensityAnalyzer()

def sentiment_analyzer_scores(sentence):
    """Return the VADER compound sentiment score for a piece of text.

    Parameters
    ----------
    sentence : str
        Text to score. Any non-string value (for example NaN or None for a
        missing description) is treated as "nothing to score".

    Returns
    -------
    float or None
        The ``'compound'`` entry of ``analyser.polarity_scores(sentence)``,
        or ``None`` when `sentence` is not a string or scoring fails.
    """
    # Explicit guard instead of relying on an exception for missing text.
    if not isinstance(sentence, str):
        return None
    try:
        return analyser.polarity_scores(sentence)['compound']
    except Exception:
        # Best-effort scoring is kept, but unlike a bare `except:` this does
        # not swallow KeyboardInterrupt/SystemExit, so a long `.apply` run
        # can still be interrupted.
        return None
    

# Score every listing description and keep the result as a new feature column.
sentiment = df_all['description'].apply(sentiment_analyzer_scores)
df_all['sentiment'] = sentiment

### Split Columns <br>

Columns are split into:

- Categorical and numeric columns, for imputation purposes (if there are any missing values); these are used as feature variables for modeling.
- Id column
- description (to be removed)
- Response variable (set the `response` variable to price or availability to predict one or the other)


In [6]:
response = 'price'
id_var = 'id'
sent_var = 'description'

# The identifier, the raw text and the target are not model features.
non_features = (id_var, sent_var, response)
feature_cols = [name for name in df_all.columns if name not in non_features]

# Object-dtype columns are treated as categorical, everything else as numeric.
categoricalVariables = [
    name for name in feature_cols if df_all[name].dtypes == 'object'
]
numericVariables = [
    name for name in feature_cols if not (df_all[name].dtypes == 'object')
]


### Pipeline to impute missing values and encode categorical variables

In [7]:
# Numeric features: fill gaps with the column median.
num_pipeline = Pipeline(steps=[('imputer', SimpleImputer(strategy="median"))])

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode into a dense array.
cat_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ("ohe", OneHotEncoder(sparse=False)),
])

# Route each column group through its own pipeline.
full_pipeline = ColumnTransformer(transformers=[
    ("num", num_pipeline, numericVariables),
    ("cat", cat_pipeline, categoricalVariables),
])

# NOTE(review): the transformer is fitted on the full data set, i.e. before the
# train/test split performed later, so imputation statistics and the category
# set also see the test rows. Confirm this is acceptable.
feature_columns = numericVariables + categoricalVariables
df_model = full_pipeline.fit_transform(df_all[feature_columns])


In [8]:
# Shape before and after the preprocessing pipeline.
for frame in (df_all, df_model):
    print(frame.shape)

(12679, 48)
(12679, 156)


### Split Train and Test datasets 

In [9]:
# Hold out 20% of the rows for testing; the seed is fixed for reproducibility.
features, target = df_model, df_all[response]
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=1234
)

In [10]:
# Shapes of the train/test features and targets.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(10143, 156)
(10143,)
(2536, 156)
(2536,)


### Model performance Metrics <br>
Model performance will be evaluated with the Root Mean Squared Error (RMSE). The following function calculates RMSE and registers it as the scoring metric. A lower RMSE is better.

In [11]:
def my_rmse(y_actual, y_predicted):
    """Root mean squared error between observed and predicted values."""
    mse = mean_squared_error(y_actual, y_predicted)
    return sqrt(mse)

# Wrap my_rmse as a scikit-learn scorer; greater_is_better=False marks it as a loss to minimise.
my_scorer = make_scorer(my_rmse, greater_is_better=False)

### Candidate models <br>

The following models will be tried for prediction.<br>
- Linear Regression
- Decision Tree Regression
- Random Forest
- Gradient Boosting Regressor
- AdaBoost Regressor



In [12]:
# Candidate estimators. Every stochastic model gets the same fixed seed so a
# fresh run of the notebook reproduces the same fits.
lin_reg = LinearRegression()
tree_reg = DecisionTreeRegressor(random_state=42)
forest_reg = RandomForestRegressor(random_state=42)
# Previously unseeded: the grid search subsamples features (max_features),
# which made the gradient-boosting results vary from run to run.
gbm = GradientBoostingRegressor(random_state=42)
# n_estimators here is only a placeholder; the grid search overrides it.
adaBoost = AdaBoostRegressor(random_state=42, n_estimators=1)

In [13]:
# Quick sanity check: fit a plain linear model before running the grid search.
test_reg = LinearRegression()
test_reg.fit(X=X_train, y=y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [14]:
# Peek at the first ten predictions on the held-out data.
pred = test_reg.predict(X_test)
pred[:10]

array([154.25153562, 220.90231534,  35.75098103, 176.7032797 ,
       114.80182859, 186.83261319, 146.6718237 , 180.40326139,
       148.02728482, 166.06463071])

### Grid search / hyperparameter tuning<br>
To fine-tune our models, we will iterate through different values of each model's parameters. The parameter grids of the different models are stored below in a dictionary.



In [15]:
# Hyper-parameter grid per candidate model, keyed by the same names used in
# the `models` dictionary.
# NOTE(review): `normalize`, the 'mse' criterion and max_features='auto' have
# been removed in newer scikit-learn releases; confirm the pinned version.
param_grid = dict(
    lin_reg=dict(
        normalize=[True, False],
    ),
    tree_reg=dict(
        criterion=['mse', "friedman_mse"],
        max_features=["auto", "sqrt", "log2"],
    ),
    forest_reg=dict(
        n_estimators=[3, 10, 30],
        max_features=[2, 4, 6, 8],
        bootstrap=[False],
    ),
    gbm=dict(
        n_estimators=[3, 10, 30],
        max_features=[2, 3, 4],
    ),
    ada=dict(
        n_estimators=[3, 10, 30],
    ),
)



In [16]:
# Candidate estimators keyed by the names used in `param_grid`.
models = {
    "gbm": gbm,
    "forest_reg": forest_reg,
    "tree_reg": tree_reg,
    "lin_reg": lin_reg,
    "ada": adaBoost,
}

# Run a 10-fold grid search per model and keep each fitted search object.
best_models = {}
for name, estimator in models.items():
    print('%s loading ...' % name)
    grid = GridSearchCV(estimator, param_grid[name], cv=10, scoring=my_scorer)
    grid.fit(X_train, y_train)
    best_models[name] = grid

gbm loading ...
forest_reg loading ...
tree_reg loading ...
lin_reg loading ...
ada loading ...


The above procedure tunes the hyperparameters of each learner and puts the resulting models in a dictionary. What follows is a comparison of the different learners to choose a champion model. For this purpose, I will use RMSE as the metric.

In [17]:
# Compare the tuned models. `best_score_` is the mean cross-validated score of
# the best parameter setting, not a training-set score, so the column is
# labelled 'CV' rather than 'Train'.
print('%20s %20s %20s' % ('Model', 'CV', 'Test'))
for name, search in best_models.items():
    # my_scorer was built with greater_is_better=False, so take the absolute value.
    rmse_cv = np.abs(search.best_score_)
    test_pred = search.predict(X_test)
    # Argument order follows my_rmse(y_actual, y_predicted).
    rmse_test = my_rmse(y_test, test_pred)
    print('%20s %20s %20s' % (name, rmse_cv, rmse_test))

               Model                Train                 Test
                 gbm    75.15987512697652    72.44260554609002
          forest_reg    64.40527507562089    61.85268193956737
            tree_reg    88.99704449728222    87.57088780170353
             lin_reg   225440.26058569085     92864.2477784006
                 ada    75.13746385587531    71.92250680116622


The random forest model performed better than the other models, as it predicted the test data with the lowest root mean squared error. Hence I would choose this model.