In [85]:
import nltk
import string
import pandas as pd
from sklearn.svm import SVC
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report , confusion_matrix

In [65]:
#Loading The Dataset
data = pd.read_csv("Abnb_paris.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,reviews,savwish,logreviews,logsavwish,price,rating,accommodates,extpeop,min_stay,sentiment,secdep,cleanfee,weekfee,monthfee,bedroom,bathroom,beds,review_text
0,1,10,1277,1.041393,3.106531,185,4.5,3,1,5,3.208943,1,1,1,1,1,1,1,Even if i stayed at another apartment the stay...
1,2,8,279,0.954243,2.447158,100,5.0,2,0,3,2.245883,0,0,0,0,1,1,1,"Exactly what you can see on the pictures, plus..."
2,3,31,434,1.50515,2.638489,512,5.0,10,1,3,2.509137,1,0,1,1,4,3,5,We were five colleagues staying in Lea's apart...
3,4,65,784,1.819544,2.89487,92,4.5,2,0,2,2.864322,0,0,1,1,1,1,1,"It was perfect! The location is great, easy to..."
4,5,9,621,1.0,2.79379,71,4.5,4,1,1,1.892305,1,1,0,0,2,1,1,"Saras flat is nice and clean, she was the perf..."


**Data Pre-Processing**

In [57]:
#Null Values
data.isna().sum()

Unnamed: 0      0
reviews         0
savwish         0
logreviews      0
logsavwish      0
price           0
rating          0
accommodates    0
extpeop         0
min_stay        0
sentiment       0
secdep          0
cleanfee        0
weekfee         0
monthfee        0
bedroom         0
bathroom        0
beds            0
review_text     0
dtype: int64

In [66]:
def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [token for token in tokens if token not in string.punctuation]  
    tokens = [token for token in tokens if token not in stopwords.words('english')]  

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

data['review_text'] = data['review_text'].apply(preprocess_text)

In [67]:
data.head()

Unnamed: 0.1,Unnamed: 0,reviews,savwish,logreviews,logsavwish,price,rating,accommodates,extpeop,min_stay,sentiment,secdep,cleanfee,weekfee,monthfee,bedroom,bathroom,beds,review_text
0,1,10,1277,1.041393,3.106531,185,4.5,3,1,5,3.208943,1,1,1,1,1,1,1,even stayed another apartment stay perfect nic...
1,2,8,279,0.954243,2.447158,100,5.0,2,0,3,2.245883,0,0,0,0,1,1,1,exactly see picture plus tiny worthwhile balco...
2,3,31,434,1.50515,2.638489,512,5.0,10,1,3,2.509137,1,0,1,1,4,3,5,five colleague staying lea 's apartment traini...
3,4,65,784,1.819544,2.89487,92,4.5,2,0,2,2.864322,0,0,1,1,1,1,1,perfect location great easy find eric helpful ...
4,5,9,621,1.0,2.79379,71,4.5,4,1,1,1.892305,1,1,0,0,2,1,1,saras flat nice clean perfect host perfect pla...


In [68]:
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_features = tfidf_vectorizer.fit_transform(data['review_text'])

**Model Trainning**

In [69]:
X = tfidf_features
Y = data['sentiment']
label_encoder = LabelEncoder()
Y = label_encoder.fit_transform(Y)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


**BASE MODELS**

1. Naive Bayes (Base Model : 1)

In [78]:
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, Y_train)
Y_pred = nb_classifier.predict(X_test)

acc_nb = accuracy_score(Y_test , Y_pred)
print("Accuracy of Naive Bayes : " , acc_nb*100)

Accuracy of Naive Bayes :  7.142857142857142


2. Decision Tree (Base Model : 2)

In [79]:
tree = DecisionTreeClassifier()
tree.fit(X_train , Y_train)
y_pred = tree.predict(X_test)

acc_tree = accuracy_score(Y_test , Y_pred)
print("Accuracy of Decision Tree Classifier : " , acc_tree*100)

Accuracy of Decision Tree Classifier :  7.142857142857142


3. SVM (Base Model : 3)

In [81]:
svm = SVC(kernel='linear')
svm.fit(X_train , Y_train)
y_pred - svm.predict(X_test)

acc_svm = accuracy_score(Y_test , Y_pred)
print("Accuracy of SVM : " , acc_svm*100)

Accuracy of SVM :  7.142857142857142


**ENSEMBLE MODELS/TECHNIQUES**

In [84]:
ensemble_models = [
    ("Random Forest", RandomForestClassifier()),
    ("AdaBoost", AdaBoostClassifier()),
    ("Voting Classifier", VotingClassifier(estimators=[('nb', nb_classifier),
                                                        ('dt',tree),
                                                        ('svm', svm)], voting='hard'))
]

In [89]:
for name, model in ensemble_models:
    model.fit(X_train, Y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(Y_test, y_pred)
    print(f"{name} Accuracy: {accuracy *100}")
    print(f"{classification_report(Y_test, y_pred)}")
    print(f"Confusion Matrix:\n{confusion_matrix(Y_test, y_pred)}\n")

Random Forest Accuracy: 7.142857142857142
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           2       0.00      0.00      0.00         1
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         1
          12       0.00      0.00      0.00         0
          14       0.00      0.00      0.00         1
          16       0.00      0.00      0.00         0
          19       0.00      0.00      0.00         0
          20       0.00      0.00      0.00         1
          23       0.00      0.00      0.00         0
          24       0.00      0.00      0.00         1
          28       0.00      0.00      0.00         0
          32       1.00      1.00      1.00         1
          36       0.00      0.00      0.00         1
          37       0.00      0.00      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


AdaBoost Accuracy: 7.142857142857142
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.00      0.00      0.00         0
           2       0.00      0.00      0.00         1
           3       0.00      0.00      0.00         0
           4       0.00      0.00      0.00         1
           5       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         0
          11       0.00      0.00      0.00         1
          14       0.00      0.00      0.00         1
          20       0.00      0.00      0.00         1
          22       0.00      0.00      0.00         0
          24       0.00      0.00      0.00         1
          32       1.00      1.00      1.00         1
          36       0.00      0.00      0.00         1
          40       0.00      0.00      0.00         1
          53       0.00      0.00      0.00         1
          55       0.00      0.00      0.00 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


**Analysis**

All the methods' accuracy is almost same so we cannot say which one is better. Moreover, for the improvement of ensemble techniuqes , ensemble size , change of hyper parameters can increase the efficiency of ensemble techniuqes used

**Questions**

1. What could the Airbnb marketing team offer to improve its users’ experience? Should it
rank properties it suggested to users based on some metric such as review sentiment?
How would review sentiment compare to summary-rating value in terms of its ability to
predict revenues?

Ans : Airbnb's marketing team can improve user experience and revenue prediction by enhancing host support, quality standards, and incentives for positive reviews. By combining review sentiment with summary ratings, they can rank properties based on guest satisfaction and property quality, which can lead to better experiences and potential revenue growth for hosts on Airbnb.

2. Given what we know about the performance of properties in Miami and Paris, did Airbnb
need a region-specific strategy? Could the company suggest optimal pricing for hosts, or
suggest other ways hosts could improve overall earnings?

Ans : Yes they need a region specific strategy. Yes