In [1]:
import pandas as pd

# Kaggle input path of the Las Vegas Airbnb listings CSV.
VEGAS = '/kaggle/input/airbnb-las-vegas-listings/las_vegas_airbnb_data.csv'
# Read the listings, then discard the price column before any analysis.
df = pd.read_csv(filepath_or_buffer=VEGAS)
df = df.drop(columns=['price'])
df.head()

Unnamed: 0,roomType,stars,address,numberOfGuests,primaryHost/smartName,firstReviewComments,firstReviewRating
0,Entire condo,4.54,"Las Vegas, Nevada, United States",4,Doug,"Overall, I had a good experience. The only dow...",4.0
1,Entire rental unit,4.39,"Las Vegas, Nevada, United States",4,Doug,I stayed an extra night. That should sum it up...,5.0
2,Entire guest suite,,"Las Vegas, Nevada, United States",4,Aaron&Tina,These days things are so impersonable but she ...,5.0
3,Entire condo,,"Las Vegas, Nevada, United States",6,Aaron&Tina,Great response time from owner. Could message ...,5.0
4,Private room in home,4.51,"Las Vegas, Nevada, United States",2,Brad,good group of guys in the house. quiet clean a...,5.0


In [2]:
from plotly import express
# Count listings per distinct value of the address column.
address_hist = express.histogram(data_frame=df, x='address')
address_hist

Unfortunately we do not have property street addresses; that would have made for a neat map.

In [3]:
# Print each column's dtype and non-null count to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   roomType               1000 non-null   object 
 1   stars                  723 non-null    float64
 2   address                1000 non-null   object 
 3   numberOfGuests         1000 non-null   int64  
 4   primaryHost/smartName  1000 non-null   object 
 5   firstReviewComments    795 non-null    object 
 6   firstReviewRating      795 non-null    float64
dtypes: float64(2), int64(1), object(4)
memory usage: 54.8+ KB


In [4]:
# Number of distinct values in each column.
df.nunique()

roomType                  32
stars                     65
address                    7
numberOfGuests            16
primaryHost/smartName    455
firstReviewComments      791
firstReviewRating          5
dtype: int64

In [5]:
from transformers import pipeline
from arrow import now

# Candidate Hugging Face models; only MODELS[1] is used below.
MODELS = [
    'bhadresh-savani/distilbert-base-uncased-emotion',
    'cardiffnlp/twitter-roberta-base-sentiment',
    'nlptown/bert-base-multilingual-uncased-sentiment',
]

time_start = now()
pipe = pipeline(model=MODELS[1], task='sentiment-analysis')
# Missing comments become empty strings so every row still gets a prediction.
comments = df['firstReviewComments'].fillna(value='').tolist()
# One pipeline call per comment; each call's result is kept as-is.
sentiment = list(map(pipe, comments))
print('done with sentiment analysis in {}'.format(now() - time_start))



2024-03-02 00:33:52.417874: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-02 00:33:52.418023: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-02 00:33:52.582845: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]


TypedStorage is deprecated. It will be removed in the future and UntypedStorage will be the only storage class. This should only matter to you if you are using storages directly.  To access UntypedStorage directly, use tensor.untyped_storage() instead of tensor.storage()



vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

done with sentiment analysis in 0:02:02.282816


Our sentiment values look like this.

In [6]:
# Show the pipeline results for the first five comments.
sentiment[:5]

[[{'label': 'LABEL_0', 'score': 0.5212472081184387}],
 [{'label': 'LABEL_2', 'score': 0.8912457227706909}],
 [{'label': 'LABEL_2', 'score': 0.8291773200035095}],
 [{'label': 'LABEL_2', 'score': 0.9279590845108032}],
 [{'label': 'LABEL_2', 'score': 0.9591630697250366}]]

In [7]:
# Spread each comment's top prediction across three score columns: the column
# matching the predicted label gets the score, the other two get 0.
# For cardiffnlp/twitter-roberta-base-sentiment the model card lists
# LABEL_0 = negative, LABEL_1 = neutral, LABEL_2 = positive -- re-check this
# mapping if a different entry of MODELS is used.
#
# The guard tests for 'l0', a column this cell actually creates, so re-running
# the cell skips the work once the columns exist.
if 'l0' not in df.columns:
    top_hits = [item[0] for item in sentiment]
    label_digits = [hit['label'][-1] for hit in top_hits]
    scores = [hit['score'] for hit in top_hits]
    df['l0'] = [score if digit == '0' else 0 for digit, score in zip(label_digits, scores)]
    df['l1'] = [score if digit == '1' else 0 for digit, score in zip(label_digits, scores)]
    # Any label that does not end in '0' or '1' is counted as LABEL_2.
    df['l2'] = [score if digit not in ('0', '1') else 0 for digit, score in zip(label_digits, scores)]
df.head()

Unnamed: 0,roomType,stars,address,numberOfGuests,primaryHost/smartName,firstReviewComments,firstReviewRating,l0,l1,l2
0,Entire condo,4.54,"Las Vegas, Nevada, United States",4,Doug,"Overall, I had a good experience. The only dow...",4.0,0.521247,0.0,0.0
1,Entire rental unit,4.39,"Las Vegas, Nevada, United States",4,Doug,I stayed an extra night. That should sum it up...,5.0,0.0,0.0,0.891246
2,Entire guest suite,,"Las Vegas, Nevada, United States",4,Aaron&Tina,These days things are so impersonable but she ...,5.0,0.0,0.0,0.829177
3,Entire condo,,"Las Vegas, Nevada, United States",6,Aaron&Tina,Great response time from owner. Could message ...,5.0,0.0,0.0,0.927959
4,Private room in home,4.51,"Las Vegas, Nevada, United States",2,Brad,good group of guys in the house. quiet clean a...,5.0,0.0,0.0,0.959163


We have a handful of numerical values: the number of guests, the first review rating, and the sentiment values. Let's build a model to predict the star rating from them.

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Model inputs: party size, first review rating, and the three sentiment scores.
columns = ['numberOfGuests', 'firstReviewRating', 'l0', 'l1', 'l2']

# We can only build a model where we have the output variable. Rows missing a
# feature are dropped as well: firstReviewRating contains nulls, and
# LinearRegression rejects NaN input, so one such row would abort the fit.
dropna_df = df.dropna(subset=['stars'] + columns)

# Hold out a quarter of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    dropna_df[columns].values,
    dropna_df['stars'],
    test_size=0.25,
    random_state=2024,
)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
# Report R^2 on the held-out rows.
print('r2: {:5.4f}'.format(model.score(X_test, y_test)))


r2: 0.0675


In [9]:
# One bar per feature showing its fitted coefficient. model.coef_ is ordered
# like `columns` because the model was fitted on dropna_df[columns].
express.bar(
    x=columns,
    y=model.coef_,
    labels={'x': 'feature', 'y': 'coefficient'},
    title='Linear regression coefficients',
)

Wow, that's bad: an R² of about 0.07 means the model explains almost none of the variation in stars. Let's look at the correlations.

In [10]:
# Pairwise correlations between the model features and the stars target.
corr_columns = [*columns, 'stars']
express.imshow(img=dropna_df[corr_columns].corr())

Nothing is strongly correlated with the variable of interest. Let's see what dimension reduction tells us.

In [11]:
from arrow import now
from umap import UMAP

time_start = now()
# Work on a copy so the embedding coordinates do not end up in dropna_df.
plot_df = dropna_df.copy()
umap = UMAP(
    n_epochs=2000,
    low_memory=False,
    n_jobs=1,
    random_state=2024,
    verbose=True,
)
# Project the feature columns to two dimensions and keep them as x / y.
embedding = umap.fit_transform(X=plot_df[columns])
plot_df['x'] = embedding[:, 0]
plot_df['y'] = embedding[:, 1]
print('done with UMAP in {}'.format(now() - time_start))
# Colour each embedded listing by its star rating.
umap_scatter = express.scatter(data_frame=plot_df, x='x', y='y', color='stars')
umap_scatter.show()

UMAP(low_memory=False, n_epochs=2000, n_jobs=1, random_state=2024, verbose=True)
Sat Mar  2 00:36:29 2024 Construct fuzzy simplicial set
Sat Mar  2 00:36:29 2024 Finding Nearest Neighbors
Sat Mar  2 00:36:33 2024 Finished Nearest Neighbor Search
Sat Mar  2 00:36:37 2024 Construct embedding


Epochs completed:   0%|            0/2000 [00:00]

	completed  0  /  2000 epochs
	completed  200  /  2000 epochs
	completed  400  /  2000 epochs
	completed  600  /  2000 epochs
	completed  800  /  2000 epochs
	completed  1000  /  2000 epochs
	completed  1200  /  2000 epochs
	completed  1400  /  2000 epochs
	completed  1600  /  2000 epochs
	completed  1800  /  2000 epochs
Sat Mar  2 00:36:43 2024 Finished embedding
done with UMAP in 0:00:14.339971


UMAP shows us that there is natural clustering in our numerical data, but it doesn't tell us much about rating stars.

In [12]:
# Distribution of the stars target over the rows used for modelling.
stars_hist = express.histogram(data_frame=dropna_df, x='stars')
stars_hist

This might be part of the problem; there are essentially no values below 4.5 stars.