In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from scipy.stats import uniform, loguniform, randint


In [2]:
# Load the raw NYC Airbnb 2019 listings from a relative path.
airbnb_df = pd.read_csv(filepath_or_buffer="../data/AB_NYC_2019.csv")

In [3]:
# Preview the first five listings to check that the columns loaded as expected.
airbnb_df.head(n=5)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [4]:
# Hold out 10% of the listings as a test set; the fixed seed keeps the split
# reproducible across kernel restarts.
train_df, test_df = train_test_split(
    airbnb_df,
    random_state=7,
    test_size=0.1,
)

In [5]:
# Missing listing names are replaced with the literal string "None" so that the
# .str accessor (and the vectorizers fitted later) only ever see strings.
# .assign() builds a new frame instead of writing into the object returned by
# train_test_split, which avoids pandas' SettingWithCopyWarning and keeps this
# cell idempotent on re-run.
# NOTE: the "None" placeholder counts as a 4-character name in the stats below.
train_df = train_df.assign(name=train_df["name"].fillna("None"))

# Character length of every listing name, as a plain NumPy array so that
# argmax/argmin return positions (used with .iloc below), not index labels.
lengths = train_df["name"].str.len().to_numpy()

longest = lengths.argmax()
shortest = lengths.argmin()
average = lengths.mean()

# Plain strings: the original f-strings had no placeholders. The trailing space
# in the first two labels is kept so the printed output is unchanged.
print("Shortest name: ", train_df["name"].iloc[shortest])
print("Longest name: ", train_df["name"].iloc[longest])
print("Average length:", average)

Shortest name:  X
Longest name:  The printing studio bedroom with garden in Bedstuy  !!Brooklyn. Historic neighborhood close to everything ! Full kitchen bathroom BBQ and porch at your disposal. Learn to print !!
Average length: 36.85840245426656


In [6]:
# Bag-of-words over listing names: drop English stop words and keep only the
# 40 most frequent remaining terms.
vec = CountVectorizer(
    max_features=40,
    stop_words="english",
)
# Learn the vocabulary and build the sparse document-term count matrix in one pass.
X_counts = vec.fit_transform(raw_documents=train_df["name"])

In [7]:
# Dense view of the count matrix for inspection: one row per listing (labelled
# by its name) and one column per vocabulary term.
bow_df = pd.DataFrame(
    data=X_counts.toarray(),
    index=train_df["name"],
    columns=vec.get_feature_names_out(),
)
bow_df

Unnamed: 0_level_0,1br,apartment,apt,beautiful,bed,bedroom,br,bright,brooklyn,brownstone,...,private,quiet,room,spacious,studio,sunny,upper,village,west,williamsburg
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cozy 1 BR in hip Crown Heights,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Newly Renovated Garden Apartment,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
The clean separate studio in New York,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
Prospect Park Palace,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
New York style loft space,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Sunlit Studio In Modern Brownstone,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,1,0,0,0,0,0
Beautiful bright modern apartment,0,1,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
Serenity amidst the busy city,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Monthly: Gorgeous 3 Story Brownstone,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Total occurrences of each vocabulary term across all training names,
# most frequent first. Summing the sparse matrix over axis 0 gives a 1 x n_terms
# matrix, which is flattened to a 1-D array for the "counts" column.
pd.DataFrame(
    {"counts": np.asarray(X_counts.sum(axis=0)).ravel()},
    index=vec.get_feature_names_out(),
).sort_values(by="counts", ascending=False)

Unnamed: 0,counts
room,9230
bedroom,7380
private,6636
apartment,6094
cozy,4552
apt,4208
brooklyn,3757
studio,3691
spacious,3430
manhattan,3228


We can also create a `CountVectorizer` with `binary=True`, which can select a different vocabulary. During hyper-parameter tuning we can check whether `binary=True` or `binary=False` works better.

In [9]:
# Binary bag-of-words: each term is recorded as present (1) or absent (0) per
# listing, so the column sums below are the number of names containing the term.
# NOTE: unlike `vec`, this vectorizer does not pass stop_words="english", which
# is why a word such as "in" appears in its vocabulary.
vec8_binary = CountVectorizer(binary=True, max_features=20)

# Stored under its own name so the `X_counts` matrix built with `vec` is not
# overwritten; the earlier cells that pair `X_counts` with `vec`'s feature names
# therefore still work when re-run after this cell.
X_counts_binary = vec8_binary.fit_transform(train_df["name"])

pd.DataFrame(
    data=X_counts_binary.sum(axis=0).tolist()[0],
    index=vec8_binary.get_feature_names_out(),
    columns=["counts"],
).sort_values("counts", ascending=False)

Unnamed: 0,counts
in,14782
room,9169
bedroom,7327
private,6516
apartment,6092
cozy,4548
apt,4197
brooklyn,3756
studio,3690
spacious,3430


## To add to hyper parameter tuning

We can experiment with different values of the `max_features` argument to find an optimal value for this hyper-parameter.
We will do this by using `RandomizedSearchCV` to sample different values of `max_features`.

In [10]:
# Search space for the randomized search, keyed by "<pipeline step>__<parameter>":
#   - binary:       try both raw counts and presence/absence encoding
#   - max_features: integer vocabulary size sampled uniformly from [100, 5000)
param_grid = dict(
    countvectorizer__binary=[True, False],
    countvectorizer__max_features=randint(low=100, high=5000),
)

In [11]:
# random_search = RandomizedSearchCV(
#         pipe, param_grid, n_iter=100, verbose=1, n_jobs=-1, random_state=123
# )

- Get a suitable binary parameter
- Get max features parameter

In [12]:

# vocab = preprocessor.named_transformers_["countvectorizer"].get_feature_names_out()
# column_names = vocab.tolist()
