#### Importing libraries

In [64]:
# Standard Libraries
import pandas as pd
import numpy as np

# Graphing libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# For keyword extraction
from collections import Counter
from nltk import word_tokenize, pos_tag, pos_tag_sents
from nltk.corpus import stopwords
#from rake_nltk import Rake
from textblob import TextBlob

# Snorkel libraries
from snorkel.labeling import labeling_function
from snorkel.labeling import PandasLFApplier
from snorkel.labeling import LFAnalysis
from snorkel.labeling.model import LabelModel
from snorkel.labeling import labeling_function
from snorkel.labeling.model import MajorityLabelVoter
from snorkel.analysis import get_label_buckets
from snorkel.labeling import LabelingFunction

# SKlearn libraries:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

#### Data loading and preprocessing:

In [65]:
# Load the listings CSV (the path is relative to the notebook's working directory)
airbnb = pd.read_csv('data/airbnbNyc.csv')

In [66]:
airbnb.shape

(25248, 17)

In [67]:
airbnb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25248 entries, 0 to 25247
Data columns (total 17 columns):
id                                25248 non-null int64
name                              25239 non-null object
host_id                           25248 non-null int64
host_name                         25239 non-null object
neighbourhood_group               25248 non-null object
neighbourhood                     25248 non-null object
latitude                          25248 non-null float64
longitude                         25248 non-null float64
room_type                         25248 non-null object
price                             25248 non-null int64
minimum_nights                    25248 non-null int64
number_of_reviews                 25248 non-null int64
last_review                       19812 non-null object
reviews_per_month                 19812 non-null float64
calculated_host_listings_count    25248 non-null int64
availability_365                  25248 non-null int64

In [68]:
# Rename the columns to the generic names used by the rest of the notebook:
# room_type -> Category (the target) and name -> description (the text field).
airbnb = airbnb.rename(
    columns={
        'room_type': 'Category',
        'name': 'description',
    }
)

In [69]:
# Drop the rows whose description is missing
airbnb = airbnb.dropna(subset=['description'])

In [70]:
# Work on a copy: `df` is transformed below while `airbnb` keeps the string categories
df = airbnb.copy()

In [71]:
# Get the distinct categories; the position of a category in this array is
# used below as its integer label id.
classes = df['Category'].unique()
# Show array
classes

array(['Entire home/apt', 'Private room', 'Shared room'], dtype=object)

In [72]:
# Convert classes to variables and assign numerical value
# NOTE(review): this stores each class string (e.g. 'Entire home/apt') as a key
# of globals(); such keys contain spaces/slashes, so they can only be read back
# through globals()[...]. The next cell repeats the same loop.
for n, val in enumerate(classes):
    globals()[val] = n

In [73]:
# Collect the integer id of every class, in the same order as `classes`
cls = []
for n, val in enumerate(classes):
    globals()[val] = n  # same class-name -> id binding as before
    cls.append(n)

In [74]:
cls

[0, 1, 2]

In [75]:
# Value a labeling function returns when it does not vote on a row
ABSTAIN = -1

### Labeling of Category variable:

#### Creating dictionaries with most common keywords:

In [76]:
# Lower-case every description (vectorised string accessor)
df['description'] = df['description'].str.lower()

In [77]:
# Split each description into tokens so that nltk.pos_tag can be applied later
df['tokens'] = df['description'].apply(word_tokenize)

In [78]:
# Part-of-speech tag every token list obtained with nltk's word_tokenize.
# pos_tag_sents tags all descriptions in a single call (same tags as calling
# pos_tag row by row, without repeating the tagger set-up for every row).
# The result is wrapped in a Series carrying df's index so the rows stay
# aligned (the index has gaps after the null descriptions were removed).
df['tagsNltk'] = pd.Series(pos_tag_sents(df['tokens'].tolist()), index=df.index)

In [79]:
# Keep the tokens tagged as nouns (NN*), adjectives (JJ*) or verbs (VB*).
# Only the first two characters of the tag are compared, so every sub-tag
# (NNS, NNP, NNPS, JJR, JJS, VBD, ...) is matched through its two-letter family;
# comparing the two-character prefix against longer tags could never succeed.
df['nounsKeywords'] = df['tagsNltk'].apply(
    lambda tagged: [word for word, pos in tagged if pos[:2] in ('NN', 'JJ', 'VB')]
)

In [80]:
# Load the English stop words as a set; they are filtered out of the keywords further down

stop_words = set(stopwords.words('english'))

In [81]:
### Creating Dictionary with keywords per category
# For every category, count the candidate keywords of all its listings and keep
# the most frequent ones. Stop words are discarded BEFORE the top-N cut, so each
# category always gets up to TOP_N_KEYWORDS real keywords (filtering after the
# cut could leave fewer).
TOP_N_KEYWORDS = 10

mydict = {}

for cat in df['Category'].unique():
    category_keywords = df.loc[df['Category'] == cat, 'nounsKeywords']

    keyword_counts = Counter(
        word
        for keywords in category_keywords
        for word in keywords
        if word not in stop_words
    )

    mydict[cat] = [word for word, _ in keyword_counts.most_common(TOP_N_KEYWORDS)]

print(mydict)

{'Entire home/apt': ['apartment', 'studio', 'bedroom', 'apt', 'east', 'williamsburg', 'park', 'spacious', 'village', 'cozy'], 'Private room': ['room', 'private', 'bedroom', 'cozy', 'williamsburg', 'apartment', 'east', 'manhattan', 'bushwick', 'spacious'], 'Shared room': ['room', 'shared', 'manhattan', 'cozy', 'apartment', 'bed', 'apt', 'east', 'central', 'times']}


In [82]:
# One keyword list per category, in category order (still a list of lists here)
allKeywords = [mydict[cat] for cat in df['Category'].unique()]

In [83]:
# Flatten allKeywords into a single list (it is now a list of lists).
# NOTE(review): the same name is rebound to the flattened list, so running this
# cell a second time would iterate over the keyword strings and split them into
# single characters.

allKeywords = [val for sublist in allKeywords for val in sublist]

In [84]:
# Counting how many times each keyword occurs across the per-category keyword lists

mostCommonKeywordsCount = Counter(allKeywords)

In [85]:
# View of keywords that are common to different dictionaries:

mostCommonKeywordsCount.most_common()

[('apartment', 3),
 ('east', 3),
 ('cozy', 3),
 ('bedroom', 2),
 ('apt', 2),
 ('williamsburg', 2),
 ('spacious', 2),
 ('room', 2),
 ('manhattan', 2),
 ('studio', 1),
 ('park', 1),
 ('village', 1),
 ('private', 1),
 ('bushwick', 1),
 ('shared', 1),
 ('bed', 1),
 ('central', 1),
 ('times', 1)]

In [86]:
# Keep the keywords that occur in more than two of the per-category lists
# (with the three categories found above: the keywords shared by all of them).
mostCommonKeywords = {
    keyword: occurrences
    for keyword, occurrences in mostCommonKeywordsCount.items()
    if occurrences >= 3
}

In [87]:
# Keywords to exclude
# (this is a dict-keys view of mostCommonKeywords, not an independent list)

keywordsToExclude = mostCommonKeywords.keys()


In [88]:
# Remove the excluded keywords from every category's keyword list
mydictFiltered = {
    cat: [w for w in mydict[cat] if w not in keywordsToExclude]
    for cat in df['Category'].unique()
}

print(mydictFiltered)

{'Entire home/apt': ['studio', 'bedroom', 'apt', 'williamsburg', 'park', 'spacious', 'village'], 'Private room': ['room', 'private', 'bedroom', 'williamsburg', 'manhattan', 'bushwick', 'spacious'], 'Shared room': ['room', 'shared', 'manhattan', 'bed', 'apt', 'central', 'times']}


In [89]:
# Dictionaries generated 

mydictFiltered

{'Entire home/apt': ['studio',
  'bedroom',
  'apt',
  'williamsburg',
  'park',
  'spacious',
  'village'],
 'Private room': ['room',
  'private',
  'bedroom',
  'williamsburg',
  'manhattan',
  'bushwick',
  'spacious'],
 'Shared room': ['room',
  'shared',
  'manhattan',
  'bed',
  'apt',
  'central',
  'times']}

In [90]:
import json

# Write the filtered keyword dictionary to data.json
with open('data.json', 'w') as out_file:
    json.dump(mydictFiltered, out_file)

#### Creating Keyword Look Ups Labelling Functions

In [91]:
mydictFiltered

{'Entire home/apt': ['studio',
  'bedroom',
  'apt',
  'williamsburg',
  'park',
  'spacious',
  'village'],
 'Private room': ['room',
  'private',
  'bedroom',
  'williamsburg',
  'manhattan',
  'bushwick',
  'spacious'],
 'Shared room': ['room',
  'shared',
  'manhattan',
  'bed',
  'apt',
  'central',
  'times']}

In [92]:
# Alias: `labels` and `cls` refer to the same list of integer class ids
labels = cls

In [93]:
labels

[0, 1, 2]

In [94]:
# Keyword lists in dict insertion order. keys[i] is paired with labels[i] when the
# labeling functions are built, so this relies on mydictFiltered having been
# filled in the same category order as `classes`.
keys = list(mydictFiltered.values())
keys

[['studio', 'bedroom', 'apt', 'williamsburg', 'park', 'spacious', 'village'],
 ['room',
  'private',
  'bedroom',
  'williamsburg',
  'manhattan',
  'bushwick',
  'spacious'],
 ['room', 'shared', 'manhattan', 'bed', 'apt', 'central', 'times']]

#### Creating Keyword Specific Labelling Functions

In [95]:
def keyword_lookup(x, keywords, label):
    """Vote `label` when the keyword is among the row's tokens, otherwise abstain.

    x        -- data point exposing a `tokens` collection
    keywords -- the value tested for membership in `x.tokens`
    label    -- value returned when the keyword is found
    """
    if keywords in x.tokens:
        return label
    return ABSTAIN


def make_keyword_lf(keywords, label):
    """Build a LabelingFunction that runs keyword_lookup for one keyword/label pair.

    The function is named "keyword_<keywords>_<label>" and receives `keywords`
    and `label` through its resources.
    """
    lf_name = f"keyword_{keywords}_{label}"
    lf_resources = {"keywords": keywords, "label": label}
    return LabelingFunction(name=lf_name, f=keyword_lookup, resources=lf_resources)

In [96]:
# One keyword labeling function per (category, keyword) pair
lfs = []

for i, category_keywords in enumerate(keys):
    for keyword in category_keywords:
        lfs.append(make_keyword_lf(keywords=keyword, label=labels[i]))

In [97]:
lfs

[LabelingFunction keyword_studio_0, Preprocessors: [],
 LabelingFunction keyword_bedroom_0, Preprocessors: [],
 LabelingFunction keyword_apt_0, Preprocessors: [],
 LabelingFunction keyword_williamsburg_0, Preprocessors: [],
 LabelingFunction keyword_park_0, Preprocessors: [],
 LabelingFunction keyword_spacious_0, Preprocessors: [],
 LabelingFunction keyword_village_0, Preprocessors: [],
 LabelingFunction keyword_room_1, Preprocessors: [],
 LabelingFunction keyword_private_1, Preprocessors: [],
 LabelingFunction keyword_bedroom_1, Preprocessors: [],
 LabelingFunction keyword_williamsburg_1, Preprocessors: [],
 LabelingFunction keyword_manhattan_1, Preprocessors: [],
 LabelingFunction keyword_bushwick_1, Preprocessors: [],
 LabelingFunction keyword_spacious_1, Preprocessors: [],
 LabelingFunction keyword_room_2, Preprocessors: [],
 LabelingFunction keyword_shared_2, Preprocessors: [],
 LabelingFunction keyword_manhattan_2, Preprocessors: [],
 LabelingFunction keyword_bed_2, Preprocessors

---

#### Finding numerical price threshold per category:

In [98]:
### Generating summary table to identify distribution:

# One describe() row of `price` per category. The rows are built by iterating
# over `classes`, so row i always belongs to the class with label id i; the
# thresholds read from this table further down are paired with the labels by
# position.
summaryDf = pd.concat(
    [
        airbnb.loc[airbnb['Category'] == category, 'price']
        .describe()
        .to_frame(name=category)
        .transpose()
        for category in classes
    ]
)

In [99]:
# Move the category names from the index into a regular column for later use
summaryDf = summaryDf.reset_index(level=0)

In [100]:
summaryDf

Unnamed: 0,index,count,mean,std,min,25%,50%,75%,max
0,Entire home/apt,14113.0,227.824205,308.461037,0.0,130.0,175.0,250.0,10000.0
1,Private room,10552.0,96.956406,187.73261,0.0,60.0,76.0,100.0,10000.0
2,Shared room,574.0,83.679443,123.181112,0.0,39.0,60.0,80.0,1800.0


In [101]:
# Converting the min / quantile / max columns from float to integer.
# astype casts whole columns in one call; the element-wise DataFrame.applymap
# used previously is deprecated in recent pandas releases.

cols = ['min', '25%', '50%', '75%', 'max']
summaryDf[cols] = summaryDf[cols].astype(np.int64)

#### Creating Numerical Labelling Functions

In [102]:
# Defining numerical functions:

def num_threshold(x, thresholdLow, thresholdHigh, label):
    """Vote `label` when x.price lies strictly between both thresholds, else abstain.

    x             -- data point exposing a `price` attribute
    thresholdLow  -- exclusive lower bound
    thresholdHigh -- exclusive upper bound
    label         -- value returned when the price is inside the band
    """
    inside_band = thresholdLow < x.price < thresholdHigh
    return label if inside_band else ABSTAIN


def make_numerical_lf(thresholdLow, thresholdHigh, label):
    """Build a LabelingFunction that runs num_threshold for one price band.

    The function is named "class_num_<label>" and receives both thresholds and
    the label through its resources.
    """
    lf_resources = {
        "thresholdLow": thresholdLow,
        "thresholdHigh": thresholdHigh,
        "label": label,
    }
    return LabelingFunction(name=f"class_num_{label}", f=num_threshold, resources=lf_resources)

In [103]:
# Convert classes to variables and assign numerical value
# NOTE(review): same globals() loop as in the data-preparation cells earlier in
# the notebook; it binds each class string to its position in `classes` again.

for n, val in enumerate(classes):
    globals()[val] = n

In [104]:
# Defining lower bound threshold:
# the '25%' column of summaryDf, one value per row (category), in row order

tLow = summaryDf['25%'].tolist()

In [105]:
tLow

[130, 60, 39]

In [106]:
# Defining upper bound threshold:
# the '75%' column of summaryDf, one value per row (category), in row order

tHigh = summaryDf['75%'].tolist()

In [107]:
# Build one price-band labeling function per class and register it in `lfs`.
# Each LF is also bound to a global named after it (class_num_<label>).
numerical_lfs = []
for i in range(len(classes)):
    name_num = f"class_num_{labels[i]}"
    globals()[name_num] = make_numerical_lf(thresholdLow = tLow[i], thresholdHigh = tHigh[i], label = labels[i])
    numerical_lfs.append(globals()[name_num])

# Drop any LF with the same name left over from a previous run of this cell
# before appending, so re-running the cell does not leave duplicate-named LFs
# in `lfs`. `lfs` is updated in place.
numerical_names = {lf.name for lf in numerical_lfs}
lfs[:] = [lf for lf in lfs if lf.name not in numerical_names] + numerical_lfs

In [108]:
# Checking LFs created: 

lfs

[LabelingFunction keyword_studio_0, Preprocessors: [],
 LabelingFunction keyword_bedroom_0, Preprocessors: [],
 LabelingFunction keyword_apt_0, Preprocessors: [],
 LabelingFunction keyword_williamsburg_0, Preprocessors: [],
 LabelingFunction keyword_park_0, Preprocessors: [],
 LabelingFunction keyword_spacious_0, Preprocessors: [],
 LabelingFunction keyword_village_0, Preprocessors: [],
 LabelingFunction keyword_room_1, Preprocessors: [],
 LabelingFunction keyword_private_1, Preprocessors: [],
 LabelingFunction keyword_bedroom_1, Preprocessors: [],
 LabelingFunction keyword_williamsburg_1, Preprocessors: [],
 LabelingFunction keyword_manhattan_1, Preprocessors: [],
 LabelingFunction keyword_bushwick_1, Preprocessors: [],
 LabelingFunction keyword_spacious_1, Preprocessors: [],
 LabelingFunction keyword_room_2, Preprocessors: [],
 LabelingFunction keyword_shared_2, Preprocessors: [],
 LabelingFunction keyword_manhattan_2, Preprocessors: [],
 LabelingFunction keyword_bed_2, Preprocessors

#### Train and test split:

In [109]:
# Creating a dictionary to map category variable (class name -> integer id).
# NOTE(review): `cat` is also used as a loop variable in earlier cells and is
# rebound to the inverse mapping (id -> class name) later in the notebook.
cat = dict(zip(classes, cls))
print(cat)

{'Entire home/apt': 0, 'Private room': 1, 'Shared room': 2}


In [110]:
# Reclassifying category into numerical in the dataframe.
# NOTE(review): this overwrites df['Category'] in place, so the cell is not safe
# to re-run: a second run would look the integer ids up in a mapping keyed by
# class name.
df['Category'] = df['Category'].map(cat)

In [111]:
# Features: every column of df except the target
X = df.drop(columns=['Category'])

In [112]:
# Target: the Category column of df
y = df['Category']

In [113]:
# 80/20 train/test split with a fixed random_state
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state=42)

#### Applying Labeling Models:

In [114]:
# Apply every labeling function in `lfs` to the training and the test features.
# Each call returns the label matrix for that split
# (presumably one row per data point and one column per LF — TODO confirm).
applier = PandasLFApplier(lfs)
L_train = applier.apply(df=X_train)
L_test = applier.apply(df=X_test)
# Per-LF summary computed on the training label matrix
LFAnalysis(L=L_train, lfs=lfs).lf_summary()

100%|███████████████████████████████████████████████████████████████████████████| 20191/20191 [00:41<00:00, 482.39it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 5048/5048 [00:09<00:00, 559.04it/s]


Unnamed: 0,j,Polarity,Coverage,Overlaps,Conflicts
keyword_studio_0,0,[0],0.094894,0.076173,0.042098
keyword_bedroom_0,1,[0],0.159576,0.159576,0.159576
keyword_apt_0,2,[0],0.105988,0.105988,0.105988
keyword_williamsburg_0,3,[0],0.09846,0.09846,0.09846
keyword_park_0,4,[0],0.069387,0.062503,0.054381
keyword_spacious_0,5,[0],0.078302,0.078302,0.078302
keyword_village_0,6,[0],0.067159,0.055371,0.037442
keyword_room_1,7,[1],0.189391,0.189391,0.189391
keyword_private_1,8,[1],0.137289,0.130801,0.128374
keyword_bedroom_1,9,[1],0.159576,0.159576,0.159576


In [115]:
# Instantiate the label model (it is fitted in the next cell).
# The cardinality follows the number of classes found in the data instead of a
# hard-coded 3.
label_model = LabelModel(cardinality=len(classes), verbose=True)

In [116]:
# Fit on the training label matrix with a fixed seed, so that the fitted model
# and the accuracy reported below are reproducible across runs.
label_model.fit(L_train, n_epochs=500, log_freq=50, seed=42)

In [117]:
# Accuracy of the label model on the test split (ties broken with the "random" policy)
label_model_acc = label_model.score(
    L=L_test,
    Y=y_test,
    tie_break_policy="random",
)["accuracy"]

In [118]:
print(f"{'Label Model Accuracy:':<25} {label_model_acc * 100:.1f}%")

Label Model Accuracy:     66.3%


In [119]:
# Applying the Majority Vote model (its accuracy is checked in the next cell).
# The cardinality follows the number of classes found in the data instead of a
# hard-coded 3.

majority_model = MajorityLabelVoter(cardinality = len(classes))
preds_train = majority_model.predict(L = L_train)

In [120]:
# Accuracy of the majority vote on the test split (ties broken with the "random" policy)
majority_acc = majority_model.score(L = L_test, Y = y_test, tie_break_policy="random")["accuracy"]


In [121]:
print(f"{'Majority Vote Accuracy:':<25} {majority_acc * 100:.1f}%")

Majority Vote Accuracy:   63.5%


---

#### Generating final output table, including label

In [122]:
# Attach the label model's prediction to the test rows (ties are left as abstain).
# assign() builds a new frame instead of writing a column into the slice
# returned by train_test_split, which avoids the chained-assignment warning that
# the global warnings filter would otherwise hide.
X_test = X_test.assign(
    category_LF=label_model.predict(L = L_test, tie_break_policy="abstain")
)

In [123]:
# Generating cat dictionary to map final label (integer id -> class name).
# NOTE(review): this rebinds `cat`, which earlier held the opposite mapping
# (class name -> integer id).
cat = dict(zip(cls, classes))
print(cat)

{0: 'Entire home/apt', 1: 'Private room', 2: 'Shared room'}


In [124]:
# Translate the predicted ids back to class names; ids that are not a key of
# `cat` (such as the -1 shown for abstained rows) become NaN.
# assign() builds a new frame instead of writing a column into a slice.
X_test = X_test.assign(category_label=X_test['category_LF'].map(cat))

In [125]:
X_test.head()

Unnamed: 0,id,description,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,price,minimum_nights,...,last_review,reviews_per_month,calculated_host_listings_count,availability_365,budget,tokens,tagsNltk,nounsKeywords,category_LF,category_label
19738,13207586,beautiful 1br ues,74020712,Nicole,Manhattan,Upper East Side,40.7722,-73.95272,115,3,...,2017-02-20,0.22,1,0,Average,"[beautiful, 1br, ues]","[(beautiful, JJ), (1br, CD), (ues, NNS)]","[beautiful, ues]",-1,
21273,16965705,bright sunny manhattan getaway,3483600,Joshua,Manhattan,Washington Heights,40.83498,-73.94214,63,2,...,,,1,0,Cheap,"[bright, sunny, manhattan, getaway]","[(bright, JJ), (sunny, JJ), (manhattan, NN), (...","[bright, sunny, manhattan, getaway]",1,Private room
15548,15884157,private bedroom bushwick close to l and j trains,16973022,Patrick,Brooklyn,Bushwick,40.69092,-73.91195,50,1,...,2016-11-26,0.12,1,0,Cheap,"[private, bedroom, bushwick, close, to, l, and...","[(private, JJ), (bedroom, NN), (bushwick, NN),...","[private, bedroom, bushwick, l, j, trains]",1,Private room
12571,35915030,lovely north park slope private garden apartment,270144441,Carla & Ken,Brooklyn,Park Slope,40.67817,-73.97603,145,6,...,,,1,14,Average,"[lovely, north, park, slope, private, garden, ...","[(lovely, RB), (north, JJ), (park, JJ), (slope...","[north, park, slope, private, garden, apartment]",2,Shared room
20319,25058563,art studio by the park,13583715,Mona,Manhattan,Upper East Side,40.77788,-73.96114,165,30,...,2018-06-27,0.29,1,203,Average,"[art, studio, by, the, park]","[(art, NN), (studio, NN), (by, IN), (the, DT),...","[art, studio, park]",0,Entire home/apt


In [126]:
# NOTE: this binds a second name to the same DataFrame object; no copy is made
df_output = X_test

---