In [215]:
import pandas as pd
import json
import csv
from matplotlib import pyplot as plt
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
import numpy as np
import math


# Detailed Listings Formatting

In [216]:
#These functions help transform the amenities column from a list-like string to a list that MultiLabelBinarizer can work on
def strip_split(x):
    return x.strip("[]' ").split(",")

def striper(x):
    return x.strip("[]' ")

def formater(x):
    alist = list(map(strip_split, x))
    blist = []
    for i in range(0,len(alist)):
        blist.append(list(map(striper, alist[i])))
    return blist

In [217]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
detailed_listings_back = pd.read_csv("l0_detailed_listings.csv", encoding = "UTF-8")
detailed_listings_back.columns = detailed_listings_back.columns.str.replace("content.","")
detailed_listings_back.city = detailed_listings_back.city.str.strip()

if 'amenities' in detailed_listings_back.columns:
    s = detailed_listings_back['amenities']
    mlb = MultiLabelBinarizer()
    amenities_df = pd.DataFrame(mlb.fit_transform(formater(s)),columns=mlb.classes_, index=detailed_listings_back.index)
    amenities_df["Laptop-friendly workspace"] = amenities_df.loc[:,['Laptop-friendly workspace','Laptop friendly workspace']].sum(axis=1)
    amenities_df = amenities_df.drop('Laptop friendly workspace',axis=1)
    if "Wifi" not in detailed_listings_back.columns:
        detailed_listings_back = detailed_listings_back.join(amenities_df)
    detailed_listings_back.drop("amenities", axis = 1)

InitialMissingCells = np.sum(detailed_listings_back.isnull().sum()) 
InitialRowCount = detailed_listings_back.shape[0]
InitialColumnCount = detailed_listings_back.shape[1]
print("Dataset shape is " + str(detailed_listings_back.shape))
print(str(InitialMissingCells) + " null cells in initial dataset")
print(str(InitialMissingCells/InitialRowCount/InitialColumnCount) + "% of dataset is null")
amenities_csv_data = amenities_df.to_csv('l0_amenities.csv', index = False) 


Dataset shape is (1497, 282)
25057 null cells in initial dataset
0.05935511685309152% of dataset is null


# Variance Filtering

In [218]:
lVarDrops = amenities_df.var() < (math.log10(amenities_df.shape[0])/1497)
vdrops = lVarDrops.iloc[lVarDrops.values == True].keys()
for vdrop in vdrops:
    if vdrop in detailed_listings_back.columns:
        detailed_listings_back = detailed_listings_back.drop(vdrop, axis = 1)

for vdrop in vdrops:
    if vdrop in amenities_df.columns:
        amenities_df = amenities_df.drop(vdrop, axis = 1)

amdrop_csv_data = amenities_df.to_csv('l1_amenities.csv', index = False) 
#amenities_df.to_pickle('Plotly-Dash/ammenity.data')
print("Amenities dropped for low variance are:")
print(lVarDrops.iloc[lVarDrops.values == True].keys())
print("Dataset shape is " + str(detailed_listings_back.shape))
print(str(InitialMissingCells) + " null cells in initial dataset")
print(str(np.sum(detailed_listings_back.isnull().sum()))+ " in current dataset")
print(str(np.sum(detailed_listings_back.isnull().sum())/detailed_listings_back.shape[0]/detailed_listings_back.shape[1]) + "% of dataset is null")


Amenities dropped for low variance are:
Index(['Amazon Echo', 'Balcony', 'Day bed', 'Espresso machine', 'Fire pit',
       'Firm mattress', 'Fixed grab bars for shower', 'Hand Sanitiser',
       'Handheld shower head', 'Heat lamps', 'Heated floors', 'Jetted tub',
       'Mudroom', 'Murphy bed', 'Nespresso machine', 'Other pet(s)',
       'Outdoor parking', 'Piano', 'Pillow-top mattress', 'Playground',
       'Printer', 'Record player', 'Roll-in shower', 'Terrace',
       'Trash compacter', 'Wide doorway to guest bathroom', 'Wide entryway'],
      dtype='object')
Dataset shape is (1497, 255)
25057 null cells in initial dataset
25057 in current dataset
0.06563977628459533% of dataset is null


In [219]:
sum(detailed_listings_back.rental_income.isnull()==True)

23

# Data Cleaning

In [220]:
#Normalizing City Names
if 'city' in detailed_listings_back.columns:
    detailed_listings_back['city'] = detailed_listings_back['city'].replace(['Passyunk Square, Philadelphia'],'Philadelphia')
    detailed_listings_back['city'] = detailed_listings_back['city'].replace(np.nan, "Philedelphia") #Inferred from Neighborhood

#Removing very inconsequential missing values. THIS LINE NEEDS TO BE ACTIVATED BEFORE USE!!!
lmDrops = math.log10(len(detailed_listings_back))/detailed_listings_back.shape[0]
detailed_listings_back.dropna(axis = 0, subset = detailed_listings_back.columns[detailed_listings_back.isnull().sum()/len(detailed_listings_back) <= lmDrops].to_list()).shape

#imputing normal daily price to observations with no weekend rate (confirmed w/ manual website inspection)
detailed_listings_back['listing_weekend_price_native'] = np.where(detailed_listings_back['listing_weekend_price_native'].isnull(), detailed_listings_back['price'], detailed_listings_back['listing_weekend_price_native'])

#monthly_price_factor and weekly_price_factor are not listed in the API documentation but appear to correspond with 
#discount percentages on the airbnb website. Imputing 1 where it is missing
detailed_listings_back['monthly_price_factor'] = detailed_listings_back['monthly_price_factor'].replace(np.nan, 1)
if 'weekly_price_factor' in detailed_listings_back.columns:
    detailed_listings_back['weekly_price_factor'] = detailed_listings_back['weekly_price_factor'].replace(np.nan, 1)

#min_nights_input_values has lots ot NA values but no zeroes, so I'm taking that as 1.
#I don't think AirBnb rents by the hour anyway :)
if 'min_nights_input_value' in detailed_listings_back.columns:
    detailed_listings_back['min_nights_input_value'] = detailed_listings_back['min_nights_input_value'].replace(np.nan, 1)

###NAs in these three seem to be connected. Deleting all rows with NAs in occupancy results in removal of all rows with
###NAs in the other two and only one row with an NA in occupancy has a value in rental income, which was very low

#occupancy has lots ot NA values but no zeroes, so I'm taking that as 0.
if 'occupancy' in detailed_listings_back.columns:
    detailed_listings_back['occupancy'] = detailed_listings_back['occupancy'].replace(np.nan, 0)

#rental_income has lots ot NA values but no zeroes, so I'm taking that as 0.
detailed_listings_back['rental_income'] = detailed_listings_back['rental_income'].replace(np.nan, 0)

print("Dataset shape is " + str(detailed_listings_back.shape))
print(str(InitialMissingCells) + " null cells in initial dataset")
print(str(np.sum(detailed_listings_back.isnull().sum()))+ " in current dataset")
print(str(np.sum(detailed_listings_back.isnull().sum())/detailed_listings_back.shape[0]/detailed_listings_back.shape[1]) + "% of dataset is null")

Dataset shape is (1497, 255)
25057 null cells in initial dataset
23446 in current dataset
0.06141957116848075% of dataset is null


In [221]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

fvalue_selector = SelectKBest(f_classif, k=130)

# Apply the SelectKBest object to the features and target
fvalue_selector.fit(amenities_df, detailed_listings_back.rental_income)
amenities_df.iloc[:,fvalue_selector.pvalues_>.05].columns
#scores /= scores.max()
#X_indices=np.arange(amenities_df.shape[-1])
#plt.bar(X_indices - .45, scores, width=.2,
#        label=r'Univariate score ($-Log(p_{value})$)')

fDrops = amenities_df.iloc[:,fvalue_selector.pvalues_>.05]
for f in fDrops.columns:
    if f in detailed_listings_back.columns:
        detailed_listings_back = detailed_listings_back.drop(f, axis = 1)

for f in fDrops:
    if f in amenities_df.columns:
        amenities_df = amenities_df.drop(f, axis = 1)

amdrop_csv_data = amenities_df.to_csv('l2_amenities.csv', index = False) 
amenities_df.to_pickle('Plotly-Dash/ammenity.data')
print("Amenities eliminated by f test are:")
print(fDrops.columns)
print("Dataset shape is " + str(detailed_listings_back.shape))
print(str(InitialMissingCells) + " null cells in initial dataset")
print(str(np.sum(detailed_listings_back.isnull().sum()))+ " in current dataset")
print(str(np.sum(detailed_listings_back.isnull().sum())/detailed_listings_back.shape[0]/detailed_listings_back.shape[1]) + "% of dataset is null")


  f = msb / msw


Amenities eliminated by f test are:
Index(['24-hour check-in', 'Accessible-height bed', 'Accessible-height toilet',
       'Air conditioning', 'Beach essentials', 'Bed linens', 'Bread maker',
       'Breakfast', 'Cable TV', 'Carbon monoxide alarm',
       'Carbon monoxide detector', 'Ceiling fan', 'Central air conditioning',
       'Children’s books and toys', 'Children’s dinnerware',
       'Cleaning before checkout', 'Coffee maker', 'Cooking basics',
       'Dishes and silverware', 'Dryer', 'EV charger', 'En suite bathroom',
       'Essentials', 'Extra pillows and blankets', 'Extra space around bed',
       'Family/kid friendly', 'Fire extinguisher', 'First aid kit',
       'Flat path to guest entrance', 'Free street parking', 'Freezer',
       'Full kitchen', 'Garden or backyard', 'Gas oven', 'Gym', 'HBO GO',
       'Hair dryer', 'Hangers', 'Heating', 'High chair', 'Host greets you',
       'Hot water', 'Hot water kettle', 'Internet', 'Iron', 'Keypad',
       'Kitchen', 'Laptop-frie

In [222]:
fvalue_selector.get_support

<bound method SelectorMixin.get_support of SelectKBest(k=130)>

# Coffee Variable

In [223]:
detailed_listings_back.head()

Unnamed: 0,status,id,city,picture_url,thumbnail_url,medium_url,xl_picture_url,user_id,price,native_currency,price_native,price_formatted,lat,lng,country,name,smart_location,has_double_blind_reviews,instant_bookable,bedrooms,beds,bathrooms,market,min_nights,neighborhood,person_capacity,state,zipcode,address,country_code,cancellation_policy,property_type,reviews_count,room_type,room_type_category,picture_count,currency_symbol_left,currency_symbol_right,bed_type,bed_type_category,require_guest_profile_picture,require_guest_phone_verification,force_mobile_legal_modal,cancel_policy,check_in_time,check_out_time,guests_included,license,max_nights,square_feet,locale,has_viewed_terms,has_viewed_cleaning,has_agreed_to_legal_terms,has_viewed_ib_perf_dashboard_panel,language,public_address,map_image_url,experiences_offered,max_nights_input_value,min_nights_input_value,requires_license,property_type_id,house_rules,security_deposit_native,security_price_native,security_deposit_formatted,description,description_locale,summary,space,access,interaction,neighborhood_overview,transit,amenities,is_location_exact,cancel_policy_short_str,star_rating,price_for_extra_person_native,weekly_price_native,monthly_price_native,time_zone_name,loc.type,loc.coordinates,exists,created_at,updated_at,cleaning_fee_native,extras_price_native,in_building,in_toto_area,instant_book_enabled,is_business_travel_ready,listing_cleaning_fee_native,listing_monthly_price_native,listing_price_for_extra_person_native,listing_weekend_price_native,listing_weekly_price_native,localized_city,monthly_price_factor,special_offer,toto_opt_in,weekly_price_factor,wireless_info,host_id,airbnb_id,mashvisor_id,occupancy,rental_income,nights_booked,BBQ grill,Baby bath,Baby monitor,Babysitter recommendations,Baking sheet,Barbecue utensils,Bath towel,Bathroom essentials,Bathtub,Bedroom comforts,Body soap,Breakfast table,Building staff,Buzzer/wireless intercom,Cat(s),Changing table,Conditioner,Convection oven,Crib,DVD player,Disabled parking spot,Dishwasher,Dog(s),Elevator,Ethernet connection,Fireplace guards,Formal dining area,Free parking on premises,Game console,Hot tub,Indoor fireplace,Keurig coffee machine,Kitchenette,Laundromat nearby,Memory foam mattress,Mini fridge,Netflix,Outdoor seating,Patio or balcony,Pets live on this property,Pool,Rain shower,Self check-in,Stair gates,Table corner guards,Toilet paper,Wide entrance,Window guards
0,success,24930469,Philadelphia,https://a0.muscache.com/im/pictures/a262704d-6...,https://a0.muscache.com/im/pictures/a262704d-6...,https://a0.muscache.com/im/pictures/a262704d-6...,https://a0.muscache.com/im/pictures/a262704d-6...,6338241,295,USD,295,$295,39.96928,-75.14747,United States,Spacious Artistic Home-Outdoor Deck-FREE PARKING,"Philadelphia, PA",False,False,3.0,3,1.5,Philadelphia,2,Poplar,6,PA,19123,"Philadelphia, PA, United States",US,flexible,House,35,Entire home/apt,entire_home,15,$,,Real Bed,real_bed,False,False,False,3,,,7,,1125,,en,,,True,,en,"Philadelphia, PA, United States",https://maps.googleapis.com/maps/api/staticmap...,none,,2.0,False,2,,475.0,475.0,$475,"Northern Liberties, Philadelphia -2058 sq ft h...",en,"Northern Liberties, Philadelphia -2058 sq ft h...",The outdoor space spring through fall provides...,Access to the entire house,,Northern Liberties Neighborhood.\n\n5 blocks f...,Girard Ave. Trolly line stops 2 blocks away an...,"['TV', 'Cable TV', 'Internet', 'Wifi', 'Air co...",True,Flexible,5.0,0,,,America/New_York,Point,"[-75.14747, 39.96928]",True,2018-05-31T08:02:23.747Z,2020-09-25T06:44:16.241Z,85.0,85.0,False,False,False,,85.0,,0,295.0,,Philadelphia,0.75,,,0.85,,6338241,10097839,24930469,4.0,379.0,9.0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1
1,success,24927979,Philadelphia,https://a0.muscache.com/im/pictures/fb593a62-b...,https://a0.muscache.com/im/pictures/fb593a62-b...,https://a0.muscache.com/im/pictures/fb593a62-b...,https://a0.muscache.com/im/pictures/fb593a62-b...,51857742,45,USD,45,$45,39.97869,-75.24558,United States,The Sanctuary Studio @ 63rd Street,"Philadelphia, PA",False,False,0.0,2,1.0,Philadelphia,15,West Philadelphia,3,PA,19151,"Philadelphia, PA, United States",US,moderate,Apartment,21,Entire home/apt,entire_home,10,$,,Real Bed,real_bed,False,False,False,4,15.0,11.0,2,,90,,en,,,True,,en,"Philadelphia, PA, United States",https://maps.googleapis.com/maps/api/staticmap...,none,90.0,15.0,False,1,#NAME?,1000.0,1000.0,"$1,000",Enjoy Complete Privacy in Your Own Apartment!\...,en,Enjoy Complete Privacy in Your Own Apartment!\...,"The Sanctuary Studio on 63rd Street, has a #10...",,"The Studio is your Sanctuary in Philadelphia, ...",Whether you ride North to City Avenue: for St....,Transportation Is A Breeze:\nThe #10 Trolley s...,"['TV', 'Cable TV', 'Wifi', 'Air conditioning',...",True,Moderate,4.5,20,,,America/New_York,Point,"[-75.24558, 39.97869]",True,2019-07-17T16:01:00.105Z,2020-09-25T06:56:18.778Z,60.0,60.0,False,False,False,,60.0,,20,45.0,,Philadelphia,0.9,,,0.9,,51857742,10105308,24927979,83.0,1423.0,305.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0
2,success,24933956,Philadelphia,https://a0.muscache.com/im/pictures/8fa08baa-1...,https://a0.muscache.com/im/pictures/8fa08baa-1...,https://a0.muscache.com/im/pictures/8fa08baa-1...,https://a0.muscache.com/im/pictures/8fa08baa-1...,11036611,118,USD,118,$118,39.96989,-75.13693,United States,Hero's Castle★Huge 3rd Floor Suite+GATED Parking!,"Philadelphia, PA",False,False,1.0,1,1.0,Philadelphia,1,North Philadelphia,2,PA,19122,"Philadelphia, PA, United States",US,moderate,Guest suite,773,Entire home/apt,entire_home,24,$,,Real Bed,real_bed,False,False,False,4,15.0,11.0,1,,7,,en,True,,True,,en,"Philadelphia, PA, United States",https://maps.googleapis.com/maps/api/staticmap...,none,7.0,1.0,False,53,We're looking forward to your visit! Please n...,495.0,495.0,$495,"No matter how your 2020 has gone so far, you d...",en,"No matter how your 2020 has gone so far, you d...",When a more luxurious AirBnb experience is you...,★This entire guest suite can be locked for pri...,We'll generally either be traveling on busines...,Safe. Diverse. Friendly. Fun! Highly conve...,➼ MFL subway around the corner goes to Center...,"['TV', 'Cable TV', 'Internet', 'Wifi', 'Air co...",True,Moderate,5.0,10,300.0,1800.0,America/New_York,Point,"[-75.13693, 39.96989]",True,2016-03-26T16:35:18.132Z,2020-09-29T06:07:26.260Z,25.0,25.0,False,False,True,,25.0,1800.0,10,118.0,300.0,Philadelphia,1.0,,,1.0,,11036611,10149981,24933956,54.0,1822.0,197.0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,1,0,0,0,1,0,0,0,0,0
3,success,24935175,Philadelphia,https://a0.muscache.com/im/pictures/33451bee-1...,https://a0.muscache.com/im/pictures/33451bee-1...,https://a0.muscache.com/im/pictures/33451bee-1...,https://a0.muscache.com/im/pictures/33451bee-1...,45864129,130,USD,130,$130,39.92027,-75.17051,United States,Quiet Clean remodeled 3BR house by Sport Complex,"Philadelphia, PA",False,False,3.0,6,1.5,Philadelphia,2,South Philadelphia,8,PA,19145,"Philadelphia, PA, United States",US,strict_14_with_grace_period,Townhouse,189,Entire home/apt,entire_home,29,$,,Real Bed,real_bed,False,False,False,44,,12.0,4,,1125,,en,True,,True,,en,"Philadelphia, PA, United States",https://maps.googleapis.com/maps/api/staticmap...,none,,2.0,False,36,"This is my home away from home, and I request ...",250.0,250.0,$250,Home away from home! \nRecently remodeled home...,en,Home away from home! \nRecently remodeled home...,"Recently remodeled, the ""straight-through"" lay...",Check-in can be flexible if it is vacant prior...,Mostly I won't meet my guests as I have self c...,"A working class, residential neighborhood with...","Extremely convenient, only 2 blocks to the Org...","['TV', 'Cable TV', 'Internet', 'Wifi', 'Air co...",True,Strict (grace period),5.0,15,,,America/New_York,Point,"[-75.17051, 39.92027]",True,2016-03-26T18:36:33.873Z,2020-09-29T06:04:00.123Z,60.0,60.0,False,False,False,,60.0,,15,160.0,,Philadelphia,0.8,,,0.9,,45864129,10250668,24935175,35.0,1503.0,127.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0
4,success,24931732,Philadelphia,https://a0.muscache.com/im/pictures/b39accd6-c...,https://a0.muscache.com/im/pictures/b39accd6-c...,https://a0.muscache.com/im/pictures/b39accd6-c...,https://a0.muscache.com/im/pictures/b39accd6-c...,53420712,125,USD,125,$125,39.94862,-75.14515,United States,Sun-filled Old City Apartment,"Philadelphia, PA",False,False,1.0,1,1.0,Philadelphia,2,Old City,2,PA,19106,"Philadelphia, PA, United States",US,moderate,Apartment,185,Entire home/apt,entire_home,12,$,,Real Bed,real_bed,False,False,False,4,15.0,11.0,1,,365,,en,True,,True,,en,"Philadelphia, PA, United States",https://maps.googleapis.com/maps/api/staticmap...,none,365.0,2.0,False,1,,100.0,100.0,$100,"Beautiful, bright, 1 BR apartment with large b...",en,"Beautiful, bright, 1 BR apartment with large b...","Large bedroom, LR/DR/Den, full kitchen.\n\nTh...",Please note the apartment is on the 2nd floor ...,,There are always great festivals and events go...,Parking:\nThe apartment is located in Old City...,"['TV', 'Cable TV', 'Internet', 'Wifi', 'Air co...",True,Moderate,5.0,0,,,America/New_York,Point,"[-75.14515, 39.94862]",True,2016-03-26T15:44:06.465Z,2020-09-29T07:01:48.081Z,50.0,50.0,False,False,True,,50.0,,0,125.0,,Philadelphia,0.8,,,0.9,,53420712,10377860,24931732,23.0,796.0,64.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0


In [224]:
#detailed_listings_back[(detailed_listings_back['Keurig coffee machine'] == 1) & (detailed_listings_back['Pour Over Coffee'] ==1)]
#np.sum(detailed_listings_back['Coffee maker']==1)

In [225]:
# pd.set_option('display.width', None)
# #pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.max_colwidth', -1)
# detailed_listings_back[['name', 'Kitchen', 'Full kitchen', 'Kitchenette', 'Oven', 'Keurig coffee machine', 'Coffee maker']]
# #detailed_listings_back.head(5)

#detailed_listings_back[(detailed_listings_back['Pour Over Coffee'] == 1) & (detailed_listings_back['Keurig coffee machine'] ==1)]
#sum(detailed_listings_back['Coffee maker'] ==1)
# detailed_listings_back['Coffee Type'] = 'None'
# if detailed_listings_back['Coffee Maker'] == 1:
#     detailed_listings_back['Coffee Type'] = 'Coffee Maker'
# elif detailed_listings_back['Keurig Coffee Machine'] = 1:
#     detailed_listings_back['Coffee Type'] = 'Keurig Coffee Machine'
# elif detailed_listings_back['']

# if detailed_listings_back['Keurig Coffee Machine'] ==1:
#     detailed_listings_back['Coffee Maker'] = 0
#sum(detailed_listings_back['Kitchen']==1)
#detailed_listings_back.shape

# Kitchen Variable

In [226]:
#detailed_listings_back[['name','Kitchen', 'Full kitchen', 'Kitchenette']]
#detailed_listings_back.Kitchen

# Washer/Dryer

###I dropped rows that have a dryer but no washer

In [227]:
##detailed_listings_back=detailed_listings_back.drop(detailed_listings_back[(detailed_listings_back['Washer'] == 0) & (detailed_listings_back['Dryer'] ==1)].index, axis=0)
#print("Dataset shape is " + str(detailed_listings_back.shape))
#print(str(InitialMissingCells) + " null cells in initial dataset")
##print(str(np.sum(detailed_listings_back.isnull().sum()))+ " in current dataset")
#print(str(np.sum(detailed_listings_back.isnull().sum())/detailed_listings_back.shape[0]/detailed_listings_back.shape[1]) + "% of dataset is null")

In [228]:
detailed_listings_csv_data = detailed_listings_back.to_csv('l1_detailed_listings.csv', index = False) 

# Column Drops

In [229]:
#Drops for relevance: status, id,picture_url, thumbnail_url, medium_url,xl_picture_url, country, country_code,
#market, state, bed_type (identical to bed_type_category), amenities (unnescesary after binarization), 
#native_price(identical to price), price_formatted(formatted as string), security_deposit_formatted (formatted as string)
#security_price_native(identical to security_deposit_native), smart_location(identical to city), address (identical to city)
#loc.coordinates(identical to lat,lng), localized_city(identical to city), airbnb_id(each one is unique), mashvisor_id(each is unique)
#host_id(identical to user_id), listing_monthly_price_native (identical to monthly_price_native),
#listing_weekly_price_native (identical to weekly_price_native), extras_price_native (identical to cleaning_fee_native),
#listing_cleaning_fee_native(identical to cleaning_fee_native), language(identical to locale), Other (Not meaningful),
#listing_price_for_extra_person_native, min_nights_input_value, max_nights_input_value
#cancel_policy, has_agreed_to_legal_terms, property_type_id, cancel_policy_short_str

#Drops for simplicity: name, public_address, house_rules, description, summary, space, access, interaction, transit,
#neighborhood_overview, user_id, created_at, updated_at, nights_booked, description_locale

#Drops for variance (ANOVA testing, see article or presentation):zipcode, city, bed_type_category, 
#is_location_exact, instant_bookable, require_guest_profile_picture, require_guest_phone_verification, locale

#Drops where 80% or better of data is missing:is_business_travel_ready, weekly_price_native, monthly_price_native 

detailed_drops = ["status","id","price_native", "price_formatted", "picture_url", "thumbnail_url", "name",
                  "medium_url", "xl_picture_url", "security_deposit_formatted", "smart_location", "address",
                  "country","country_code","security_price_native", "is_business_travel_ready",
                  "listing_monthly_price_native","listing_weekly_price_native", "extras_price_native",
                  "listing_cleaning_fee_native","language", "Other", "cancel_policy_short_str", "description_locale",
                 "market", "state","currency_symbol_right", "license","square_feet", "map_image_url",
                  "has_viewed_terms","has_viewed_cleaning", "has_viewed_ib_perf_dashboard_panel","special_offer",
                  "toto_opt_in","wireless_info", "amenities", "bed_type", "public_address", "house_rules",
                  "description", "summary", "space", "access", "interaction", "loc.coordinates","airbnb_id",
                  "mashvisor_id", "localized_city", "public_address", "house_rules", "description", "summary",
                 "space", "access", "interaction", "transit", "neighborhood_overview", "host_id", "locale", "max_nights",
                  "weekly_price_native", "monthly_price_native", "listing_price_for_extra_person_native", "min_nights_input_value",
                 "max_nights_input_value", 'cancel_policy', "has_agreed_to_legal_terms", "updated_at", "created_at",
                  "property_type_id", "nights_booked", "lat", "lng", "user_id", "zipcode", "city", "bed_type_category",
                 "is_location_exact", "instant_bookable", "require_guest_profile_picture", "require_guest_phone_verification",
                 "translation missing: en.hosting_amenity_49", "translation missing: en.hosting_amenity_50"]
for detailed_drop in detailed_drops:
    if detailed_drop in detailed_listings_back.columns:
        detailed_listings_back = detailed_listings_back.drop(detailed_drop, axis = 1)

for i in detailed_listings_back.columns:
    if len(detailed_listings_back[i].unique()) == 1:
        if np.any(detailed_listings_back[i].isnull()) == False:
            detailed_listings_back = detailed_listings_back.drop(i, axis = 1)
        
detailed_listings_back

print("Dataset shape is " + str(detailed_listings_back.shape))
print(str(InitialMissingCells) + " null cells in initial dataset")
print(str(np.sum(detailed_listings_back.isnull().sum()))+ " in current dataset")
print(str(np.sum(detailed_listings_back.isnull().sum())/detailed_listings_back.shape[0]/detailed_listings_back.shape[1]) + "% of dataset is null")

Dataset shape is (1497, 72)
25057 null cells in initial dataset
400 in current dataset
0.003711125955614934% of dataset is null


# Imputation

##### Proportion of Missing Rows in Each Column (NaN/NA)

In [230]:
#from sklearn.model_selection import train_test_split
#from sklearn.impute import KNNImputer
#imputer=KNNImputer

In [231]:
#detailed_listings_back.bedrooms.hist()

In [232]:
#imputer.fit_transform(detailed_listings_back.drop(['rental_income','occupancy'], axis=1), detailed_listings_back.rental_income)
#dlb5.bedrooms.hist()

In [233]:
print(detailed_listings_back.isnull().sum()/detailed_listings_back.shape[0])
#print(X_train.isnull().sum()/X_train.shape[0])

price                            0.000000
bedrooms                         0.000668
beds                             0.000000
bathrooms                        0.000000
min_nights                       0.000000
neighborhood                     0.000000
person_capacity                  0.000000
cancellation_policy              0.000000
property_type                    0.000000
reviews_count                    0.000000
picture_count                    0.000000
check_in_time                    0.138945
check_out_time                   0.034736
guests_included                  0.000000
security_deposit_native          0.074816
star_rating                      0.000000
price_for_extra_person_native    0.000000
cleaning_fee_native              0.018036
instant_book_enabled             0.000000
listing_weekend_price_native     0.000000
monthly_price_factor             0.000000
weekly_price_factor              0.000000
occupancy                        0.000000
rental_income                    0

# Dummification

In [234]:
from sklearn.pipeline import Pipeline
categorical_columns = ['city', 'instant_bookable', 'neighborhood',
                      'zipcode','property_type','cancellation_policy',
                      'bed_type_category', 'require_guest_profile_picture',
                      'require_guest_phone_verification', 
                      'locale', 'property_type_id', 'description_locale',
                      'is_location_exact', 'cancel_policy_short_str',
                      'instant_book_enabled', 'user_id']
theres = []

for col in categorical_columns:
    if col in detailed_listings_back.columns:
        print(col)
        theres.append(col)
        #categorical_columns.remove(col)
X = pd.get_dummies(data=detailed_listings_back,columns = theres, prefix_sep = "__")
y_ri = list(detailed_listings_back["rental_income"])


neighborhood
property_type
cancellation_policy
instant_book_enabled


# Outlier Drops

In [235]:
#Dropping a handful of points in regions for which there was limited data improved model performance dramatically
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
#X['amCount'] = amenities_df.sum(axis=1)
X = X[detailed_listings_back.bedrooms < 5]
X = X[X.rental_income > 30]
X = X[X.rental_income < 10000]

In [236]:
X.to_pickle('Plotly-Dash/app.data')

In [237]:
detailed_listings_csv_data = X.to_csv('l2_detailed_listings.csv', index = False)
detailed_listings_csv_data = X.to_csv("Plotly-Dash/l2_detailed_listings.csv", index = False)

In [238]:
#y=X.rental_income
#X_r=X.drop("rental_income", axis=1)
X_train, X_test = train_test_split(X, test_size=0.2)

# Modeling

Scaling doesn't appear to offer a performance benefit over normalization

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.impute import KNNImputer
Rimputer = KNNImputer()
scaler = StandardScaler()

RidgeA_pipeline = Pipeline(steps=[('imputer',Rimputer),
                                  ('s', StandardScaler()), 
                                  ('R', Ridge(normalize=False))])
grid_search_ridge_pipeline = GridSearchCV(
       estimator=RidgeA_pipeline,
       cv = 10,
       param_grid=[{
           'R__alpha': np.linspace(.001,100,100)}],
       return_train_score=True,
       scoring= 'neg_mean_absolute_error', 
       verbose=0)

grid_search_ridge_pipeline=grid_search_ridge_pipeline.fit(X_train.drop(["rental_income","occupancy"], axis=1), X_train.rental_income)

In [None]:
from sklearn.metrics import r2_score

print('train MAE:',grid_search_ridge_pipeline.score(X_train.drop(["rental_income","occupancy"], axis=1),X_train.rental_income))
print('test MAE:',grid_search_ridge_pipeline.score(X_test.drop(["rental_income","occupancy"], axis=1),X_test.rental_income))
print("Train R2:" ,r2_score(X_train.rental_income, grid_search_ridge_pipeline.predict(X_train.drop(["rental_income","occupancy"], axis=1))))
print("Test R2:" ,r2_score(X_test.rental_income, grid_search_ridge_pipeline.predict(X_test.drop(["rental_income","occupancy"], axis=1))))

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
Rimputer = KNNImputer()


RidgeN_pipeline = Pipeline(steps=[('imputer',Rimputer),
                                  ('R', Ridge(normalize=True))])
grid_search_ridge_pipelineN = GridSearchCV(
       estimator=RidgeN_pipeline,
       cv = 10,
       param_grid=[{
           'R__alpha': np.linspace(.001,100,100)}],
       return_train_score=True,
       scoring= 'neg_mean_absolute_error', 
       verbose=0)

grid_search_ridge_pipelineN=grid_search_ridge_pipelineN.fit(X_train.drop(["rental_income","occupancy"], axis=1), X_train.rental_income)

In [None]:
print('train MAE:',grid_search_ridge_pipelineN.score(X_train.drop(["rental_income","occupancy"], axis=1),X_train.rental_income))
print('test MAE:',grid_search_ridge_pipelineN.score(X_test.drop(["rental_income","occupancy"], axis=1),X_test.rental_income))
print("Train R2:" ,r2_score(X_train.rental_income, grid_search_ridge_pipelineN.predict(X_train.drop(["rental_income","occupancy"], axis=1))))
print("Test R2:" ,r2_score(X_test.rental_income, grid_search_ridge_pipelineN.predict(X_test.drop(["rental_income","occupancy"], axis=1))))

In [101]:
import Models as mo
from sklearn.linear_model import LinearRegression
from imp import reload
reload(mo)

mob = mo.Models(LinearRegression(n_jobs=-1), 'Linear')
output = mob.performRegressions(X, drop = "occupancy")

alpha    Value:  0.001


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

In [None]:
print(output['Lasso'].modelCV.best_params_)
print(output['Ridge'].modelCV.best_params_)
print(output['Elastic Net'].modelCV.best_params_)
print(output['Random Forest'].modelCV.best_params_)
print(output['Gradient Boost'].modelCV.best_params_)
#print(output['SVM'].modelCV.best_params_)

In [116]:
from sklearn.impute import KNNImputer
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from mlxtend.regressor import StackingRegressor
imputer=KNNImputer()
mlr=LinearRegression()
ridge=Ridge()
lasso=Lasso()
ElasticNet=ElasticNet()
rf=RandomForestRegressor()
gb=GradientBoostingRegressor()
regressors= [ridge,lasso,ElasticNet,rf,gb]
stack=StackingRegressor(regressors=regressors,
                        meta_regressor=mlr,
                        use_features_in_secondary=True,
                           store_train_meta_features=True,)


params= {'lasso__alpha': np.linspace(.001,100,100),
         'ridge__alpha': np.linspace(.001,100,100),
         'ElasticNet__alpha': np.linspace(.001,100,100),
         'ElasticNet__l1_ratio': np.linspace(0, 1, 10),
         'rf__n_estimators': range(100, 1000, 300),
         'rf__max_features': ["auto", "sqrt", "log2"],
         'rf__max_depth': range(1,15,4),
         'gb__learning_rate':np.linspace(.001,.1,10),
         'gb__n_estimators': range(100, 1000, 300),
         "gb__max_features":["auto", "sqrt", "log2"],
         "gb__max_depth": range(1, 15, 4)}


                              
pipeline_ri=Pipeline([('imputer', imputer),
                     ('stack', stack)])
grid_search_stack = GridSearchCV(
       estimator=pipeline_ri,
       cv = 5,
       param_grid=params,
       return_train_score=True,
       scoring= 'neg_mean_absolute_error', 
       verbose=2)
                              

In [117]:
grid_search_stack=grid_search_stack.fit(X.drop(["rental_income","occupancy"], axis=1), X.rental_income)

import pickle
filename = 'Plotly-Dash/finalized_model_ri.sav'
pickle.dump(grid_search_stack, open(filename, 'wb'))

MemoryError: 

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
print('train MAE:', mean_absolute_error(y_train, grid_search_stack.predict(X_train.drop("occupancy", axis=1))))
print('test MAE:', mean_absolute_error(y_test, grid_search_stack.predict(X_test.drop("occupancy", axis=1))))    
print("Train R2:" ,r2_score(y_train, grid_search_stack.predict(X_train.drop("occupancy", axis=1))))
print("Test R2:" ,r2_score(y_test, grid_search_stack.predict(X_test.drop("occupancy", axis=1))))

In [None]:
grid_search_stack.best_params_


In [None]:
import pickle
filename = 'Plotly-Dash/finalized_model_ri.sav'
pickle.dump(stack, open(filename, 'wb'))

In [None]:
import Models as mo
from sklearn.linear_model import LinearRegression
from imp import reload
reload(mo)

mobO = mo.Models(LinearRegression(n_jobs=-1), 'Linear')
outputO = mobO.performRegressions(X, target = 'occupancy' , drop = "rental_income")

In [None]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from mlxtend.regressor import StackingCVRegressor
stackO = StackingCVRegressor(
       regressors=(LinearRegression(), 
              Lasso(**outputO['Lasso'].modelCV.best_params_), 
              Ridge(**outputO['Ridge'].modelCV.best_params_),
              ElasticNet(**outputO['Elastic Net'].modelCV.best_params_)),
              RandomForestRegressor(**outputO['Random Forest'].modelCV.best_params_),
              GradientBoostingRegressor(**outputO['Gradient Boost'].modelCV.best_params_)),
              #SVR(**outputO['SVM'].modelCV.best_params_),
       meta_regressor=Lasso(), 
       use_features_in_secondary=True,
       store_train_meta_features=True,
       shuffle=False,
       random_state=1)

In [None]:
grid_search_stackO = GridSearchCV(
       estimator=stackO,
       cv = 5,
       param_grid=[{
           'meta_regressor': [Lasso(), Ridge(), ElasticNet(), RandomForestRegressor(), GradientBoostingRegressor()]}],
       return_train_score=True,
       scoring= 'neg_mean_absolute_error', 
       verbose=0)

grid_search_stackO=grid_search_stackO.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
print('train MAE:', mean_absolute_error(X_train.occupancy, stackO.predict(X_train.drop("occupancy", axis=1))))
print('test MAE:', mean_absolute_error(X_test.occupancy, stackO.predict(X_test.drop("occupancy", axis=1))))    
print("Train R2:" ,r2_score(X_train.occupancy, stackO.predict(X_train.drop("occupancy", axis=1))))
print("Test R2:" ,r2_score(X_test.occupancy, stackO.predict(X_test.drop("occupancy", axis=1))))

In [None]:
import Models as mo
from sklearn.linear_model import LinearRegression
from imp import reload
reload(mo)

mobO = mo.Models(LinearRegression(n_jobs=-1), 'Linear')
outputO = mobO.performRegressions(result, target = 'listings_price_weekend_native' , drop = "rental_income")

In [None]:
import pickle
filenameO = 'Plotly-Dash/finalized_model_O.sav'
pickle.dump(stackO, open(filenameO, 'wb'))

# Feature Importance

In [None]:
importance = output['Random Forest'].model.feature_importances_
# summarize feature importance
for i,v in enumerate(importance):
    print('Feature: %0d, Score: %.5f' % (i,v))


In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
sfsri = SFS(stack, 
           k_features=(6 ),#result.shape[1]-2), 
           forward=True, 
           floating=False, 
           verbose=2,
           scoring='neg_mean_absolute_error',
           cv=2)
sfsri.fit(result.drop(['rental_income','occupancy'], axis = 1), y)
ranking = sfsri.subsets_

In [None]:
ranking

In [None]:
ammenity_ranking = [n for n in ranking[3]['feature_names'] if n in amenities_df.columns]

In [None]:
ammenity_ranking

In [None]:
yO = result['occupancy']
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
sfsO = SFS(stackO, 
           k_features=(25 ),#result.shape[1]-2), 
           #forward=True, 
           floating=False, 
           verbose=2,
           scoring='neg_mean_absolute_error',
           cv=0)
sfsO.fit(result.drop(['rental_income','occupancy'], axis = 1), y)
rankingO = sfsO.subsets_

In [None]:
rankingO

In [None]:
ammenity_rankingO = [n for n in rankingO[3]['feature_names'] if n in amenities_df.columns]

In [None]:
from scipy import stats

y_train_bx,fitted_lambda = stats.boxcox(y_train)
y_test_bx = stats.boxcox(y_test, fitted_lambda)


In [None]:
Lasso = Lasso(alpha = 1,tol = 0.00001, max_iter = 10000, normalize = True)
grid_search_lasso = GridSearchCV(
       estimator=Lasso,
       cv = 10,
       param_grid=[{
           'alpha': np.linspace(.001,1,100)}],
       return_train_score=True,
       scoring= 'neg_mean_absolute_error', 
       verbose=0)

grid_search_lasso_fit=grid_search_lasso.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_absolute_error
print('train MAE:',grid_search_lasso_fit.score(X_train, y_train),'  test MAE:',grid_search_lasso_fit.score(X_test, y_test))
print("Train R2:" ,r2_score(y_train, grid_search_lasso_fit.predict(X_train)))
print("Test R2:" ,r2_score(y_test, grid_search_lasso_fit.predict(X_test)))

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor(loss = "lad")

grid_para_tree = [{
    "n_estimators": range(100,1000,3)}]

grid_search_gbr = GridSearchCV(gbr, grid_para_tree, cv=2, scoring= 'neg_mean_squared_error', n_jobs=-1)
grid_search_gbr = grid_search_gbr.fit(X_train, y_train)


In [None]:
from sklearn.ensemble import GradientBoostingRegressor
param_dict = {'loss': "lad"}
grid_para_tree = [{
    "n_estimators": range(100,1000,3),
    "max_features":["auto", "sqrt", "log2"],
    "max_depth": range(1, 15, 4),
    "min_samples_split": range(2,50,3)}]

for params in param_dict[0]:
    gbr = GradientBoostingRegressor()
    gbr.set_params(**param_dict)
    print(gbr.get_params())
    gbr.set_params(**params)
    print(gbr.get_params())
    
#for params in grid_para_tree.values():
#    print(params)
#grid_para_tree[0]

In [None]:
print('train MSE w/ BoxCox:',grid_search_gbr.score(X_train, y_train_bx),'  test MSE w/ BoxCox:',grid_search_gbr.score(X_test, y_test_bx))
print('train MSE:',grid_search_gbr.score(X_train, y_train),'  test MSE:',grid_search_gbr.score(X_test, y_test))
print('train MAE:', mean_absolute_error(y_train, grid_search_gbr.predict(X_train)))
print('test MAE:', mean_absolute_error(y_test, grid_search_gbr.predict(X_test)))      
print("train R2:" ,r2_score(y_train, grid_search_gbr.predict(X_train)))
print("test R2:" ,r2_score(y_test, grid_search_gbr.predict(X_test)))
print(grid_search_gbr.best_params_)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor(loss = "lad")
gbr.set_params(**grid_search_gbr.best_params_)

grid_para_tree = [{mklbub
    "max_features":["auto", "sqrt", "log2"]}]

grid_search_gbr_feat = GridSearchCV(gbr, grid_para_tree, cv=2, scoring= 'neg_mean_absolute_error', n_jobs=-1)
grid_search_gbr_feat = grid_search_gbr_feat.fit(X_train, y_train)

In [None]:
print('train MSE w/ BoxCox:',grid_search_gbr_feat.score(X_train, y_train_bx),'  test MSE w/ BoxCox:',grid_search_gbr_feat.score(X_test, y_test_bx))
print('train MSE:',grid_search_gbr_feat.score(X_train, y_train),'  test MSE:',grid_search_gbr_feat.score(X_test, y_test))
print('train MAE:', mean_absolute_error(y_train, grid_search_gbr_feat.predict(X_train)))
print('test MAE:', mean_absolute_error(y_test, grid_search_gbr_feat.predict(X_test)))      
print("train R2:" ,r2_score(y_train, grid_search_gbr_feat.predict(X_train)))
print("test R2:" ,r2_score(y_test, grid_search_gbr_feat.predict(X_test)))
print(grid_search_gbr_feat.best_estimator_)

In [None]:
grid_search_gbr_feat.best_estimator_

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor(loss = "lad")
gbr.set_params(**grid_search_gbr.best_params_)
gbr.set_params(**grid_search_gbr_feat.best_params_)
grid_para_tree = [{
    "max_depth": range(1, 15, 4)
}]

grid_search_gbr_depth = GridSearchCV(gbr, grid_para_tree, cv=2, scoring= 'neg_mean_squared_error', n_jobs=-1)
grid_search_gbr_depth = grid_search_gbr_depth.fit(X_train, y_train)

In [None]:
print('train MSE w/ BoxCox:',grid_search_gbr_depth.score(X_train, y_train_bx),'  test MSE w/ BoxCox:',grid_search_gbr_depth.score(X_test, y_test_bx))
print('train MSE:',grid_search_gbr_depth.score(X_train, y_train),'  test MSE:',grid_search_gbr_depth.score(X_test, y_test))
print('train MAE:', mean_absolute_error(y_train, grid_search_gbr_depth.predict(X_train)))
print('test MAE:', mean_absolute_error(y_test, grid_search_gbr_depth.predict(X_test)))      
print("train R2:" ,r2_score(y_train, grid_search_gbr_depth.predict(X_train)))
print("test R2:" ,r2_score(y_test, grid_search_gbr_depth.predict(X_test)))
print(grid_search_gbr_depth.best_estimator_)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
gbr = GradientBoostingRegressor(loss = "lad")
gbr.set_params(**grid_search_gbr.best_params_)
gbr.set_params(**grid_search_gbr_feat.best_params_)
gbr.set_params(**grid_search_gbr_depth.best_params_)
print(gbr.get_params_)
grid_para_tree = [{
    "min_samples_split": range(2,50,3)
}]

grid_search_gbr_split = GridSearchCV(gbr, grid_para_tree, cv=5, scoring= 'neg_mean_squared_error', n_jobs=-1)
grid_search_gbr_split = grid_search_gbr.split.fit(X_train, y_train)

In [None]:
print('train MSE w/ BoxCox:',grid_search_gbr_split.score(X_train, y_train_bx),'  test MSE w/ BoxCox:',grid_search_gbr_split.score(X_test, y_test_bx))
print('train MSE:',grid_search_gbr_split.score(X_train, y_train),'  test MSE:',grid_search_gbr_split.score(X_test, y_test))
print('train MAE:', mean_absolute_error(y_train, grid_search_gbr_split.predict(X_train)))
print('test MAE:', mean_absolute_error(y_test, grid_search_gbr_split.predict(X_test)))      
print("train R2:" ,r2_score(y_train, grid_search_gbr_split.predict(X_train)))
print("test R2:" ,r2_score(y_test, grid_search_gbr_split.predict(X_test)))
print(grid_search_gbr_split.best_estimator_)

In [None]:
#James suggests running a couple between 100 and 1000 coursely to determine whether CV is necessary
#Not using MAE as criterion because sklearn RF runs in NO^2 time which is prohibitive
rf = RandomForestRegressor(n_estimators = 100)
grid_para_tree = [{
    "n_estimators": range(100,1000,3)}]

grid_search_rf = GridSearchCV(rf, grid_para_tree, cv=2, scoring='mae', n_jobs=-1)
grid_search_rf = grid_search_rf.fit(X_train, y_train)

In [None]:
print('train MSE w/ BoxCox:',grid_search_rf.score(X_train, y_train_bx),'  test MSE w/ BoxCox:',grid_search_rf.score(X_test, y_test_bx))
print('train MSE:',grid_search_rf.score(X_train, y_train),'  test MSE:',grid_search_rf.score(X_test, y_test))
print('train MAE:', mean_absolute_error(y_train, grid_search_rf.predict(X_train)))
print('test MAE:', mean_absolute_error(y_test, grid_search_rf.predict(X_test)))      
print("train R2:" ,r2_score(y_train, grid_search_rf.predict(X_train)))
print("test R2:" ,r2_score(y_test, grid_search_rf.predict(X_test)))
print(grid_search_rf.best_params_)

In [None]:
print('train MAE:',grid_search_rf.score(X_train, y_train),'  test MAE:',grid_search_rf.score(X_test, y_test))
#print('train MAE:',mean_absolute_error(grid_search_rf.predict(X_train, y_train),ytrain),'  test MAE:',mean_absolute_error(grid_search_rf.predict(X_test, y_test), y_test)

In [None]:
rf = RandomForestRegressor(n_estimators = 100, criterion = "mae")
grid_para_tree = [{
    "max_depth": range(1, 15, 4),
    "min_samples_split": range(2,50,3),
    }]
#tree_model.set_params(random_state=34)
grid_search_rf = GridSearchCV(rf, grid_para_tree, cv=5, scoring='mae', n_jobs=-1)
grid_search_rf = grid_search_rf.fit(X_train, y_train)

In [None]:
print('train MAE:',grid_search_rf.score(X_train, y_train),'  test MAE:',grid_search_rf.score(X_test, y_test))

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
Lasso = Lasso(alpha = 1,tol = 0.00001, max_iter = 10000, normalize = True)
grid_search_lasso = GridSearchCV(
       estimator=Lasso,
       cv = 10,
       param_grid=[{
           'alpha': np.linspace(.001,1,100)}],
       return_train_score=True,
       scoring= 'neg_mean_absolute_error', 
       verbose=0)

grid_search_lasso_fit_ri=grid_search_lasso.fit(X_train, y_train_ri)

In [None]:

print('train MAE:',grid_search_lasso_fit.score(X_train_ri, y_train_ri),'  test MAE:',grid_search_lasso_fit.score(X_test_ri, y_test_ri))
print("Train R2:" ,r2_score(y_train_ri, grid_search_lasso_fit_ri.predict(X_train_ri)))
print("Test R2:" ,r2_score(y_test_ri, grid_search_lasso_fit_ri.predict(X_test_ri)))

In [None]:
Ridge = Ridge(alpha = 1,tol = 0.00001, max_iter = 10000, normalize = True)
grid_search_ridge = GridSearchCV(
       estimator=Ridge,
       cv = 10,
       param_grid=[{
           'alpha': np.linspace(.001,1,100)}],
       return_train_score=True,
       scoring= 'neg_mean_absolute_error', 
       verbose=0)

grid_search_ridge_ri=grid_search_ridge.fit(X_train_ri, y_train_ri)

In [None]:
print('train MAE:',grid_search_ridge_ri.score(X_train_ri, y_train_ri),'  test MAE:',grid_search_ridge_ri.score(X_test_ri, y_test_ri))
print("Train R2:" ,r2_score(y_train_ri, grid_search_ridge_ri.predict(X_train_ri)))
print("Test R2:" ,r2_score(y_test_ri, grid_search_ridge_ri.predict(X_test_ri)))

In [None]:
#James suggests running a couple between 100 and 1000 coursely to determine whether CV is necessary
rf = RandomForestRegressor(n_estimators = 100)
grid_para_tree = [{
    "n_estimators": range(100,1000,3)}]
    #"max_features":["auto", "sqrt", "log2"]}]
#tree_model.set_params(random_state=34)
grid_search_rf_ri = GridSearchCV(rf, grid_para_tree, cv=1, scoring='mae', n_jobs=-1)
grid_search_rf_ri = grid_search_rf.fit(X_train_ri, y_train_ri)

In [None]:
print('train MAE:',grid_search_rf_ri.score(X_train_ri, y_train_ri),'  test MAE:',grid_search_rf_ri.score(X_test_ri, y_test_ri))
print("Train R2:" ,r2_score(y_rf_ri, grid_search_rf_ri.predict(X_train_ri)))
print("Test R2:" ,r2_score(y_rf_ri, grid_search_rf_ri.predict(X_test_ri)))

In [None]:
pip install rpy2

In [None]:
import rpy2.robjects.lib.ggplot2 as ggplot2
import rpy2.robjects.lib.pdp as pdp
import rpy2.robjects as ro
from rpy2.robjects.packages import importr
base = importr('base')
    #partial(stack, pred.var = "", plot = TRUE, train = X,  plot.engine = "ggplot2")

