# This notebook will select key features that have a high correlation to income

This notebook will go over:
- Feature engineering
- Creating Dummy Variables for cities
- Using Elastic Net as EDA --> decided to not use this model to pick features
- Using SelectKBest to select optimal features --> only picked cities thus, we won't be using this 
- Using for-loop to find optimal features --> utilizing this method to model
- Cross val score before PCA
- Cross Val Score of models to determine which models to GridSearch (PCA)
- GridSearch top performing models
- Transform demo model to prep for testing (on Flask)
- Save model in pickle for flask
- Predicting income using demo model to make sure flask is performing correctly

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('./datasets/Grandma.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,zip_code,count,latitude,longitude,Price_1,Price_2,Price_3,Price_4,Rating_1,...,wineries,winetasteclasses,winetastingroom,winetours,womenscloth,wraps,yelpevents,yoga,zoos,income
0,0,90001,30,33.97653,-118.24923,18,12,0,0,0,...,0,0,0,0,0,0,0,0,0,35660
1,1,90002,4,33.948102,-118.248582,3,1,0,0,0,...,0,0,0,0,0,0,0,0,0,34000
2,2,90003,20,33.971906,-118.272539,16,4,0,0,0,...,0,0,0,0,0,0,0,0,0,34397
3,3,90004,29,34.076563,-118.310331,10,19,0,0,0,...,0,0,0,0,0,0,0,0,0,46581
4,4,90005,45,34.059083,-118.298835,11,33,1,0,0,...,0,0,0,0,0,0,0,0,0,32461


# Feature Engineering Price

In [3]:
df['price_1_percentage'] = df['Price_1']/ df['count']

In [4]:
#sanity check if it worked 
df.head()

Unnamed: 0.1,Unnamed: 0,zip_code,count,latitude,longitude,Price_1,Price_2,Price_3,Price_4,Rating_1,...,winetasteclasses,winetastingroom,winetours,womenscloth,wraps,yelpevents,yoga,zoos,income,price_1_percentage
0,0,90001,30,33.97653,-118.24923,18,12,0,0,0,...,0,0,0,0,0,0,0,0,35660,0.6
1,1,90002,4,33.948102,-118.248582,3,1,0,0,0,...,0,0,0,0,0,0,0,0,34000,0.75
2,2,90003,20,33.971906,-118.272539,16,4,0,0,0,...,0,0,0,0,0,0,0,0,34397,0.8
3,3,90004,29,34.076563,-118.310331,10,19,0,0,0,...,0,0,0,0,0,0,0,0,46581,0.344828
4,4,90005,45,34.059083,-118.298835,11,33,1,0,0,...,0,0,0,0,0,0,0,0,32461,0.244444


In [5]:

df['price_2_percentage'] = df['Price_2']/ df['count']
df['price_3_percentage'] = df['Price_3']/ df['count']
df['price_4_percentage'] = df['Price_4']/ df['count']

In [6]:
df.head()

Unnamed: 0.1,Unnamed: 0,zip_code,count,latitude,longitude,Price_1,Price_2,Price_3,Price_4,Rating_1,...,womenscloth,wraps,yelpevents,yoga,zoos,income,price_1_percentage,price_2_percentage,price_3_percentage,price_4_percentage
0,0,90001,30,33.97653,-118.24923,18,12,0,0,0,...,0,0,0,0,0,35660,0.6,0.4,0.0,0.0
1,1,90002,4,33.948102,-118.248582,3,1,0,0,0,...,0,0,0,0,0,34000,0.75,0.25,0.0,0.0
2,2,90003,20,33.971906,-118.272539,16,4,0,0,0,...,0,0,0,0,0,34397,0.8,0.2,0.0,0.0
3,3,90004,29,34.076563,-118.310331,10,19,0,0,0,...,0,0,0,0,0,46581,0.344828,0.655172,0.0,0.0
4,4,90005,45,34.059083,-118.298835,11,33,1,0,0,...,0,0,0,0,0,32461,0.244444,0.733333,0.022222,0.0


# Feature Engineering Ratings

In [7]:
ratings_col = [col for col in df if col.startswith('Rating')]
ratings_col

['Rating_1',
 'Rating_1.5',
 'Rating_2',
 'Rating_2.5',
 'Rating_3',
 'Rating_3.5',
 'Rating_4',
 'Rating_4.5',
 'Rating_5']

In [8]:
df['rate_1_percentage'] = df['Rating_1']/ df['count']
df['rate_1.5_percentage'] = df['Rating_1.5']/ df['count']
df['rate_2_percentage'] = df['Rating_2']/ df['count']

df['rate_2.5_percentage'] = df['Rating_2.5']/ df['count']
df['rate_3_percentage'] = df['Rating_3']/ df['count']
df['rate_3.5_percentage'] = df['Rating_3.5']/ df['count']

df['rate_4_percentage'] = df['Rating_4']/ df['count']
df['rate_4.5_percentage'] = df['Rating_4.5']/ df['count']
df['rate_5_percentage'] = df['Rating_5']/ df['count']

In [9]:
df.head()

Unnamed: 0.1,Unnamed: 0,zip_code,count,latitude,longitude,Price_1,Price_2,Price_3,Price_4,Rating_1,...,price_4_percentage,rate_1_percentage,rate_1.5_percentage,rate_2_percentage,rate_2.5_percentage,rate_3_percentage,rate_3.5_percentage,rate_4_percentage,rate_4.5_percentage,rate_5_percentage
0,0,90001,30,33.97653,-118.24923,18,12,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.2,0.266667,0.433333,0.1
1,1,90002,4,33.948102,-118.248582,3,1,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.75,0.25,0.0
2,2,90003,20,33.971906,-118.272539,16,4,0,0,0,...,0.0,0.0,0.0,0.0,0.05,0.05,0.15,0.35,0.35,0.05
3,3,90004,29,34.076563,-118.310331,10,19,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.034483,0.655172,0.206897,0.103448
4,4,90005,45,34.059083,-118.298835,11,33,1,0,0,...,0.0,0.0,0.0,0.0,0.0,0.022222,0.133333,0.511111,0.333333,0.0


# Creating Dummy Variables for Cities

In [10]:
df = pd.get_dummies(df, columns = ['city'])

In [11]:
df.head()

Unnamed: 0.1,Unnamed: 0,zip_code,count,latitude,longitude,Price_1,Price_2,Price_3,Price_4,Rating_1,...,city_Yorba Linda,city_Yosemite National Park,city_Yountville,city_Yreka,city_Yuba City,city_Yucaipa,city_Yucca Valley,city_Zamora,city_Zillah,city_carlsbad
0,0,90001,30,33.97653,-118.24923,18,12,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,90002,4,33.948102,-118.248582,3,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,90003,20,33.971906,-118.272539,16,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,90004,29,34.076563,-118.310331,10,19,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,90005,45,34.059083,-118.298835,11,33,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
df.to_csv('./datasets/ALLMOTHERS.csv')

# Let's select key features we want to feed in our Elastic Net Model to determine best optimal features for other regression models
- This is called modeling as EDA

In [13]:
#category list
categories = list(df.loc[:, 'acaibowls':'zoos'].columns)

In [14]:
len(categories)

468

In [15]:
#creating a list of all percentage columns (should be 13 columns total)
percentage_col = [col for col in df if col.endswith('percentage')]
percentage_col

['price_1_percentage',
 'price_2_percentage',
 'price_3_percentage',
 'price_4_percentage',
 'rate_1_percentage',
 'rate_1.5_percentage',
 'rate_2_percentage',
 'rate_2.5_percentage',
 'rate_3_percentage',
 'rate_3.5_percentage',
 'rate_4_percentage',
 'rate_4.5_percentage',
 'rate_5_percentage']

In [16]:
len(percentage_col)

13

In [17]:
#creating list of cities (should be 1547)
cities_col = [col for col in df if col.startswith('city')]
cities_col

['city_Aberdeen',
 'city_Acampo',
 'city_Acme',
 'city_Acton',
 'city_Adin',
 'city_Agoura Hills',
 'city_Agua Dulce',
 'city_Aguanga',
 'city_Ahwahnee',
 'city_Aiea',
 'city_Airway Heights',
 'city_Alameda',
 'city_Alamo',
 'city_Albany',
 'city_Albion',
 'city_Algona',
 'city_Alhambra',
 'city_Aliso Viejo',
 'city_Allyn',
 'city_Alpine',
 'city_Alsea',
 'city_Alturas',
 'city_Alviso',
 'city_Amanda Park',
 'city_Amboy',
 'city_American Canyon',
 'city_Amity',
 'city_Anacortes',
 'city_Anaheim',
 'city_Anahola',
 'city_Anchor Point',
 'city_Anchorage',
 'city_Anderson',
 'city_Anderson Island',
 'city_Angels Camp',
 'city_Angwin',
 'city_Antelope',
 'city_Antioch',
 'city_Anza',
 'city_Apple Valley',
 'city_Applegate',
 'city_Aptos',
 'city_Arbuckle',
 'city_Arcadia',
 'city_Arcata',
 'city_Arlington',
 'city_Armona',
 'city_Arnold',
 'city_Aromas',
 'city_Arroyo Grande',
 'city_Artesia',
 'city_Arvin',
 'city_Ashford',
 'city_Ashland',
 'city_Astoria',
 'city_Atascadero',
 'city_Athe

In [18]:
len(cities_col)

1547

In [19]:
#miscellaneous columns I want to keep that are VERY important
features  = ['zip_code', 'count', 'latitude', 'longitude', 'income']
len(features)

5

# 4 Main Lists that we want to use for our elastic net model
- categories
- percentage_col
- cities_col
- features

In [20]:
main_features = categories + percentage_col + cities_col + features
len(main_features)

2033

In [21]:
#sanity check 
len(set(main_features))

2033

# Creating dataframe with key numeric variables to prep for Elastic Net

In [22]:
elastic_df = df[main_features]
elastic_df.shape

(2373, 2033)

In [23]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [24]:
#creating X and y
X = elastic_df.drop(labels = 'income', axis = 'columns')
y = elastic_df['income']

In [25]:
X.shape

(2373, 2032)

In [26]:
y.shape

(2373,)

# Elastic Gridsearch Time

In [27]:
pipe = Pipeline([
    ('e', ElasticNet())
])

pipe_params = {
    'e__alpha': [0, 0.05, 0.01],
    'e__l1_ratio': [0.92, 0.1, 0.8, 0.5]
   
}


gs = GridSearchCV(pipe, param_grid=pipe_params, cv=5, n_jobs = 3)


gs.fit(X, y)


print(gs.best_score_)
gs.best_params_



-0.21178597695682908


{'e__alpha': 0.01, 'e__l1_ratio': 0.8}

In [28]:
elastic = ElasticNet(alpha= 0.01, l1_ratio= 0.8)

In [29]:
elastic.fit(X, y)

ElasticNet(alpha=0.01, copy_X=True, fit_intercept=True, l1_ratio=0.8,
           max_iter=1000, normalize=False, positive=False, precompute=False,
           random_state=None, selection='cyclic', tol=0.0001, warm_start=False)

In [30]:
coefficients = list(elastic.coef_)

In [31]:
cols = list(X.columns)

In [32]:
elastic_pd = pd.DataFrame()

In [33]:
elastic_pd['coef'] = coefficients
elastic_pd['column_name'] = cols

In [34]:
elastic_pd.shape

(2032, 2)

In [35]:
elastic_pd.head()

Unnamed: 0,coef,column_name
0,4805.956288,acaibowls
1,-485.943412,accessories
2,1327.828508,active
3,-122.776354,acupuncture
4,-1562.857229,adultedu


In [36]:
elastic_pd.sort_values(by=['coef'])

Unnamed: 0,coef,column_name
468,-16206.583718,price_1_percentage
1669,-13142.693619,city_San Bernardino
1348,-11411.357820,city_Miramonte
1263,-11365.316436,city_Los Angeles
431,-11102.022721,ukrainian
1685,-10751.580316,city_San Luis Obispo
954,-10659.944423,city_Fresno
243,-10479.802176,landmarks
1919,-10035.413986,city_Vernon
1912,-9733.292206,city_Van Nuys


### Conclusion: We will not use ElasticNet to help us pick optimal features

# Use SelectKBest- Bad

In [37]:
from sklearn.feature_selection import SelectKBest
#from sklearn.feature_selection import chi2


selector = SelectKBest(k = 48) #since sqrt 2373 is 48.blahblahblah
selector.fit(X, y)

#X_new = selector.transform(X)
#print(X_new.shape)




#2nd way
X.columns[selector.get_support(indices=True)].tolist()

  f = msb / msw
  f = msb / msw


['city_White Salmon',
 'city_White Swan',
 'city_Whitethorn',
 'city_Wildomar',
 'city_Wilkeson',
 'city_Willits',
 'city_Willow',
 'city_Willow Creek',
 'city_Willows',
 'city_Wilmington',
 'city_Wilsonville',
 'city_Wilton',
 'city_Winchester',
 'city_Windsor',
 'city_Winlock',
 'city_Winston',
 'city_Winterhaven',
 'city_Winters',
 'city_Winton',
 'city_Wofford Heights',
 'city_Wolf Creek',
 'city_Wood Village',
 'city_Woodacre',
 'city_Woodbridge',
 'city_Woodburn',
 'city_Woodinville',
 'city_Woodlake',
 'city_Woodland',
 'city_Woodland Hills',
 'city_Woodside',
 'city_Wrangell',
 'city_Wrightwood',
 'city_Yachats',
 'city_Yakima',
 'city_Yamhill',
 'city_Yelm',
 'city_Yermo',
 'city_Yoncalla',
 'city_Yorba Linda',
 'city_Yosemite National Park',
 'city_Yountville',
 'city_Yreka',
 'city_Yuba City',
 'city_Yucaipa',
 'city_Yucca Valley',
 'city_Zamora',
 'city_Zillah',
 'city_carlsbad']

### Conclusion: We will not use SelectKBest to help us pick optimal features

# 4 Main Lists of Features
- categories
- percentage_col
- cities_col
- features

# Finding categories first

In [38]:
len(categories)

468

In [39]:
'income' in df

True

In [40]:
#find all categories where correlation is higher than 0.15
pos_corr_categories = []
for col in categories:
    corr = df['income'].corr(df[col])
    if corr> 0.15:
        pos_corr_categories.append(col)

In [116]:
pos_corr_categories

['acaibowls',
 'asianfusion',
 'bakeries',
 'breakfast_brunch',
 'bubbletea',
 'cafes',
 'coffee',
 'cupcakes',
 'desserts',
 'french',
 'gluten_free',
 'greek',
 'indpak',
 'italian',
 'japanese',
 'mediterranean',
 'mideastern',
 'newamerican',
 'persian',
 'pizza',
 'salad',
 'sandwiches',
 'seafood',
 'sushi',
 'thai',
 'turkish',
 'vegetarian',
 'vietnamese',
 'wine_bars']

In [42]:
#find all categories where correlation is less than -0.05
neg_corr_categories = []
for col in categories:
    corr = df['income'].corr(df[col])
    if corr< -0.05:
        neg_corr_categories.append(col)

In [43]:
neg_corr_categories

['casinos', 'galleries', 'restaurants', 'salvadoran', 'soulfood']

In [44]:
neg_corr_categories.remove('restaurants')

In [45]:
neg_corr_categories

['casinos', 'galleries', 'salvadoran', 'soulfood']

- `pos_corr_categories`
- `neg_corr_categories`

# Finding Percentage Col Second

In [46]:
#same method for positive percentage
pos_corr_percentage = []
for col in percentage_col:
    corr = df['income'].corr(df[col])
    if corr> 0.1:
        pos_corr_percentage.append(col)

In [47]:
pos_corr_percentage

['price_2_percentage', 'price_3_percentage', 'rate_4_percentage']

In [48]:
#same method for negative percentage
neg_corr_percentage = []
for col in percentage_col:
    corr = df['income'].corr(df[col])
    if corr< -0.1:
        neg_corr_percentage.append(col)

In [49]:
neg_corr_percentage

['price_1_percentage', 'rate_2_percentage']

- `pos_corr_percentage`
- `neg_corr_percentage`

# Finding cities third 

In [50]:
#positive correlation for cities
pos_corr_cities = []
for col in cities_col:
    corr = df['income'].corr(df[col])
    if corr> 0.05:
        pos_corr_cities.append(col)


In [51]:
pos_corr_cities

['city_Alamo',
 'city_Bellevue',
 'city_Beverly Hills',
 'city_Carlsbad',
 'city_Corona Del Mar',
 'city_Cupertino',
 'city_Danville',
 'city_Fremont',
 'city_Irvine',
 'city_Issaquah',
 'city_Kensington',
 'city_La Cañada Flintridge',
 'city_Ladera Ranch',
 'city_Los Altos',
 'city_Los Gatos',
 'city_Malibu',
 'city_Manhattan Beach',
 'city_Menlo Park',
 'city_Mercer Island',
 'city_Moraga',
 'city_Moss Beach',
 'city_Newport Beach',
 'city_Orinda',
 'city_Pacific Palisades',
 'city_Palo Alto',
 'city_Pleasanton',
 'city_Port Costa',
 'city_Portola Valley',
 'city_Redwood City',
 'city_Rolling Hills Estate',
 'city_Ross',
 'city_Sammamish',
 'city_San Francisco',
 'city_San Jose',
 'city_San Marino',
 'city_San Mateo',
 'city_San Ramon',
 'city_Santa Clara',
 'city_Saratoga',
 'city_Seattle',
 'city_Sierra City',
 'city_Sunnyvale',
 'city_Tiburon',
 'city_Trabuco Canyon',
 'city_Villa Park',
 'city_Woodinville',
 'city_Woodside',
 'city_Yorba Linda']

In [52]:
#negative correlation for cities
neg_corr_cities = []
for col in cities_col:
    corr = df['income'].corr(df[col])
    if corr< -0.03:
        neg_corr_cities.append(col)

In [53]:
neg_corr_cities

['city_Avery',
 'city_Bodfish',
 'city_Clearlake',
 'city_Desert Hot Springs',
 'city_Detroit',
 'city_El Monte',
 'city_Fresno',
 'city_Hemet',
 'city_Inglewood',
 'city_Los Angeles',
 'city_Marblemount',
 'city_Miramonte',
 'city_Oroville',
 'city_San Bernardino',
 'city_Sierraville',
 'city_Skamokawa',
 'city_Slab City',
 'city_Spokane',
 'city_Stockton',
 'city_Sumpter',
 'city_Thermal',
 'city_Vernon',
 'city_Wilderville']

- `pos_corr_cities`
- `neg_corr_cities`

### How about count of businesses?

In [54]:
df['income'].corr(df['count'])

0.11476383993636025

In [55]:
misc = ['count']

- `misc`

In [56]:
main_list = pos_corr_categories + pos_corr_cities + pos_corr_percentage + neg_corr_categories + neg_corr_cities + neg_corr_percentage + misc

In [57]:
#sanity
len(main_list)

110

In [58]:
#sanity
len(set(main_list))

110

In [59]:
#list of optimal features we will use 
main_list

['acaibowls',
 'asianfusion',
 'bakeries',
 'breakfast_brunch',
 'bubbletea',
 'cafes',
 'coffee',
 'cupcakes',
 'desserts',
 'french',
 'gluten_free',
 'greek',
 'indpak',
 'italian',
 'japanese',
 'mediterranean',
 'mideastern',
 'newamerican',
 'persian',
 'pizza',
 'salad',
 'sandwiches',
 'seafood',
 'sushi',
 'thai',
 'turkish',
 'vegetarian',
 'vietnamese',
 'wine_bars',
 'city_Alamo',
 'city_Bellevue',
 'city_Beverly Hills',
 'city_Carlsbad',
 'city_Corona Del Mar',
 'city_Cupertino',
 'city_Danville',
 'city_Fremont',
 'city_Irvine',
 'city_Issaquah',
 'city_Kensington',
 'city_La Cañada Flintridge',
 'city_Ladera Ranch',
 'city_Los Altos',
 'city_Los Gatos',
 'city_Malibu',
 'city_Manhattan Beach',
 'city_Menlo Park',
 'city_Mercer Island',
 'city_Moraga',
 'city_Moss Beach',
 'city_Newport Beach',
 'city_Orinda',
 'city_Pacific Palisades',
 'city_Palo Alto',
 'city_Pleasanton',
 'city_Port Costa',
 'city_Portola Valley',
 'city_Redwood City',
 'city_Rolling Hills Estate',
 '

In [60]:
df.head()

Unnamed: 0.1,Unnamed: 0,zip_code,count,latitude,longitude,Price_1,Price_2,Price_3,Price_4,Rating_1,...,city_Yorba Linda,city_Yosemite National Park,city_Yountville,city_Yreka,city_Yuba City,city_Yucaipa,city_Yucca Valley,city_Zamora,city_Zillah,city_carlsbad
0,0,90001,30,33.97653,-118.24923,18,12,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,90002,4,33.948102,-118.248582,3,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,90003,20,33.971906,-118.272539,16,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,90004,29,34.076563,-118.310331,10,19,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,90005,45,34.059083,-118.298835,11,33,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [61]:
main_list.append('income')

In [62]:
len(main_list)

111

In [63]:
main_df = df[main_list]
main_df.shape

(2373, 111)

# Main DF with `income` is called `main_df`
- 111 columns since income is included 

In [64]:
main_df.head()

Unnamed: 0,acaibowls,asianfusion,bakeries,breakfast_brunch,bubbletea,cafes,coffee,cupcakes,desserts,french,...,city_Spokane,city_Stockton,city_Sumpter,city_Thermal,city_Vernon,city_Wilderville,price_1_percentage,rate_2_percentage,count,income
0,0,0,1,1,0,0,0,1,1,0,...,0,0,0,0,0,0,0.6,0.0,30,35660
1,0,0,0,1,0,0,1,0,1,0,...,0,0,0,0,0,0,0.75,0.0,4,34000
2,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0.8,0.0,20,34397
3,0,1,1,1,0,1,1,0,1,0,...,0,0,0,0,0,0,0.344828,0.0,29,46581
4,0,1,1,1,1,1,1,0,1,0,...,0,0,0,0,0,0,0.244444,0.0,45,32461


# Modeling Time!

In [65]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Lasso, LinearRegression, Ridge, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split

In [66]:
X = main_df.drop(labels = 'income', axis = 'columns')

In [67]:
y = main_df['income']

In [68]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

### Here are the following regression models we will use to predict median income

In [69]:
lasso = Lasso()
lr = LinearRegression()
ridge = Ridge()
rf = RandomForestRegressor()
gb = GradientBoostingRegressor()
ab = AdaBoostRegressor()
svr = SVR()
elastic = ElasticNet()

In [70]:
df['income'].mean()
# 66,761 is my baseline

66761.95195954488

In [71]:
print(f' Lasso: {cross_val_score(lasso, X_train, y_train, cv = 5).mean()}')
print(f' LR: {cross_val_score(lr, X_train, y_train, cv = 5).mean()}')
print(f' Ridge: {cross_val_score(ridge, X_train, y_train, cv = 5).mean()}')
print(f' RF: {cross_val_score(rf, X_train, y_train, cv = 5).mean()}')
print(f' GB: {cross_val_score(gb, X_train, y_train, cv = 5).mean()}')


 Lasso: 0.3079124852415288
 LR: 0.3074057825904798
 Ridge: 0.30877121530481666




 RF: 0.1947913496439921
 GB: 0.2803103292535628


In [72]:
print(f' AB: {cross_val_score(ab, X_train, y_train, cv = 5).mean()}')
print(f' SVR: {cross_val_score(svr, X_train, y_train, cv = 5).mean()}')


 AB: -0.11115810967624032




 SVR: -0.049297392508804536


In [73]:

print(f' Elastic: {cross_val_score(elastic, X_train, y_train, cv = 5).mean()}')



 Bernoulli: 0.06652302035990097




 Gaussian: 0.05598510882016036




 Multi: 0.011182241436647823
 Elastic: 0.17533698263930447


# Trying PCA

In [74]:
from sklearn.decomposition import PCA

In [75]:
pca = PCA()
pca.fit(X_train)
pca_xtrain = pca.transform(X_train)

In [76]:
print(f' Lasso: {cross_val_score(lasso, pca_xtrain, y_train, cv = 5).mean()}')
print(f' LR: {cross_val_score(lr, pca_xtrain, y_train, cv = 5).mean()}')
print(f' Ridge: {cross_val_score(ridge, pca_xtrain, y_train, cv = 5).mean()}')
print(f' RF: {cross_val_score(rf, pca_xtrain, y_train, cv = 5).mean()}')
print(f' GB: {cross_val_score(gb, pca_xtrain, y_train, cv = 5).mean()}')

 Lasso: 0.32490900671777306
 LR: 0.3074057825904817
 Ridge: 0.30877121530481694




 RF: 0.18870787771517833
 GB: 0.32457709382144595


In [77]:
print(f' AB: {cross_val_score(ab, pca_xtrain, y_train, cv = 5).mean()}')
print(f' SVR: {cross_val_score(svr, pca_xtrain, y_train, cv = 5).mean()}')

 AB: 0.141666322838698




 SVR: -0.049297392508804536


In [78]:
print(f' Bernoulli: {cross_val_score(bernoulli, pca_xtrain, y_train, cv = 3).mean()}')
print(f' Gaussian: {cross_val_score(gaussian, pca_xtrain, y_train, cv = 3).mean()}')
print(f' Elastic: {cross_val_score(elastic, pca_xtrain, y_train, cv = 5).mean()}')



 Bernoulli: 0.0006443298969072165




 Gaussian: 0.05598510882016036
 Elastic: 0.1753585867969995


# Conclusion

By executing principal component analysis, we will gridsearch the following models with PCA:
- Lasso (cross val: .3249)
- Linear Regression (cross val: .3074)
- Ridge Regression (cross val: .3087)
- Random Forest (cross val: .2195)
- Gradient Boosting (cross val: .3295)

Our test metric will be **RMSE**.

In [79]:
from sklearn.metrics import mean_squared_error

In [80]:
pca_xtest = pca.transform(X_test)

# GridSearch Time

### Lasso (gs)

In [80]:
pipe = Pipeline([
    ('l', Lasso())
])

pipe_params = {
    'l__alpha': [0, 0.05, 0.01, 1],
    'l__max_iter': [1000, 2000, 5000]
   
}


gs = GridSearchCV(pipe, param_grid=pipe_params, cv=5, n_jobs = 3)


gs.fit(pca_xtrain, y_train)


print(gs.best_score_)
gs.best_params_

0.32710053631165337


{'l__alpha': 0.05, 'l__max_iter': 2000}

In [82]:
gs.score(pca_xtrain, y_train)

0.48790502981554396

In [83]:
gs.score(pca_xtest, y_test)

0.32535259831839036

In [84]:
lasso_pred = gs.predict(pca_xtest)

In [86]:
print(f' RMSE: {mean_squared_error(y_test, lasso_pred)**0.5}')

 RMSE: 24043.3317456937


### Linear Regression (gs2)

In [89]:
pipe = Pipeline([
    ('lr', LinearRegression())
])

pipe_params = {
    'lr__fit_intercept': [True, False],
    'lr__normalize': [True, False],
    'lr__copy_X': [True, False]
  
   
}


gs2 = GridSearchCV(pipe, param_grid=pipe_params, cv=5, n_jobs = 3)


gs2.fit(pca_xtrain, y_train)


print(gs2.best_score_)
gs2.best_params_

0.3073640187357756




{'lr__copy_X': True, 'lr__fit_intercept': True, 'lr__normalize': False}

In [90]:
gs2.score(pca_xtrain, y_train)

0.4879052363678264

In [91]:
gs2.score(pca_xtest, y_test)

0.32533101646386176

In [92]:
lr_pred = gs2.predict(pca_xtest)

In [93]:
print(f' RMSE: {mean_squared_error(y_test, lr_pred)**0.5}')

 RMSE: 24043.716313645058


### Ridge Regression (gs3)

In [94]:
pipe = Pipeline([
    ('r', Ridge())
])

pipe_params = {
    'r__alpha': [0.01, 0.1, 0.5, 1],
    'r__fit_intercept': [True, False],
    'r__normalize': [True, False],
    'r__copy_X': [True, False],
    'r__solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
  
  
}


gs3 = GridSearchCV(pipe, param_grid=pipe_params, cv=5, n_jobs = 3)


gs3.fit(pca_xtrain, y_train)


print(gs3.best_score_)
gs3.best_params_

0.3100156339681715




{'r__alpha': 0.5,
 'r__copy_X': True,
 'r__fit_intercept': True,
 'r__normalize': False,
 'r__solver': 'svd'}

In [95]:
gs3.score(pca_xtrain, y_train)

0.47319948505533793

In [96]:
gs3.score(pca_xtest, y_test)

0.3241692096069978

In [97]:
ridge_pred = gs3.predict(pca_xtest)

In [98]:
print(f' RMSE: {mean_squared_error(y_test, ridge_pred)**0.5}')

 RMSE: 24064.40952733273


### Random Forest (gs4)

In [99]:
pipe = Pipeline([
    ('rf', RandomForestRegressor())
])

pipe_params = {
    'rf__max_depth': [None, 10, 30, 100],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 5, 9],
    'rf__max_features': ['auto', 'sqrt', 'log2'],
    'rf__max_leaf_nodes': [None, 2],
    'rf__warm_start': [True, False]
    

  
}


gs4 = GridSearchCV(pipe, param_grid=pipe_params, cv=5, n_jobs = 3)


gs4.fit(pca_xtrain, y_train)


print(gs4.best_score_)
gs4.best_params_



0.2685183554572861


{'rf__max_depth': 10,
 'rf__max_features': 'auto',
 'rf__max_leaf_nodes': None,
 'rf__min_samples_leaf': 1,
 'rf__min_samples_split': 5,
 'rf__warm_start': True}

In [100]:
gs4.score(pca_xtrain, y_train)

0.6793401390012577

In [101]:
gs4.score(pca_xtest, y_test)

0.21433313038686996

In [102]:
rf_pred = gs4.predict(pca_xtest)

In [103]:
print(f' RMSE: {mean_squared_error(y_test, rf_pred)**0.5}')

 RMSE: 25946.30044863297


### Gradient Boost (gs5)

In [81]:
pipe = Pipeline([
    ('gb', GradientBoostingRegressor())
])

pipe_params = {
    'gb__learning_rate': [0.1, 0.05, 0.3],
    'gb__n_estimators': [100, 200, 500],
    #'gb__min_samples_split': [2, 3, 5],
    #'gb__min_samples_leaf': [1, 2, 5],
    #'gb__max_leaf_nodes': [None, 2],
    'gb__max_depth': [3, 2],
    'gb__max_features': [None, 2, 5],
    #'gb__alpha': [0.9, 0.5, 0.1]
  
}


gs5 = GridSearchCV(pipe, param_grid=pipe_params, cv=5, n_jobs = 3)


gs5.fit(pca_xtrain, y_train)


print(gs5.best_score_)
gs5.best_params_

0.32709233496702295


{'gb__learning_rate': 0.1,
 'gb__max_depth': 3,
 'gb__max_features': None,
 'gb__n_estimators': 200}

In [82]:
gs5.score(pca_xtrain, y_train)

0.815750062811482

In [83]:
gs5.score(pca_xtest, y_test)

0.3147679612082943

In [84]:
gb_pred = gs5.predict(pca_xtest)

In [85]:
print(f' RMSE: {mean_squared_error(y_test, gb_pred)**0.5}')

 RMSE: 24231.207303454954


# Conclusion 
- Given the time we have, our ideal model is gradient boosting

Let's first transform our demo testing csv!

In [92]:
demo = pd.read_csv('./datasets/MARIOdemoMODEL.csv')
demo.head()

Unnamed: 0.1,Unnamed: 0,zip_code,acaibowls,asianfusion,bakeries,breakfast_brunch,bubbletea,cafes,coffee,cupcakes,...,city_Slab City,city_Spokane,city_Stockton,city_Sumpter,city_Thermal,city_Vernon,city_Wilderville,price_1_percentage,rate_2_percentage,count
0,0,90071,9.405622,-0.836703,0.021364,-0.969478,0.24738,0.737829,1.140817,-0.152386,...,-1.828033e-17,-1.465002e-17,1.569528e-17,-4.791515e-18,4.973684e-18,3.515378e-31,-1.38921e-32,-7.601189000000001e-33,-2.366531e-16,4.09663e-16
1,1,91980,-27.695649,0.724556,0.335565,-0.112465,0.159981,0.005022,-0.157311,-0.017526,...,5.555756e-17,3.9807330000000006e-17,4.630032e-20,4.941619e-18,3.9967330000000005e-17,3.71372e-31,3.7035890000000004e-33,-1.257503e-33,5.208108e-16,-2.823416e-16
2,2,92055,-18.678026,0.433605,-0.448387,0.093646,0.473326,0.054742,-0.123058,-0.299356,...,2.0170890000000003e-17,-2.798521e-17,-1.172232e-17,-1.4012410000000002e-17,3.281239e-18,5.727834000000001e-31,2.8112420000000002e-33,5.6431150000000005e-33,-8.677741000000001e-17,7.063863e-17
3,3,92060,-27.6989,0.989627,0.725841,0.02304,-0.060072,-0.04532,-0.008747,0.211025,...,-1.1404510000000001e-17,-1.056607e-17,-7.913139000000001e-18,3.36414e-18,-9.426829999999998e-19,-5.192086e-31,8.803512e-33,-5.211948e-33,-2.239996e-16,8.003911e-17
4,4,92135,-25.699205,0.877975,0.250501,0.170778,0.19929,0.058945,-0.170717,0.264019,...,1.0602750000000002e-17,-2.431599e-17,-1.5523500000000002e-17,3.5961e-18,8.358603e-18,1.31404e-30,1.054766e-33,4.228426e-33,-6.781590000000001e-17,1.126875e-16


In [93]:
demo.drop(labels = ['Unnamed: 0'], axis = 'columns', inplace = True)
demo.head()

Unnamed: 0,zip_code,acaibowls,asianfusion,bakeries,breakfast_brunch,bubbletea,cafes,coffee,cupcakes,desserts,...,city_Slab City,city_Spokane,city_Stockton,city_Sumpter,city_Thermal,city_Vernon,city_Wilderville,price_1_percentage,rate_2_percentage,count
0,90071,9.405622,-0.836703,0.021364,-0.969478,0.24738,0.737829,1.140817,-0.152386,0.072772,...,-1.828033e-17,-1.465002e-17,1.569528e-17,-4.791515e-18,4.973684e-18,3.515378e-31,-1.38921e-32,-7.601189000000001e-33,-2.366531e-16,4.09663e-16
1,91980,-27.695649,0.724556,0.335565,-0.112465,0.159981,0.005022,-0.157311,-0.017526,0.268266,...,5.555756e-17,3.9807330000000006e-17,4.630032e-20,4.941619e-18,3.9967330000000005e-17,3.71372e-31,3.7035890000000004e-33,-1.257503e-33,5.208108e-16,-2.823416e-16
2,92055,-18.678026,0.433605,-0.448387,0.093646,0.473326,0.054742,-0.123058,-0.299356,-0.292397,...,2.0170890000000003e-17,-2.798521e-17,-1.172232e-17,-1.4012410000000002e-17,3.281239e-18,5.727834000000001e-31,2.8112420000000002e-33,5.6431150000000005e-33,-8.677741000000001e-17,7.063863e-17
3,92060,-27.6989,0.989627,0.725841,0.02304,-0.060072,-0.04532,-0.008747,0.211025,0.032911,...,-1.1404510000000001e-17,-1.056607e-17,-7.913139000000001e-18,3.36414e-18,-9.426829999999998e-19,-5.192086e-31,8.803512e-33,-5.211948e-33,-2.239996e-16,8.003911e-17
4,92135,-25.699205,0.877975,0.250501,0.170778,0.19929,0.058945,-0.170717,0.264019,0.142209,...,1.0602750000000002e-17,-2.431599e-17,-1.5523500000000002e-17,3.5961e-18,8.358603e-18,1.31404e-30,1.054766e-33,4.228426e-33,-6.781590000000001e-17,1.126875e-16


In [94]:
demo.shape

(50, 111)

In [95]:
zips_demo = list(demo['zip_code'])

In [96]:
features = list(demo.columns)


In [97]:
features.remove('zip_code')

In [98]:
len(features)

110

In [131]:
demo_feat = demo[features]

In [132]:
demo_feat.shape

(50, 110)

In [133]:
demo_feat_pca = pca.transform(demo_feat)

In [137]:
demo_feat_pca_df = pd.DataFrame(demo_feat_pca, columns = features)

In [138]:
demo_feat_pca_df.head()

Unnamed: 0,acaibowls,asianfusion,bakeries,breakfast_brunch,bubbletea,cafes,coffee,cupcakes,desserts,french,...,city_Slab City,city_Spokane,city_Stockton,city_Sumpter,city_Thermal,city_Vernon,city_Wilderville,price_1_percentage,rate_2_percentage,count
0,9.405622,-0.836703,0.021364,-0.969478,0.24738,0.737829,1.140817,-0.152386,0.072772,-0.192185,...,-1.828033e-17,-1.465002e-17,1.569528e-17,-4.791515e-18,4.973684e-18,3.515378e-31,-1.38921e-32,-7.601189000000001e-33,-2.366531e-16,4.09663e-16
1,-27.695649,0.724556,0.335565,-0.112465,0.159981,0.005022,-0.157311,-0.017526,0.268266,0.089386,...,5.555756e-17,3.9807330000000006e-17,4.630032e-20,4.941619e-18,3.9967330000000005e-17,3.71372e-31,3.7035890000000004e-33,-1.257503e-33,5.208108e-16,-2.823416e-16
2,-18.678026,0.433605,-0.448387,0.093646,0.473326,0.054742,-0.123058,-0.299356,-0.292397,0.263258,...,2.0170890000000003e-17,-2.798521e-17,-1.172232e-17,-1.4012410000000002e-17,3.281239e-18,5.727834000000001e-31,2.8112420000000002e-33,5.6431150000000005e-33,-8.677741000000001e-17,7.063863e-17
3,-27.6989,0.989627,0.725841,0.02304,-0.060072,-0.04532,-0.008747,0.211025,0.032911,-0.067531,...,-1.1404510000000001e-17,-1.056607e-17,-7.913139000000001e-18,3.36414e-18,-9.426829999999998e-19,-5.192086e-31,8.803512e-33,-5.211948e-33,-2.239996e-16,8.003911e-17
4,-25.699205,0.877975,0.250501,0.170778,0.19929,0.058945,-0.170717,0.264019,0.142209,-0.24172,...,1.0602750000000002e-17,-2.431599e-17,-1.5523500000000002e-17,3.5961e-18,8.358603e-18,1.31404e-30,1.054766e-33,4.228426e-33,-6.781590000000001e-17,1.126875e-16


In [141]:
idx = 0
demo_feat_pca_df.insert(loc=idx, column='zip_code', value=zips_demo)

In [142]:
demo_feat_pca_df.head()

Unnamed: 0,zip_code,acaibowls,asianfusion,bakeries,breakfast_brunch,bubbletea,cafes,coffee,cupcakes,desserts,...,city_Slab City,city_Spokane,city_Stockton,city_Sumpter,city_Thermal,city_Vernon,city_Wilderville,price_1_percentage,rate_2_percentage,count
0,90071,9.405622,-0.836703,0.021364,-0.969478,0.24738,0.737829,1.140817,-0.152386,0.072772,...,-1.828033e-17,-1.465002e-17,1.569528e-17,-4.791515e-18,4.973684e-18,3.515378e-31,-1.38921e-32,-7.601189000000001e-33,-2.366531e-16,4.09663e-16
1,91980,-27.695649,0.724556,0.335565,-0.112465,0.159981,0.005022,-0.157311,-0.017526,0.268266,...,5.555756e-17,3.9807330000000006e-17,4.630032e-20,4.941619e-18,3.9967330000000005e-17,3.71372e-31,3.7035890000000004e-33,-1.257503e-33,5.208108e-16,-2.823416e-16
2,92055,-18.678026,0.433605,-0.448387,0.093646,0.473326,0.054742,-0.123058,-0.299356,-0.292397,...,2.0170890000000003e-17,-2.798521e-17,-1.172232e-17,-1.4012410000000002e-17,3.281239e-18,5.727834000000001e-31,2.8112420000000002e-33,5.6431150000000005e-33,-8.677741000000001e-17,7.063863e-17
3,92060,-27.6989,0.989627,0.725841,0.02304,-0.060072,-0.04532,-0.008747,0.211025,0.032911,...,-1.1404510000000001e-17,-1.056607e-17,-7.913139000000001e-18,3.36414e-18,-9.426829999999998e-19,-5.192086e-31,8.803512e-33,-5.211948e-33,-2.239996e-16,8.003911e-17
4,92135,-25.699205,0.877975,0.250501,0.170778,0.19929,0.058945,-0.170717,0.264019,0.142209,...,1.0602750000000002e-17,-2.431599e-17,-1.5523500000000002e-17,3.5961e-18,8.358603e-18,1.31404e-30,1.054766e-33,4.228426e-33,-6.781590000000001e-17,1.126875e-16


In [143]:
demo_feat_pca_df.shape

(50, 111)

In [148]:
demo_feat_pca_df.to_csv('./datasets/MARIO_MODEL_PCA.csv')

In [149]:
# df = pd.read_csv('./datasets/MARIOdemo.csv')
# df.head()

In [150]:
demo_feat_pca_df.head()

Unnamed: 0,zip_code,acaibowls,asianfusion,bakeries,breakfast_brunch,bubbletea,cafes,coffee,cupcakes,desserts,...,city_Slab City,city_Spokane,city_Stockton,city_Sumpter,city_Thermal,city_Vernon,city_Wilderville,price_1_percentage,rate_2_percentage,count
0,90071,9.405622,-0.836703,0.021364,-0.969478,0.24738,0.737829,1.140817,-0.152386,0.072772,...,-1.828033e-17,-1.465002e-17,1.569528e-17,-4.791515e-18,4.973684e-18,3.515378e-31,-1.38921e-32,-7.601189000000001e-33,-2.366531e-16,4.09663e-16
1,91980,-27.695649,0.724556,0.335565,-0.112465,0.159981,0.005022,-0.157311,-0.017526,0.268266,...,5.555756e-17,3.9807330000000006e-17,4.630032e-20,4.941619e-18,3.9967330000000005e-17,3.71372e-31,3.7035890000000004e-33,-1.257503e-33,5.208108e-16,-2.823416e-16
2,92055,-18.678026,0.433605,-0.448387,0.093646,0.473326,0.054742,-0.123058,-0.299356,-0.292397,...,2.0170890000000003e-17,-2.798521e-17,-1.172232e-17,-1.4012410000000002e-17,3.281239e-18,5.727834000000001e-31,2.8112420000000002e-33,5.6431150000000005e-33,-8.677741000000001e-17,7.063863e-17
3,92060,-27.6989,0.989627,0.725841,0.02304,-0.060072,-0.04532,-0.008747,0.211025,0.032911,...,-1.1404510000000001e-17,-1.056607e-17,-7.913139000000001e-18,3.36414e-18,-9.426829999999998e-19,-5.192086e-31,8.803512e-33,-5.211948e-33,-2.239996e-16,8.003911e-17
4,92135,-25.699205,0.877975,0.250501,0.170778,0.19929,0.058945,-0.170717,0.264019,0.142209,...,1.0602750000000002e-17,-2.431599e-17,-1.5523500000000002e-17,3.5961e-18,8.358603e-18,1.31404e-30,1.054766e-33,4.228426e-33,-6.781590000000001e-17,1.126875e-16


In [151]:
demo_feat_pca_df.shape

(50, 111)

# Compressing Model Into Pickle for Flask

In [86]:
#importing prediction model to pickle 
import pickle

In [87]:
pickle.format_version

'4.0'

In [88]:
import sklearn

In [89]:
sklearn.__version__

'0.21.2'

In [90]:
pickle.dump(gs5, open("model_gb_mother.pkl", "wb"))

---

# Demo Testing
- Note: The following codes are not commented since we are predicting our demo datasets to make sure our Flask app is predicting income properly

In [94]:
demo.head()

Unnamed: 0,zip_code,acaibowls,asianfusion,bakeries,breakfast_brunch,bubbletea,cafes,coffee,cupcakes,desserts,...,city_Slab City,city_Spokane,city_Stockton,city_Sumpter,city_Thermal,city_Vernon,city_Wilderville,price_1_percentage,rate_2_percentage,count
0,90071,9.405622,-0.836703,0.021364,-0.969478,0.24738,0.737829,1.140817,-0.152386,0.072772,...,-1.828033e-17,-1.465002e-17,1.569528e-17,-4.791515e-18,4.973684e-18,3.515378e-31,-1.38921e-32,-7.601189000000001e-33,-2.366531e-16,4.09663e-16
1,91980,-27.695649,0.724556,0.335565,-0.112465,0.159981,0.005022,-0.157311,-0.017526,0.268266,...,5.555756e-17,3.9807330000000006e-17,4.630032e-20,4.941619e-18,3.9967330000000005e-17,3.71372e-31,3.7035890000000004e-33,-1.257503e-33,5.208108e-16,-2.823416e-16
2,92055,-18.678026,0.433605,-0.448387,0.093646,0.473326,0.054742,-0.123058,-0.299356,-0.292397,...,2.0170890000000003e-17,-2.798521e-17,-1.172232e-17,-1.4012410000000002e-17,3.281239e-18,5.727834000000001e-31,2.8112420000000002e-33,5.6431150000000005e-33,-8.677741000000001e-17,7.063863e-17
3,92060,-27.6989,0.989627,0.725841,0.02304,-0.060072,-0.04532,-0.008747,0.211025,0.032911,...,-1.1404510000000001e-17,-1.056607e-17,-7.913139000000001e-18,3.36414e-18,-9.426829999999998e-19,-5.192086e-31,8.803512e-33,-5.211948e-33,-2.239996e-16,8.003911e-17
4,92135,-25.699205,0.877975,0.250501,0.170778,0.19929,0.058945,-0.170717,0.264019,0.142209,...,1.0602750000000002e-17,-2.431599e-17,-1.5523500000000002e-17,3.5961e-18,8.358603e-18,1.31404e-30,1.054766e-33,4.228426e-33,-6.781590000000001e-17,1.126875e-16


In [95]:
list_feat = list(demo.columns)
list_feat.remove('zip_code')

In [96]:
len(list_feat)

110

In [97]:
demo_feat = demo[list_feat]

In [98]:
demo_feat.head()

Unnamed: 0,acaibowls,asianfusion,bakeries,breakfast_brunch,bubbletea,cafes,coffee,cupcakes,desserts,french,...,city_Slab City,city_Spokane,city_Stockton,city_Sumpter,city_Thermal,city_Vernon,city_Wilderville,price_1_percentage,rate_2_percentage,count
0,9.405622,-0.836703,0.021364,-0.969478,0.24738,0.737829,1.140817,-0.152386,0.072772,-0.192185,...,-1.828033e-17,-1.465002e-17,1.569528e-17,-4.791515e-18,4.973684e-18,3.515378e-31,-1.38921e-32,-7.601189000000001e-33,-2.366531e-16,4.09663e-16
1,-27.695649,0.724556,0.335565,-0.112465,0.159981,0.005022,-0.157311,-0.017526,0.268266,0.089386,...,5.555756e-17,3.9807330000000006e-17,4.630032e-20,4.941619e-18,3.9967330000000005e-17,3.71372e-31,3.7035890000000004e-33,-1.257503e-33,5.208108e-16,-2.823416e-16
2,-18.678026,0.433605,-0.448387,0.093646,0.473326,0.054742,-0.123058,-0.299356,-0.292397,0.263258,...,2.0170890000000003e-17,-2.798521e-17,-1.172232e-17,-1.4012410000000002e-17,3.281239e-18,5.727834000000001e-31,2.8112420000000002e-33,5.6431150000000005e-33,-8.677741000000001e-17,7.063863e-17
3,-27.6989,0.989627,0.725841,0.02304,-0.060072,-0.04532,-0.008747,0.211025,0.032911,-0.067531,...,-1.1404510000000001e-17,-1.056607e-17,-7.913139000000001e-18,3.36414e-18,-9.426829999999998e-19,-5.192086e-31,8.803512e-33,-5.211948e-33,-2.239996e-16,8.003911e-17
4,-25.699205,0.877975,0.250501,0.170778,0.19929,0.058945,-0.170717,0.264019,0.142209,-0.24172,...,1.0602750000000002e-17,-2.431599e-17,-1.5523500000000002e-17,3.5961e-18,8.358603e-18,1.31404e-30,1.054766e-33,4.228426e-33,-6.781590000000001e-17,1.126875e-16


In [99]:
import numpy as np

In [100]:
array = np.array(demo[demo['zip_code'] == 90071].drop(columns ='zip_code'))

In [101]:
array

array([[ 9.40562240e+00, -8.36703468e-01,  2.13643894e-02,
        -9.69478172e-01,  2.47380497e-01,  7.37828982e-01,
         1.14081733e+00, -1.52385712e-01,  7.27718717e-02,
        -1.92185462e-01,  8.15720898e-01,  4.09722324e-02,
        -1.27796487e-01,  3.69545102e-01, -4.89125943e-01,
        -4.99931973e-01,  4.12616359e-01,  6.99446434e-02,
         3.58433581e-01,  3.92340520e-01,  9.98630758e-02,
         1.32650969e-01, -1.98122454e-01, -3.31552303e-01,
        -4.47515356e-01, -3.04447371e-01, -5.75273554e-01,
        -1.63909962e-01,  6.92422923e-02, -2.38906367e-01,
        -5.08165973e-01,  5.96108091e-01,  4.92681338e-01,
         6.84327414e-01,  1.40888089e-01,  2.47337718e-02,
         4.30287352e-02, -1.16284284e-02, -6.74541661e-03,
         1.20234523e-01,  7.74187242e-02, -8.53350381e-03,
         6.38137406e-03, -1.49005981e-02, -3.28007591e-02,
         4.39618155e-03, -8.60705161e-03,  1.47835380e-02,
         8.94605987e-03, -5.92470329e-04, -1.08343917e-0

In [102]:
array_pred = gs5.predict(array)

In [103]:
array_pred[0]

116833.61618905456

In [104]:
demo_pred = gs5.predict(demo_feat)

In [105]:
demo_pred

array([116833.61618905,  59505.69331012,  61888.89231624,  47812.80603393,
        48393.38419338,  47696.23345529,  51255.11533618,  61216.20479305,
        78693.51484965,  56651.64652188,  60139.03994093, 100064.2093001 ,
        62381.54313579,  66861.63821374,  70642.16438074,  71986.63045006,
        47445.55804757,  54832.50825617,  56747.1143576 ,  70400.475586  ,
        55447.04785494,  59094.170481  ,  52314.58316809,  58967.76329166,
        55504.61438394,  59678.73556664,  47369.88195339,  56469.86490976,
        56651.64652188,  51536.91754433,  54781.90695333,  58109.28921128,
        53793.41870002,  58084.89767344,  62500.22096668,  51231.45925496,
        58559.07068896,  53343.61503634,  62271.17418979,  55836.15046432,
        64212.76673122,  88437.69187836,  48434.5602812 ,  54832.50825617,
        53536.41824756,  52799.47242077,  52314.58316809,  47126.38394659,
        50184.46520465,  49851.27407229])

In [106]:
pred = pd.DataFrame(demo_pred, columns = ['prediction'])

In [107]:
demo.head()

Unnamed: 0,zip_code,acaibowls,asianfusion,bakeries,breakfast_brunch,bubbletea,cafes,coffee,cupcakes,desserts,...,city_Slab City,city_Spokane,city_Stockton,city_Sumpter,city_Thermal,city_Vernon,city_Wilderville,price_1_percentage,rate_2_percentage,count
0,90071,9.405622,-0.836703,0.021364,-0.969478,0.24738,0.737829,1.140817,-0.152386,0.072772,...,-1.828033e-17,-1.465002e-17,1.569528e-17,-4.791515e-18,4.973684e-18,3.515378e-31,-1.38921e-32,-7.601189000000001e-33,-2.366531e-16,4.09663e-16
1,91980,-27.695649,0.724556,0.335565,-0.112465,0.159981,0.005022,-0.157311,-0.017526,0.268266,...,5.555756e-17,3.9807330000000006e-17,4.630032e-20,4.941619e-18,3.9967330000000005e-17,3.71372e-31,3.7035890000000004e-33,-1.257503e-33,5.208108e-16,-2.823416e-16
2,92055,-18.678026,0.433605,-0.448387,0.093646,0.473326,0.054742,-0.123058,-0.299356,-0.292397,...,2.0170890000000003e-17,-2.798521e-17,-1.172232e-17,-1.4012410000000002e-17,3.281239e-18,5.727834000000001e-31,2.8112420000000002e-33,5.6431150000000005e-33,-8.677741000000001e-17,7.063863e-17
3,92060,-27.6989,0.989627,0.725841,0.02304,-0.060072,-0.04532,-0.008747,0.211025,0.032911,...,-1.1404510000000001e-17,-1.056607e-17,-7.913139000000001e-18,3.36414e-18,-9.426829999999998e-19,-5.192086e-31,8.803512e-33,-5.211948e-33,-2.239996e-16,8.003911e-17
4,92135,-25.699205,0.877975,0.250501,0.170778,0.19929,0.058945,-0.170717,0.264019,0.142209,...,1.0602750000000002e-17,-2.431599e-17,-1.5523500000000002e-17,3.5961e-18,8.358603e-18,1.31404e-30,1.054766e-33,4.228426e-33,-6.781590000000001e-17,1.126875e-16


In [109]:
list_ofzips = list(demo['zip_code'])

In [110]:
pred.head()

Unnamed: 0,prediction
0,116833.616189
1,59505.69331
2,61888.892316
3,47812.806034
4,48393.384193


In [111]:
pred['zip_code'] = list_ofzips

In [112]:
pred.head()

Unnamed: 0,prediction,zip_code
0,116833.616189,90071
1,59505.69331,91980
2,61888.892316,92055
3,47812.806034,92060
4,48393.384193,92135


In [113]:
pred.isnull().sum().sum()

0

In [114]:
pred.shape

(50, 2)

In [115]:
pred.to_csv('./datasets/DEMO_PREDICTIONS_8.1.csv')