# Appending all columns together
- After manually finding median income, there are some zip codes where median income is 0
- We will drop these zip codes since it will create a bias with our features

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('./datasets/MOTHEROFMOTHERS.csv')
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,zip_code,count,latitude,longitude,Price_1,Price_2,Price_3,...,wine_bars,wineries,winetasteclasses,winetastingroom,winetours,womenscloth,wraps,yelpevents,yoga,zoos
0,0,1,1,90001,30,33.97653,-118.24923,18,12,0,...,0,0,0,0,0,0,0,0,0,0
1,1,2,2,90002,4,33.948102,-118.248582,3,1,0,...,0,0,0,0,0,0,0,0,0,0
2,2,3,3,90003,20,33.971906,-118.272539,16,4,0,...,0,0,0,0,0,0,0,0,0,0
3,3,4,4,90004,29,34.076563,-118.310331,10,19,0,...,0,0,0,0,0,0,0,0,0,0
4,4,5,5,90005,45,34.059083,-118.298835,11,33,1,...,0,0,0,0,0,0,0,0,0,0


In [4]:
df.shape

(2423, 501)

### This is the csv where we manually collected median income

In [10]:
manual_zips = pd.read_csv('./datasets/erin_zips_excel.csv', header = None)
manual_zips.head()

Unnamed: 0,0,1
0,90049,121671
1,90620,84942
2,91202,66657
3,91740,76825
4,91917,78929


In [11]:
manual_zips.shape

(124, 2)

In [12]:
manual_zips.columns = ['zip_code', 'income']

In [13]:
manual_zips.head()

Unnamed: 0,zip_code,income
0,90049,121671
1,90620,84942
2,91202,66657
3,91740,76825
4,91917,78929


### Bringing in the good zip codes that had the median income from the first phase of scraping

In [14]:
scrape_zips = pd.read_csv('./datasets/GOODZIPS2299.CSV')
scrape_zips.head()

Unnamed: 0.1,Unnamed: 0,index,zip,income
0,0,0,90001,35660
1,1,1,90002,34000
2,2,2,90003,34397
3,3,3,90004,46581
4,4,4,90005,32461


In [15]:
#removing unnecessary columns
scrape_zips.drop(labels = ['Unnamed: 0', 'index'], axis= 'columns', inplace= True)

In [17]:
scrape_zips.shape

(2299, 2)

In [19]:
scrape_zips.columns = ['zip_code', 'income']

In [20]:
scrape_zips.head()

Unnamed: 0,zip_code,income
0,90001,35660
1,90002,34000
2,90003,34397
3,90004,46581
4,90005,32461


In [18]:
manual_zips.head()

Unnamed: 0,zip_code,income
0,90049,121671
1,90620,84942
2,91202,66657
3,91740,76825
4,91917,78929


In [21]:
#sanity
manual_zips.dtypes

zip_code    int64
income      int64
dtype: object

In [22]:
#sanity
scrape_zips.dtypes

zip_code    int64
income      int64
dtype: object

In [23]:
#joining dataframes 
frames = [manual_zips, scrape_zips]
all_zips_income = pd.concat(frames)

In [24]:
all_zips_income.head()

Unnamed: 0,zip_code,income
0,90049,121671
1,90620,84942
2,91202,66657
3,91740,76825
4,91917,78929


In [25]:
all_zips_income.shape

(2423, 2)

In [26]:
all_zips_income.reset_index(inplace= True)

In [27]:
all_zips_income.shape

(2423, 3)

In [31]:
len(all_zips_income[all_zips_income['income'] ==0])

#there are 50 zip codes where median is 0
#includes where both median and averages are 0
#or median is 0 and average is not 0 

50

# Zip Codes with Target Variable as 0
- We're going to drop them 
- Use these parameters as our demo dataset to predict income

In [104]:
#demo_zips is a df where all zip codes have median income of 0
demo_zips = all_zips_income[all_zips_income['income'] ==0]


In [105]:
demo_zips.head()

Unnamed: 0,index,zip_code,income
32,32,95430,0
65,65,98821,0
70,70,90071,0
72,72,91980,0
73,73,92055,0


In [106]:
demo_zips.shape

(50, 3)

In [107]:
#saving the demo zip codes 
demo_zips.to_csv('./datasets/demozipslist.csv')

In [42]:
df.dtypes

Unnamed: 0                 int64
Unnamed: 0.1               int64
Unnamed: 0.1.1             int64
zip_code                   int64
count                      int64
latitude                 float64
longitude                float64
Price_1                    int64
Price_2                    int64
Price_3                    int64
Price_4                    int64
Rating_1                   int64
Rating_1.5                 int64
Rating_2                   int64
Rating_2.5                 int64
Rating_3                   int64
Rating_3.5                 int64
Rating_4                   int64
Rating_4.5                 int64
Rating_5                   int64
count_businesses           int64
city                      object
Unnamed: 0.2               int64
index                      int64
zip_code.1                 int64
category1                 object
category2                 object
category3                 object
category4                 object
category5                 object
          

In [43]:
all_zips_income.dtypes

index       int64
zip_code    int64
income      int64
dtype: object

In [59]:
#merging all df where target variable is valid 
main_df = pd.merge(df, all_zips_income, left_on='zip_code', right_on='zip_code', how='left').drop(['index_y', 'index_x'], axis=1)

In [60]:
#sanity
main_df.isnull().sum().sum()

0

In [61]:
#sanity
main_df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,zip_code,count,latitude,longitude,Price_1,Price_2,Price_3,...,wineries,winetasteclasses,winetastingroom,winetours,womenscloth,wraps,yelpevents,yoga,zoos,income
0,0,1,1,90001,30,33.97653,-118.24923,18,12,0,...,0,0,0,0,0,0,0,0,0,35660
1,1,2,2,90002,4,33.948102,-118.248582,3,1,0,...,0,0,0,0,0,0,0,0,0,34000
2,2,3,3,90003,20,33.971906,-118.272539,16,4,0,...,0,0,0,0,0,0,0,0,0,34397
3,3,4,4,90004,29,34.076563,-118.310331,10,19,0,...,0,0,0,0,0,0,0,0,0,46581
4,4,5,5,90005,45,34.059083,-118.298835,11,33,1,...,0,0,0,0,0,0,0,0,0,32461


# Final:
- Our full dataset has 2423 rows

In [62]:
main_df.shape

(2423, 501)

In [63]:
main_df.drop(labels = ['Unnamed: 0', 'Unnamed: 0.1', 'Unnamed: 0.1.1'], axis = 'columns', inplace = True)

In [64]:
main_df.head()

Unnamed: 0,zip_code,count,latitude,longitude,Price_1,Price_2,Price_3,Price_4,Rating_1,Rating_1.5,...,wineries,winetasteclasses,winetastingroom,winetours,womenscloth,wraps,yelpevents,yoga,zoos,income
0,90001,30,33.97653,-118.24923,18,12,0,0,0,0,...,0,0,0,0,0,0,0,0,0,35660
1,90002,4,33.948102,-118.248582,3,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,34000
2,90003,20,33.971906,-118.272539,16,4,0,0,0,0,...,0,0,0,0,0,0,0,0,0,34397
3,90004,29,34.076563,-118.310331,10,19,0,0,0,0,...,0,0,0,0,0,0,0,0,0,46581
4,90005,45,34.059083,-118.298835,11,33,1,0,0,0,...,0,0,0,0,0,0,0,0,0,32461


In [69]:
all_columns = list(main_df.columns)

In [67]:
#dropping unnecessary columns
main_df.drop(labels = ['Unnamed: 0.2', 'zip_code.1'], axis = 'columns', inplace = True)

In [70]:
all_columns

['zip_code',
 'count',
 'latitude',
 'longitude',
 'Price_1',
 'Price_2',
 'Price_3',
 'Price_4',
 'Rating_1',
 'Rating_1.5',
 'Rating_2',
 'Rating_2.5',
 'Rating_3',
 'Rating_3.5',
 'Rating_4',
 'Rating_4.5',
 'Rating_5',
 'count_businesses',
 'city',
 'category1',
 'category2',
 'category3',
 'category4',
 'category5',
 'all',
 'list_cat',
 'cat_list',
 'acaibowls',
 'accessories',
 'active',
 'acupuncture',
 'adultedu',
 'afghani',
 'african',
 'airportlounges',
 'amusementparks',
 'antiques',
 'aquariums',
 'arabian',
 'arcades',
 'argentine',
 'armenian',
 'artclasses',
 'artmuseums',
 'arts',
 'artsandcrafts',
 'artschools',
 'artsupplies',
 'asianfusion',
 'attractionfarms',
 'australian',
 'austrian',
 'ayurveda',
 'bagels',
 'bakeries',
 'balloonservices',
 'bangladeshi',
 'banks',
 'barbers',
 'bars',
 'bartenders',
 'basque',
 'bbq',
 'beachequipmentrental',
 'beaches',
 'bedbreakfast',
 'beer_and_wine',
 'beerbar',
 'beergardens',
 'beertours',
 'belgian',
 'beverage_stores

In [72]:
main_df.shape

(2423, 496)

In [77]:
#sanity
468+4+13+11

496

In [78]:
#sanity (total zip codes - $0 zip codes)
2423-50

2373

In [80]:
main_df['income'].dtype

dtype('int64')

In [81]:
#creating df where we have a valid median income
df_2373 = main_df[main_df['income'] != 0]


In [82]:
#sanity
df_2373.shape

(2373, 496)

In [83]:
#just want to see the poorest zip code
df_2373[df_2373['income'] <3000]

Unnamed: 0,zip_code,count,latitude,longitude,Price_1,Price_2,Price_3,Price_4,Rating_1,Rating_1.5,...,wineries,winetasteclasses,winetastingroom,winetours,womenscloth,wraps,yelpevents,yoga,zoos,income
783,93641,1,36.697228,-119.014813,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2500


# Checking if rows have correct categories 

# Sanity Check part II
- To make sure everything is correct

In [84]:
df_2373.reset_index(inplace = True)

In [85]:
df_2373.head()

Unnamed: 0,index,zip_code,count,latitude,longitude,Price_1,Price_2,Price_3,Price_4,Rating_1,...,wineries,winetasteclasses,winetastingroom,winetours,womenscloth,wraps,yelpevents,yoga,zoos,income
0,0,90001,30,33.97653,-118.24923,18,12,0,0,0,...,0,0,0,0,0,0,0,0,0,35660
1,1,90002,4,33.948102,-118.248582,3,1,0,0,0,...,0,0,0,0,0,0,0,0,0,34000
2,2,90003,20,33.971906,-118.272539,16,4,0,0,0,...,0,0,0,0,0,0,0,0,0,34397
3,3,90004,29,34.076563,-118.310331,10,19,0,0,0,...,0,0,0,0,0,0,0,0,0,46581
4,4,90005,45,34.059083,-118.298835,11,33,1,0,0,...,0,0,0,0,0,0,0,0,0,32461


In [86]:
#this is the unique categories at index (aka zip code) 666
df_2373['cat_list'][666]

"['?' 'icecream' 'juicebars' 'mexican' 'sandwiches']"

In [87]:
#checking specific row where cells ==1 (whihc returns column names)
df_2373.columns[(df_2373 == 1).iloc[666]]

#it matches!

Index(['Price_1', 'Price_2', 'Rating_4', 'Rating_4.5', 'icecream', 'juicebars',
       'mexican', 'sandwiches'],
      dtype='object')

In [88]:
df_2373['count'][666]
#number of businesses is accurate 

2

In [96]:
df_2373.loc[666:666]
#income is accurate
#number of business is accurate 

Unnamed: 0,index,zip_code,count,latitude,longitude,Price_1,Price_2,Price_3,Price_4,Rating_1,...,wineries,winetasteclasses,winetastingroom,winetours,womenscloth,wraps,yelpevents,yoga,zoos,income
666,673,93267,2,36.145955,-119.068134,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,31959


In [99]:
df_2373.drop(labels = 'index', axis = 'columns', inplace= True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [100]:
df_2373.shape

(2373, 496)

# Since we have the `MOTHEROFMOTHER.csv`, we're calling this dataframe `Grandma.csv`

In [101]:
df_2373.to_csv('./datasets/Grandma.csv')

# Now it's modeling time!