In [5]:
# Loading libraries (tools)
import numpy as np # linear algebra
import pandas as pd # data processing
import matplotlib as mpl # primitive data visualization
import matplotlib.pyplot as plt # primitive plotting tool
import seaborn as sns # modern plotting tool

# First section: Loading Data

In [14]:
# Reference: 
# Overall project: https://www.kaggle.com/dgomonov/data-exploration-on-nyc-airbnb
# Read text file: https://www.geeksforgeeks.org/reading-writing-text-files-python/
# Read csv file: https://www.geeksforgeeks.org/python-read-csv-using-pandas-read_csv/
# In this tutorial we will focus on reading only the csv data

# define path to data (relative to current notebook folder)
data_file = './AB_NYC_2019.csv'

# create a variable container to store the data read from the csv file
# pandas package provides nice library to do this task for us.
airbnb = pd.read_csv(data_file)

# Getting the Feeling of Data

In [15]:
# View summarny of data
print(airbnb.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 16 columns):
id                                48895 non-null int64
name                              48879 non-null object
host_id                           48895 non-null int64
host_name                         48874 non-null object
neighbourhood_group               48895 non-null object
neighbourhood                     48895 non-null object
latitude                          48895 non-null float64
longitude                         48895 non-null float64
room_type                         48895 non-null object
price                             48895 non-null int64
minimum_nights                    48895 non-null int64
number_of_reviews                 48895 non-null int64
last_review                       38843 non-null object
reviews_per_month                 38843 non-null float64
calculated_host_listings_count    48895 non-null int64
availability_365                  48895 non-null int64

In [39]:
# We can see that some columns do not have the same number of records as the others
# The reason may be empty space or not-available (na) values.
# Sometimes, we use NaN (not-a-number) to define those values.
# We can count them using the following methods

airbnb.isnull().sum(axis=0)

id                                    0
name                                 16
host_id                               0
host_name                            21
boroughs                              0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [41]:
# We can also do this
pd.isnull(airbnb).sum()

id                                    0
name                                 16
host_id                               0
host_name                            21
boroughs                              0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [43]:
# We can count the number of NA values for a specific column
airbnb['last_review'].isnull().sum()

10052

In [40]:
# We can also count the NA values in each row (along a row)
airbnb.isnull().sum(axis=1)

0        0
1        0
2        2
3        0
4        0
5        0
6        0
7        0
8        0
9        0
10       0
11       0
12       0
13       0
14       0
15       0
16       0
17       0
18       0
19       2
20       0
21       0
22       0
23       0
24       0
25       0
26       2
27       0
28       0
29       0
        ..
48865    2
48866    2
48867    2
48868    2
48869    2
48870    2
48871    2
48872    2
48873    2
48874    2
48875    2
48876    2
48877    2
48878    2
48879    2
48880    2
48881    2
48882    2
48883    2
48884    2
48885    2
48886    2
48887    2
48888    2
48889    2
48890    2
48891    2
48892    2
48893    2
48894    2
Length: 48895, dtype: int64

In [190]:
airbnb.isna()

Unnamed: 0,id,name,host_id,host_name,boroughs,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
3,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [191]:
airbnb.isna().sum()

id                                    0
name                                 16
host_id                               0
host_name                            21
boroughs                              0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                     0
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [11]:
# View statistical summary of data
print(airbnb.describe())

                 id       host_id      latitude     longitude         price  \
count  4.889500e+04  4.889500e+04  48895.000000  48895.000000  48895.000000   
mean   1.901714e+07  6.762001e+07     40.728949    -73.952170    152.720687   
std    1.098311e+07  7.861097e+07      0.054530      0.046157    240.154170   
min    2.539000e+03  2.438000e+03     40.499790    -74.244420      0.000000   
25%    9.471945e+06  7.822033e+06     40.690100    -73.983070     69.000000   
50%    1.967728e+07  3.079382e+07     40.723070    -73.955680    106.000000   
75%    2.915218e+07  1.074344e+08     40.763115    -73.936275    175.000000   
max    3.648724e+07  2.743213e+08     40.913060    -73.712990  10000.000000   

       minimum_nights  number_of_reviews  reviews_per_month  \
count    48895.000000       48895.000000       38843.000000   
mean         7.029962          23.274466           1.373221   
std         20.510550          44.550582           1.680442   
min          1.000000           0.00

In [17]:
# View a part of actual data (10 rows)
airbnb.head(10)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0
5,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,200,3,74,2019-06-22,0.59,1,129
6,5121,BlissArtsSpace!,7356,Garon,Brooklyn,Bedford-Stuyvesant,40.68688,-73.95596,Private room,60,45,49,2017-10-05,0.4,1,0
7,5178,Large Furnished Room Near B'way,8967,Shunichi,Manhattan,Hell's Kitchen,40.76489,-73.98493,Private room,79,2,430,2019-06-24,3.47,1,220
8,5203,Cozy Clean Guest Room - Family Apt,7490,MaryEllen,Manhattan,Upper West Side,40.80178,-73.96723,Private room,79,2,118,2017-07-21,0.99,1,0
9,5238,Cute & Cozy Lower East Side 1 bdrm,7549,Ben,Manhattan,Chinatown,40.71344,-73.99037,Entire home/apt,150,1,160,2019-06-09,1.33,4,188


In [19]:
airbnb.tail(10)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
48885,36482809,Stunning Bedroom NYC! Walking to Central Park!!,131529729,Kendall,Manhattan,East Harlem,40.79633,-73.93605,Private room,75,2,0,,,2,353
48886,36483010,Comfy 1 Bedroom in Midtown East,274311461,Scott,Manhattan,Midtown,40.75561,-73.96723,Entire home/apt,200,6,0,,,1,176
48887,36483152,Garden Jewel Apartment in Williamsburg New York,208514239,Melki,Brooklyn,Williamsburg,40.71232,-73.9422,Entire home/apt,170,1,0,,,3,365
48888,36484087,"Spacious Room w/ Private Rooftop, Central loca...",274321313,Kat,Manhattan,Hell's Kitchen,40.76392,-73.99183,Private room,125,4,0,,,1,31
48889,36484363,QUIT PRIVATE HOUSE,107716952,Michael,Queens,Jamaica,40.69137,-73.80844,Private room,65,1,0,,,2,163
48890,36484665,Charming one bedroom - newly renovated rowhouse,8232441,Sabrina,Brooklyn,Bedford-Stuyvesant,40.67853,-73.94995,Private room,70,2,0,,,2,9
48891,36485057,Affordable room in Bushwick/East Williamsburg,6570630,Marisol,Brooklyn,Bushwick,40.70184,-73.93317,Private room,40,4,0,,,2,36
48892,36485431,Sunny Studio at Historical Neighborhood,23492952,Ilgar & Aysel,Manhattan,Harlem,40.81475,-73.94867,Entire home/apt,115,10,0,,,1,27
48893,36485609,43rd St. Time Square-cozy single bed,30985759,Taz,Manhattan,Hell's Kitchen,40.75751,-73.99112,Shared room,55,1,0,,,6,2
48894,36487245,Trendy duplex in the very heart of Hell's Kitchen,68119814,Christophe,Manhattan,Hell's Kitchen,40.76404,-73.98933,Private room,90,7,0,,,1,23


In [14]:
# check out the size of airbnb data
print(len(airbnb))

48895


In [21]:
# format the output to be user friendly
print('The size of the NYC airbnb data is ' + str(len(airbnb)) + '.')
print('The size of the NYC airbnb data is',len(airbnb),'.')

The size of the NYC airbnb data is 48895.
The size of the NYC airbnb data is 48895 .


In [22]:
# list the column names of the airbnb data
# Reference: https://www.geeksforgeeks.org/how-to-get-column-names-in-pandas-dataframe/

print(airbnb.columns)

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')


In [23]:
# iterate the columns so that the list can be displayed nicely
for col in airbnb.columns: 
    print(col) 

id
name
host_id
host_name
neighbourhood_group
neighbourhood
latitude
longitude
room_type
price
minimum_nights
number_of_reviews
last_review
reviews_per_month
calculated_host_listings_count
availability_365


In [24]:
# casting the array into list and then display
airbnb_column_list = list(airbnb.columns)
print(airbnb_column_list)

['id', 'name', 'host_id', 'host_name', 'neighbourhood_group', 'neighbourhood', 'latitude', 'longitude', 'room_type', 'price', 'minimum_nights', 'number_of_reviews', 'last_review', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365']


In [25]:
# showing list values - only for Jupyter notebook
airbnb_column_list

['id',
 'name',
 'host_id',
 'host_name',
 'neighbourhood_group',
 'neighbourhood',
 'latitude',
 'longitude',
 'room_type',
 'price',
 'minimum_nights',
 'number_of_reviews',
 'last_review',
 'reviews_per_month',
 'calculated_host_listings_count',
 'availability_365']

# Glancing at Data

In [27]:
# change column names and index of records
# Reference: https://www.geeksforgeeks.org/python-change-column-names-and-row-indexes-in-pandas-dataframe/

# changing a single column name
airbnb = airbnb.rename(columns = {'neighbourhood_group':'boroughs'})
airbnb_column_list = list(airbnb.columns)
airbnb_column_list

['id',
 'name',
 'host_id',
 'host_name',
 'boroughs',
 'neighbourhood',
 'latitude',
 'longitude',
 'room_type',
 'price',
 'minimum_nights',
 'number_of_reviews',
 'last_review',
 'reviews_per_month',
 'calculated_host_listings_count',
 'availability_365']

In [30]:
# sort records by values in a column
# Reference: https://www.geeksforgeeks.org/sorting-rows-in-pandas-dataframe/

# sort the dataframe's records by column boroughs
# in ascending order
airbnb_by_borough = airbnb.sort_values(by = 'boroughs', ascending = 1)
airbnb_by_borough.head(10)

Unnamed: 0,id,name,host_id,host_name,boroughs,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
7704,5824543,Private room for 2 (10 min to city) - Females ...,30232055,Qasim,Bronx,Longwood,40.82209,-73.90086,Private room,80,1,2,2019-04-30,0.32,1,255
45457,34756976,Gigi’s Room,74633496,Justine,Bronx,University Heights,40.85689,-73.9091,Private room,40,5,0,,,5,362
32797,25833266,Huge private & cozy room in the Bronx!,194102474,Digna,Bronx,Claremont Village,40.84346,-73.91151,Private room,35,1,49,2019-01-23,3.84,1,168
24924,19974905,Esteem's Place,141615596,Esteem,Bronx,Parkchester,40.83805,-73.85867,Shared room,26,1,18,2019-04-23,0.78,2,342
32784,25816034,Bronx 2 Bedroom with View of Manhattan Skyline,179677211,Bettina,Bronx,Van Nest,40.84778,-73.86146,Entire home/apt,150,1,29,2019-06-03,2.23,3,24
32781,25815620,Bronx 1 bedroom studio apartment,179677211,Bettina,Bronx,Van Nest,40.84787,-73.86177,Entire home/apt,107,1,47,2019-06-23,3.63,3,8
36225,28810799,Spacious Comfy livingroom Lounge spot in Bronx...,217290804,Robert,Bronx,Morris Park,40.85661,-73.85281,Shared room,40,1,2,2018-10-29,0.23,1,88
45477,34781680,"A lovely place of Zen, sunny, clean & comforta...",41926423,Rl,Bronx,Longwood,40.81795,-73.91178,Shared room,85,1,0,,,1,77
45478,34782691,Mott Haven Dorm EE,30509656,Orit,Bronx,Port Morris,40.80868,-73.93015,Shared room,28,1,3,2019-06-21,2.65,8,362
8733,6705604,Beautiful Private Bedroom in NY,35103293,Maria,Bronx,Norwood,40.87225,-73.88748,Private room,33,7,40,2018-11-27,0.82,1,0


In [31]:
# see the end of the sorted data
airbnb_by_borough.tail(10)

Unnamed: 0,id,name,host_id,host_name,boroughs,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
32530,25501491,"Great apartment in staten island, New York.",192449075,Alarape,Staten Island,Graniteville,40.62439,-74.16634,Entire home/apt,115,1,36,2019-06-29,2.67,1,263
23420,18958323,Tranquility,21190402,David,Staten Island,Emerson Hill,40.60577,-74.12819,Private room,85,1,0,,,1,0
5387,3892700,Cozy 1BR-Country living in NYC!,46969,Aurelie,Staten Island,Clifton,40.6205,-74.07451,Entire home/apt,70,2,242,2019-06-30,4.09,1,256
23449,18993379,Home &Garden: lots local+fast 2 Manhattan/Broo...,132266502,Sharlene,Staten Island,Westerleigh,40.61357,-74.13566,Entire home/apt,103,3,1,2018-07-29,0.09,1,189
19899,15949915,"Quiet, Private 3.5 rooms, Manhattan convenient",103545877,Jon,Staten Island,Oakwood,40.56233,-74.12673,Entire home/apt,100,2,2,2017-01-30,0.06,1,36
23460,18997371,Cozy Getaway,90104417,Sueann,Staten Island,Tottenville,40.50873,-74.23914,Entire home/apt,85,2,49,2019-07-01,2.08,2,159
43687,33804180,Cozy farmhouse in NYC. 2 bdrms; 2 full baths. ...,163169045,Janice,Staten Island,Prince's Bay,40.52293,-74.21238,Entire home/apt,85,2,8,2019-06-24,3.33,1,66
45774,34945914,"""Dave's Island Suite""",258232863,David,Staten Island,New Springville,40.58647,-74.15954,Entire home/apt,68,2,8,2019-06-30,5.33,1,4
10874,8387338,Private Room Available in 2BedApt,44195923,Dawn,Staten Island,Tompkinsville,40.63378,-74.08726,Private room,50,1,0,,,1,0
19778,15833574,Delightful studio apartment.,102526590,Edward,Staten Island,Castleton Corners,40.62063,-74.13001,Entire home/apt,65,2,52,2019-06-30,1.95,1,40


In [37]:
# sort rows by Boroughs and Neighbourhood
# in ascending order (fixed)
airbnb_by_borough_neighbourhood = airbnb.sort_values(by = ['boroughs','neighbourhood'])
airbnb_by_borough_neighbourhood.head(10)

Unnamed: 0,id,name,host_id,host_name,boroughs,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
510,182177,A PRIVATE FLAT / APARTMENT- $SPECIAL$,873273,Christian & Carla,Bronx,Allerton,40.86466,-73.85709,Entire home/apt,125,2,271,2019-06-20,2.84,2,347
1167,498120,Hi Traveler.. welcome,2459648,Ellen,Bronx,Allerton,40.8687,-73.8524,Private room,35,7,2,2018-07-23,0.17,1,90
1578,715270,2 Beds/Queen & Full Beautiful Room 40 minsT.Sq...,3684360,Enrique,Bronx,Allerton,40.85956,-73.87067,Private room,39,2,169,2019-06-12,2.07,4,306
1666,755528,PRIVATE BATH/TONS OF SUNLIGHT/SAFE,3684360,Enrique,Bronx,Allerton,40.8584,-73.86969,Entire home/apt,49,2,189,2019-06-23,2.32,4,238
1706,773041,Nice beautiful room In the Bronx,3684360,Enrique,Bronx,Allerton,40.85914,-73.86979,Private room,38,1,187,2019-06-23,2.34,4,241
4795,3400359,Awesome Deal NYC,16286162,Pat,Bronx,Allerton,40.86677,-73.85938,Private room,49,2,114,2019-06-29,1.87,4,240
4826,3429765,Sunny Private Room,16286162,Pat,Bronx,Allerton,40.86689,-73.85776,Private room,47,2,75,2018-07-05,1.27,4,172
6023,4407790,Retreat Room,16286162,Pat,Bronx,Allerton,40.86682,-73.85812,Private room,49,2,42,2019-06-21,0.74,4,227
6097,4462008,Twin room,16286162,Pat,Bronx,Allerton,40.86814,-73.85874,Private room,47,2,97,2019-06-12,1.81,4,262
7018,5046189,Modern/Spacious 4BR +Tent Perfect for LARGE gr...,23732730,Buddy,Bronx,Allerton,40.87054,-73.84681,Entire home/apt,450,2,45,2019-07-02,0.84,4,342


In [44]:
# we want to show na values first
airbnb_by_borough_neighbourhood = airbnb.sort_values(by = ['last_review'], na_position = 'first')
airbnb_by_borough_neighbourhood.head(10)

Unnamed: 0,id,name,host_id,host_name,boroughs,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
19,7750,Huge 2 BR Upper East Cental Park,17985,Sing,Manhattan,East Harlem,40.79685,-73.94872,Entire home/apt,190,7,0,,,2,249
26,8700,Magnifique Suite au N de Manhattan - vue Cloitres,26394,Claude & Sophie,Manhattan,Inwood,40.86754,-73.92639,Private room,80,4,0,,,1,0
36,11452,Clean and Quiet in Brooklyn,7355,Vt,Brooklyn,Bedford-Stuyvesant,40.68876,-73.94312,Private room,35,60,0,,,1,365
38,11943,Country space in the city,45445,Harriet,Brooklyn,Flatbush,40.63702,-73.96327,Private room,150,1,0,,,1,365
193,51438,1 Bedroom in 2 Bdrm Apt- Upper East,236421,Jessica,Manhattan,Upper East Side,40.77333,-73.95199,Private room,130,14,0,,,2,0
204,54466,Beautiful Uptown Manhattan apartmnt,253385,Douglas,Manhattan,Harlem,40.80234,-73.95603,Private room,200,30,0,,,1,365
260,63588,LL3,295128,Carol Gloria,Bronx,Clason Point,40.81309,-73.85514,Private room,90,2,0,,,7,349
265,63913,"HOSTING YOUR SUNNY, SPACIOUS NYC ROOM",312288,Paula,Manhattan,Inwood,40.86648,-73.9263,Private room,75,7,0,,,2,323
267,64015,Prime East Village 1 Bedroom,146944,David,Manhattan,East Village,40.72807,-73.98594,Entire home/apt,200,3,0,,,1,0


# Data Wrangling

In [47]:
# We can change the value of NaN to 0.
# first check the number of NaN
airbnb['reviews_per_month'].isnull().sum()

10052

In [48]:
# change the value of NaN to 0
airbnb.fillna({'reviews_per_month':0},inplace=True)

In [49]:
# check the number of NaN again
airbnb['reviews_per_month'].isnull().sum()

0

In [50]:
# We can figure out the unique values of boroughs (originally neighbourhood_group)
airbnb['boroughs'].count()

48895

In [51]:
airbnb['boroughs'].unique()

array(['Brooklyn', 'Manhattan', 'Queens', 'Staten Island', 'Bronx'],
      dtype=object)

In [52]:
# displaying it nicely
print(airbnb['boroughs'].unique())

['Brooklyn' 'Manhattan' 'Queens' 'Staten Island' 'Bronx']


In [55]:
# We can do the same for the neighbourhood
print(airbnb['neighbourhood'].unique())

['Kensington' 'Midtown' 'Harlem' 'Clinton Hill' 'East Harlem'
 'Murray Hill' 'Bedford-Stuyvesant' "Hell's Kitchen" 'Upper West Side'
 'Chinatown' 'South Slope' 'West Village' 'Williamsburg' 'Fort Greene'
 'Chelsea' 'Crown Heights' 'Park Slope' 'Windsor Terrace' 'Inwood'
 'East Village' 'Greenpoint' 'Bushwick' 'Flatbush' 'Lower East Side'
 'Prospect-Lefferts Gardens' 'Long Island City' 'Kips Bay' 'SoHo'
 'Upper East Side' 'Prospect Heights' 'Washington Heights' 'Woodside'
 'Brooklyn Heights' 'Carroll Gardens' 'Gowanus' 'Flatlands' 'Cobble Hill'
 'Flushing' 'Boerum Hill' 'Sunnyside' 'DUMBO' 'St. George' 'Highbridge'
 'Financial District' 'Ridgewood' 'Morningside Heights' 'Jamaica'
 'Middle Village' 'NoHo' 'Ditmars Steinway' 'Flatiron District'
 'Roosevelt Island' 'Greenwich Village' 'Little Italy' 'East Flatbush'
 'Tompkinsville' 'Astoria' 'Clason Point' 'Eastchester' 'Kingsbridge'
 'Two Bridges' 'Queens Village' 'Rockaway Beach' 'Forest Hills' 'Nolita'
 'Woodlawn' 'University Heights' '

# Dealing with Rows and Columns

In [21]:
# Reference: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html

# Dealing with rows and columns are fundamental to data exploration process
# Let's work with a demo dataset for now.  

# Define an employee dataset
employee_data = {'Name':['Jai', 'Princi', 'Gaurav', 'Anuj'], 
        'Age':[27, 24, 22, 32], 
        'Address':['Delhi', 'Kanpur', 'Allahabad', 'Kannauj'], 
        'Qualification':['Msc', 'MA', 'MCA', 'Phd']} 
  
# Convert the dictionary into DataFrame  
employee_df = pd.DataFrame(employee_data)

In [22]:
# showing 2 columns: 'Name' and 'Qualification'
print(employee_df[['Name','Qualification']])

# Note that we need [[]] because Python is an old language.  
# The array subscript is defined to take only one pointer.
# So, we first have to pack 'Name' and 'Qualification' in one single unit
# That's why we need the first []
# The second [] is used for telling Python that we are going to select a subset of dataframe.

     Name Qualification
0     Jai           Msc
1  Princi            MA
2  Gaurav           MCA
3    Anuj           Phd


In [23]:
# If we don't want to write a lot all the time to represent this data containing these 2 columns
# We can copy this to a new dataframe variable
employee_name_qualification_df = employee_df[['Name','Qualification']]
print(employee_name_qualification_df)

     Name Qualification
0     Jai           Msc
1  Princi            MA
2  Gaurav           MCA
3    Anuj           Phd


In [24]:
# The employee_df is still kept intact
print(employee_df)

     Name  Age    Address Qualification
0     Jai   27      Delhi           Msc
1  Princi   24     Kanpur            MA
2  Gaurav   22  Allahabad           MCA
3    Anuj   32    Kannauj           Phd


In [25]:
# If we make changes to the employee_name_qualification_df), see what happen to employee_df
print(employee_name_qualification_df['Name'][0])

Jai


In [27]:
employee_name_qualification_df['Name'][0] = 'Ja'

In [67]:
print(employee_df)

     Name  Age    Address Qualification
0     Jai   27      Delhi           Msc
1  Princi   24     Kanpur            MA
2  Gaurav   22  Allahabad           MCA
3    Anuj   32    Kannauj           Phd


In [69]:
print(employee_name_qualification_df)

     Name Qualification
0      Ja           Msc
1  Princi            MA
2  Gaurav           MCA
3    Anuj           Phd


In [70]:
employee_df.loc[0]

Name               Jai
Age                 27
Address          Delhi
Qualification      Msc
Name: 0, dtype: object

In [97]:
print(employee_df.loc[2])

Name                Gaurav
Age                     22
Address          Allahabad
Qualification          MCA
Name: 2, dtype: object


In [72]:
employee_df.iloc[2]

Name                Gaurav
Age                     22
Address          Allahabad
Qualification          MCA
Name: 2, dtype: object

In [73]:
employee_df.iloc[[0,2,1]]

Unnamed: 0,Name,Age,Address,Qualification
0,Jai,27,Delhi,Msc
2,Gaurav,22,Allahabad,MCA
1,Princi,24,Kanpur,MA


In [99]:
employee_df[1:3]

Unnamed: 0,Name,Age,Address,Qualification
1,Princi,24,Kanpur,MA
2,Gaurav,22,Allahabad,MCA


In [101]:
employee_df[1:2]

Unnamed: 0,Name,Age,Address,Qualification
1,Princi,24,Kanpur,MA


In [102]:
employee_df['Age']

0    27
1    24
2    22
3    32
Name: Age, dtype: int64

In [87]:
employee_df['Age'][1:3]

1    24
2    22
Name: Age, dtype: int64

In [88]:
employee_df[1:3]['Age']

1    24
2    22
Name: Age, dtype: int64

In [89]:
employee_df[['Age','Name']]

Unnamed: 0,Age,Name
0,27,Jai
1,24,Princi
2,22,Gaurav
3,32,Anuj


In [103]:
employee_df[['Name','Age']][0:2]

Unnamed: 0,Name,Age
0,Jai,27
1,Princi,24


# Selecting with Conditions

In [28]:
# Reference: https://www.geeksforgeeks.org/selecting-rows-in-pandas-dataframe-based-on-conditions/

# create a sample dataframe
student_record = { 
  
 'Name': ['Ankit', 'Amit', 'Aishwarya', 'Priyanka', 'Priya', 'Shaurya' ], 
 'Age': [21, 19, 20, 18, 17, 21], 
 'Subject': ['Math', 'Commerce', 'Science', 'Math', 'Math', 'Science'], 
 'Percentage': [88, 92, 95, 70, 65, 78] } 
  
# create a dataframe 
student_df = pd.DataFrame(student_record, columns = ['Name', 'Age', 'Subject', 'Percentage']) 
  
print("Given Dataframe :\n", student_df)  
 

Given Dataframe :
         Name  Age   Subject  Percentage
0      Ankit   21      Math          88
1       Amit   19  Commerce          92
2  Aishwarya   20   Science          95
3   Priyanka   18      Math          70
4      Priya   17      Math          65
5    Shaurya   21   Science          78


In [29]:
# Select only students or records that have percentage > 80 (got an A)

A_student_df = student_df[student_df['Percentage'] > 80]
A_student_df

Unnamed: 0,Name,Age,Subject,Percentage
0,Ankit,21,Math,88
1,Amit,19,Commerce,92
2,Aishwarya,20,Science,95


In [30]:
A_student_df = student_df[student_df['Percentage'] == 95]
A_student_df

Unnamed: 0,Name,Age,Subject,Percentage
2,Aishwarya,20,Science,95


In [31]:
# Selecting with AND conditions
# Reference: https://www.geeksforgeeks.org/selecting-rows-in-pandas-dataframe-based-on-conditions/

B_student_df = student_df[(student_df['Percentage'] < 80) & (student_df['Percentage'] >= 70)]

B_student_df

Unnamed: 0,Name,Age,Subject,Percentage
3,Priyanka,18,Math,70
5,Shaurya,21,Science,78


In [48]:
# Selecting rows based on condition
options = ['Math','Commerce']

Math_Commerce_students_df = student_df[student_df['Subject'].isin(options)]

Math_Commerce_students_df

Unnamed: 0,Name,Age,Subject,Percentage
0,Ankit,21,Math,88
1,Amit,19,Commerce,92
3,Priyanka,18,Math,70
4,Priya,17,Math,65


In [49]:
# We can also use loc to identify the rows - loc is used with non-number-indexes
options = ['Math','Commerce']

Math_Commerce_students_df = student_df.loc[student_df['Subject'].isin(options)]

Math_Commerce_students_df

Unnamed: 0,Name,Age,Subject,Percentage
0,Ankit,21,Math,88
1,Amit,19,Commerce,92
3,Priyanka,18,Math,70
4,Priya,17,Math,65


In [50]:
# We want to select students who do not take Math and Commerce
options = ['Math','Commerce']

Math_Commerce_students_df = student_df[~student_df['Subject'].isin(options)]

Math_Commerce_students_df

Unnamed: 0,Name,Age,Subject,Percentage
2,Aishwarya,20,Science,95
5,Shaurya,21,Science,78


In [51]:
# We want to see students who take Math and have score less than 70
options = ['Math']

C_Math_Commerce_students_df = student_df[(student_df['Subject'].isin(options))&(student_df['Percentage']<70)]

C_Math_Commerce_students_df

Unnamed: 0,Name,Age,Subject,Percentage
4,Priya,17,Math,65


In [59]:
# Swapping Columns
# Sometimes we want to swap columns for easy reading

Math_Commerce_student_df['Name']

NameError: name 'Math_Commerce_student_df' is not defined

# Adding new Columns to Dataframe

In [54]:
# Reference: https://www.geeksforgeeks.org/adding-new-column-to-existing-dataframe-in-pandas/

# Recall student score data
student_df

Unnamed: 0,Name,Age,Subject,Percentage
0,Ankit,21,Math,88
1,Amit,19,Commerce,92
2,Aishwarya,20,Science,95
3,Priyanka,18,Math,70
4,Priya,17,Math,65
5,Shaurya,21,Science,78


In [56]:
# We want to add grade to the dataframe

student_grades = ['A','A','A','B','C','B']
student_grades

['A', 'A', 'A', 'B', 'C', 'B']

In [57]:
# We can add student grades as a new column by just add the dataframe to the student_df with new column name
student_df['Grade'] = student_grades

student_df

Unnamed: 0,Name,Age,Subject,Percentage,Grade
0,Ankit,21,Math,88,A
1,Amit,19,Commerce,92,A
2,Aishwarya,20,Science,95,A
3,Priyanka,18,Math,70,B
4,Priya,17,Math,65,C
5,Shaurya,21,Science,78,B


# Data Transformation

In [60]:
# Say a teacher would like to add bonus score of 5 to all students.  
# The transformation can be performed using transform function as follows
student_w_bonus_df = student_df.copy()

student_w_bonus_df['Percentage'] = student_w_bonus_df['Percentage'] + 5
student_w_bonus_df

Unnamed: 0,Name,Age,Subject,Percentage,Grade
0,Ankit,21,Math,93,A
1,Amit,19,Commerce,97,A
2,Aishwarya,20,Science,100,A
3,Priyanka,18,Math,75,B
4,Priya,17,Math,70,C
5,Shaurya,21,Science,83,B


In [126]:
# We can also take a specific function to a column
# Reference: https://stackoverflow.com/questions/42007318/pandas-apply-a-specific-function-to-columns-and-create-other-columns

student_wo_bonus_df = student_df.copy()

# Define a function / procedure to act on the input value
# In this case, reduce the value by 5
def foo(x):
    return x-5

student_wo_bonus_df['Percentage'] = student_wo_bonus_df.apply(lambda x: foo(x['Percentage']),axis = 1)

# show the result dataframe
student_wo_bonus_df

Unnamed: 0,Name,Age,Subject,Percentage,Grade
0,Ankit,21,Math,83,A
1,Amit,19,Commerce,87,A
2,Aishwarya,20,Science,90,A
3,Priyanka,18,Math,65,B
4,Priya,17,Math,60,C
5,Shaurya,21,Science,73,B


In [133]:
# We would like to drop/delete some rows with condition
# Usually we don't recommend any deletion on the original dataset
# We recommend copy a selcted slice of dataset

temp_df = student_df.copy()

# Get the index of the rows that meet the condition
dropindex = temp_df[temp_df['Percentage'] < 80].index

# Delete these row indexes from the dataframe
temp_df.drop(dropindex , inplace=True)

# show the result
temp_df

Unnamed: 0,Name,Age,Subject,Percentage,Grade
0,Ankit,21,Math,88,A
1,Amit,19,Commerce,92,A
2,Aishwarya,20,Science,95,A


In [62]:
# Dropping a column can be done as follows

temp2_df = student_df.copy()

temp2_df.drop('Grade', axis=1)

temp2_df

Unnamed: 0,Name,Age,Subject,Percentage,Grade
0,Ankit,21,Math,88,A
1,Amit,19,Commerce,92,A
2,Aishwarya,20,Science,95,A
3,Priyanka,18,Math,70,B
4,Priya,17,Math,65,C
5,Shaurya,21,Science,78,B


In [63]:
# We see that nothing is changed because 
# the drop method does not make any change to the original dataframe
# You will need to assign the variable holder to receive the result

# This is a *preferred* way of dropping a column from a dataframe
temp2_df = student_df.copy()

temp2_df = temp2_df.drop('Grade', axis=1)

temp2_df

Unnamed: 0,Name,Age,Subject,Percentage
0,Ankit,21,Math,88
1,Amit,19,Commerce,92
2,Aishwarya,20,Science,95
3,Priyanka,18,Math,70
4,Priya,17,Math,65
5,Shaurya,21,Science,78


In [141]:
# We can drop multiple columns as follows
temp2_df = student_df.copy()

temp2_df = temp2_df.drop(['Percentage','Grade'], axis=1)

temp2_df

Unnamed: 0,Name,Age,Subject
0,Ankit,21,Math
1,Amit,19,Commerce
2,Aishwarya,20,Science
3,Priyanka,18,Math
4,Priya,17,Math
5,Shaurya,21,Science


In [142]:
# A cleaner way to do this is as follows
temp2_df = student_df.copy()

columns_to_drop = ['Percentage', 'Grade']
temp2_df = temp2_df.drop(columns_to_drop, axis=1)

temp2_df

Unnamed: 0,Name,Age,Subject
0,Ankit,21,Math
1,Amit,19,Commerce
2,Aishwarya,20,Science
3,Priyanka,18,Math
4,Priya,17,Math
5,Shaurya,21,Science


In [147]:
# We can also drop multiple columns using indexes
# Reference: https://stackoverflow.com/questions/13411544/delete-column-from-pandas-dataframe

temp2_df = student_df.copy()

temp2_df = temp2_df.drop(temp2_df.columns[[3,4]], axis=1)

temp2_df

Unnamed: 0,Name,Age,Subject
0,Ankit,21,Math
1,Amit,19,Commerce
2,Aishwarya,20,Science
3,Priyanka,18,Math
4,Priya,17,Math
5,Shaurya,21,Science


In [146]:
# Revisit the grade generation
# We can do this

def calculate_grade(p):
    grade = ''
    if (p >= 80):
        grade = 'A'
    elif ((p < 80) & (p >= 70)):
        grade = 'B'
    elif ((p < 70) & (p >= 60)):
        grade = 'C'
    else:
        grade = 'F'
    return grade

temp_df = student_df.copy()

# remove the 'Grade column'
temp_df = temp_df.drop('Grade', axis=1)
print(temp_df)

# add the 'Grade column using our method'

temp_df['Grade'] = temp_df.apply(lambda x: calculate_grade(x['Percentage']),axis = 1)

temp_df

        Name  Age   Subject  Percentage
0      Ankit   21      Math          88
1       Amit   19  Commerce          92
2  Aishwarya   20   Science          95
3   Priyanka   18      Math          70
4      Priya   17      Math          65
5    Shaurya   21   Science          78


Unnamed: 0,Name,Age,Subject,Percentage,Grade
0,Ankit,21,Math,88,A
1,Amit,19,Commerce,92,A
2,Aishwarya,20,Science,95,A
3,Priyanka,18,Math,70,B
4,Priya,17,Math,65,C
5,Shaurya,21,Science,78,B


In [152]:
# Back to airbnb data
# We can select neighborhood inside/associated with a specific borough (e.g. Manhattan)

print(airbnb.columns)
print(airbnb['boroughs'].unique())

['Brooklyn' 'Manhattan' 'Queens' 'Staten Island' 'Bronx']


In [163]:
airbnb_manhattan = airbnb[airbnb['boroughs'] == 'Manhattan'].copy()
airbnb_manhattan

Unnamed: 0,id,name,host_id,host_name,boroughs,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.94190,Private room,150,3,0,,0.00,1,365
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.10,1,0
5,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.97500,Entire home/apt,200,3,74,2019-06-22,0.59,1,129
7,5178,Large Furnished Room Near B'way,8967,Shunichi,Manhattan,Hell's Kitchen,40.76489,-73.98493,Private room,79,2,430,2019-06-24,3.47,1,220
8,5203,Cozy Clean Guest Room - Family Apt,7490,MaryEllen,Manhattan,Upper West Side,40.80178,-73.96723,Private room,79,2,118,2017-07-21,0.99,1,0
9,5238,Cute & Cozy Lower East Side 1 bdrm,7549,Ben,Manhattan,Chinatown,40.71344,-73.99037,Entire home/apt,150,1,160,2019-06-09,1.33,4,188
10,5295,Beautiful 1br on Upper West Side,7702,Lena,Manhattan,Upper West Side,40.80316,-73.96545,Entire home/apt,135,5,53,2019-06-22,0.43,1,6
11,5441,Central Manhattan/near Broadway,7989,Kate,Manhattan,Hell's Kitchen,40.76076,-73.98867,Private room,85,2,188,2019-06-23,1.50,1,39
13,6021,Wonderful Guest Bedroom in Manhattan for SINGLES,11528,Claudio,Manhattan,Upper West Side,40.79826,-73.96113,Private room,85,2,113,2019-07-05,0.91,1,333


In [164]:
# count number of airbnb records related to Manhattan
print(airbnb_manhattan.count())

id                                21661
name                              21652
host_id                           21661
host_name                         21652
boroughs                          21661
neighbourhood                     21661
latitude                          21661
longitude                         21661
room_type                         21661
price                             21661
minimum_nights                    21661
number_of_reviews                 21661
last_review                       16632
reviews_per_month                 21661
calculated_host_listings_count    21661
availability_365                  21661
dtype: int64


In [167]:
# Resetting Index
# We can see that the index is not sequenced correctly once we sliced the dataframe
# We need to reset the index
# reset_index by default does not modify the DataFrame; it returns a new DataFrame with the reset index. 
# If you want to modify the original, use the inplace argument
# Reference: https://stackoverflow.com/questions/27736267/pandas-reset-index-doesnt-seem-to-work

new_airbnb_manhattan = airbnb_manhattan.reset_index(drop = True)

# show the results
new_airbnb_manhattan

Unnamed: 0,id,name,host_id,host_name,boroughs,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
1,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.94190,Private room,150,3,0,,0.00,1,365
2,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.10,1,0
3,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.97500,Entire home/apt,200,3,74,2019-06-22,0.59,1,129
4,5178,Large Furnished Room Near B'way,8967,Shunichi,Manhattan,Hell's Kitchen,40.76489,-73.98493,Private room,79,2,430,2019-06-24,3.47,1,220
5,5203,Cozy Clean Guest Room - Family Apt,7490,MaryEllen,Manhattan,Upper West Side,40.80178,-73.96723,Private room,79,2,118,2017-07-21,0.99,1,0
6,5238,Cute & Cozy Lower East Side 1 bdrm,7549,Ben,Manhattan,Chinatown,40.71344,-73.99037,Entire home/apt,150,1,160,2019-06-09,1.33,4,188
7,5295,Beautiful 1br on Upper West Side,7702,Lena,Manhattan,Upper West Side,40.80316,-73.96545,Entire home/apt,135,5,53,2019-06-22,0.43,1,6
8,5441,Central Manhattan/near Broadway,7989,Kate,Manhattan,Hell's Kitchen,40.76076,-73.98867,Private room,85,2,188,2019-06-23,1.50,1,39
9,6021,Wonderful Guest Bedroom in Manhattan for SINGLES,11528,Claudio,Manhattan,Upper West Side,40.79826,-73.96113,Private room,85,2,113,2019-07-05,0.91,1,333


In [169]:
# current airbnb_manhattan
airbnb_manhattan

Unnamed: 0,id,name,host_id,host_name,boroughs,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.94190,Private room,150,3,0,,0.00,1,365
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.10,1,0
5,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.97500,Entire home/apt,200,3,74,2019-06-22,0.59,1,129
7,5178,Large Furnished Room Near B'way,8967,Shunichi,Manhattan,Hell's Kitchen,40.76489,-73.98493,Private room,79,2,430,2019-06-24,3.47,1,220
8,5203,Cozy Clean Guest Room - Family Apt,7490,MaryEllen,Manhattan,Upper West Side,40.80178,-73.96723,Private room,79,2,118,2017-07-21,0.99,1,0
9,5238,Cute & Cozy Lower East Side 1 bdrm,7549,Ben,Manhattan,Chinatown,40.71344,-73.99037,Entire home/apt,150,1,160,2019-06-09,1.33,4,188
10,5295,Beautiful 1br on Upper West Side,7702,Lena,Manhattan,Upper West Side,40.80316,-73.96545,Entire home/apt,135,5,53,2019-06-22,0.43,1,6
11,5441,Central Manhattan/near Broadway,7989,Kate,Manhattan,Hell's Kitchen,40.76076,-73.98867,Private room,85,2,188,2019-06-23,1.50,1,39
13,6021,Wonderful Guest Bedroom in Manhattan for SINGLES,11528,Claudio,Manhattan,Upper West Side,40.79826,-73.96113,Private room,85,2,113,2019-07-05,0.91,1,333


In [172]:
# Changing the index of airbnb_manhattan without creating a new dataframe
airbnb_manhattan.reset_index(drop = True, inplace = True)
airbnb_manhattan

Unnamed: 0,id,name,host_id,host_name,boroughs,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
1,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.94190,Private room,150,3,0,,0.00,1,365
2,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.10,1,0
3,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.97500,Entire home/apt,200,3,74,2019-06-22,0.59,1,129
4,5178,Large Furnished Room Near B'way,8967,Shunichi,Manhattan,Hell's Kitchen,40.76489,-73.98493,Private room,79,2,430,2019-06-24,3.47,1,220
5,5203,Cozy Clean Guest Room - Family Apt,7490,MaryEllen,Manhattan,Upper West Side,40.80178,-73.96723,Private room,79,2,118,2017-07-21,0.99,1,0
6,5238,Cute & Cozy Lower East Side 1 bdrm,7549,Ben,Manhattan,Chinatown,40.71344,-73.99037,Entire home/apt,150,1,160,2019-06-09,1.33,4,188
7,5295,Beautiful 1br on Upper West Side,7702,Lena,Manhattan,Upper West Side,40.80316,-73.96545,Entire home/apt,135,5,53,2019-06-22,0.43,1,6
8,5441,Central Manhattan/near Broadway,7989,Kate,Manhattan,Hell's Kitchen,40.76076,-73.98867,Private room,85,2,188,2019-06-23,1.50,1,39
9,6021,Wonderful Guest Bedroom in Manhattan for SINGLES,11528,Claudio,Manhattan,Upper West Side,40.79826,-73.96113,Private room,85,2,113,2019-07-05,0.91,1,333


In [171]:
airbnb.head(10)

Unnamed: 0,id,name,host_id,host_name,boroughs,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,0.0,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0
5,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,200,3,74,2019-06-22,0.59,1,129
6,5121,BlissArtsSpace!,7356,Garon,Brooklyn,Bedford-Stuyvesant,40.68688,-73.95596,Private room,60,45,49,2017-10-05,0.4,1,0
7,5178,Large Furnished Room Near B'way,8967,Shunichi,Manhattan,Hell's Kitchen,40.76489,-73.98493,Private room,79,2,430,2019-06-24,3.47,1,220
8,5203,Cozy Clean Guest Room - Family Apt,7490,MaryEllen,Manhattan,Upper West Side,40.80178,-73.96723,Private room,79,2,118,2017-07-21,0.99,1,0
9,5238,Cute & Cozy Lower East Side 1 bdrm,7549,Ben,Manhattan,Chinatown,40.71344,-73.99037,Entire home/apt,150,1,160,2019-06-09,1.33,4,188


In [173]:
print(airbnb_manhattan['neighbourhood'].unique())

['Midtown' 'Harlem' 'East Harlem' 'Murray Hill' "Hell's Kitchen"
 'Upper West Side' 'Chinatown' 'West Village' 'Chelsea' 'Inwood'
 'East Village' 'Lower East Side' 'Kips Bay' 'SoHo' 'Upper East Side'
 'Washington Heights' 'Financial District' 'Morningside Heights' 'NoHo'
 'Flatiron District' 'Roosevelt Island' 'Greenwich Village' 'Little Italy'
 'Two Bridges' 'Nolita' 'Gramercy' 'Theater District' 'Tribeca'
 'Battery Park City' 'Civic Center' 'Stuyvesant Town' 'Marble Hill']


In [174]:
# count number of unique values
print(airbnb_manhattan['neighbourhood'].nunique())

32


# Text Manipulation

In [186]:
# Reference: 
# https://kanoki.org/2019/11/12/how-to-use-regex-in-pandas/
# https://medium.com/factory-mind/regex-tutorial-a-simple-cheatsheet-by-examples-649dc1c3f285
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.replace.html
# https://ryanstutorials.net/regular-expressions-tutorial/regular-expressions-basics.php
# https://www.dataquest.io/blog/regular-expressions-data-scientists/
# https://www.geeksforgeeks.org/replace-values-in-pandas-dataframe-using-regex/
# In many cases, we have to replace some text to make it look appropriate

# Example

# Let's create a Dataframe 
event_df = pd.DataFrame({'City':['new York', 'Parague', 'New Delhi', 'Venice', 'new Orleans'], 
                    'Event':['Music', 'Poetry', 'Theatre', 'Comedy', 'Tech_Summit'], 
                    'Cost':[10000, 5000, 15000, 2000, 12000], 'Period':['02-2018','04-2018','06-2018','10-2018','12-2018']}) 
  
# Let's print the dataframe 
event_df

Unnamed: 0,City,Event,Cost,Period
0,new York,Music,10000,02-2018
1,Parague,Poetry,5000,04-2018
2,New Delhi,Theatre,15000,06-2018
3,Venice,Comedy,2000,10-2018
4,new Orleans,Tech_Summit,12000,12-2018


In [187]:
# We can see that the name of some cities are not correct.  We would like to change n to N.
# We can use regular expression (RegEx) as follows

event_df = event_df.replace(to_replace ='[nN]ew', value = 'New', regex = True)

# print the dataframe
event_df

Unnamed: 0,City,Event,Cost,Period
0,New York,Music,10000,02-2018
1,Parague,Poetry,5000,04-2018
2,New Delhi,Theatre,15000,06-2018
3,Venice,Comedy,2000,10-2018
4,New Orleans,Tech_Summit,12000,12-2018


In [188]:
# Reference: https://kanoki.org/2019/11/12/how-to-use-regex-in-pandas/
# Using RegEx match to select only data that contains 'New' in City

event_df_new_city = event_df[event_df['City'].str.match('^New*')==True]

# show result
event_df_new_city

# for more information, please study the references

Unnamed: 0,City,Event,Cost,Period
0,New York,Music,10000,02-2018
2,New Delhi,Theatre,15000,06-2018
4,New Orleans,Tech_Summit,12000,12-2018


# Basic Statistics

In [176]:
airbnb_manhattan_average_price = airbnb_manhattan['price'].mean()
print(airbnb_manhattan_average_price)

196.8758136743456


In [181]:
airbnb_average_reviews_per_month = airbnb['reviews_per_month'].mean()
print(airbnb_average_reviews_per_month)

1.0909099089886491


In [183]:
airbnb_max_reviews_per_month = airbnb['reviews_per_month'].max()
print(airbnb_max_reviews_per_month)

58.5


In [184]:
airbnb_min_reviews_per_month = airbnb['reviews_per_month'].min()
print(airbnb_min_reviews_per_month)

0.0


# Writing the result in a csv file

In [189]:
file_name = 'airbnb_manhattan.csv'
airbnb_manhattan.to_csv(file_name, sep=',', encoding='utf-8', index = False)