In [1]:
# import libraries

import pandas as pd
import locale

In [2]:
# read in csv and show data frame
listings_path = 'Resources-ben/Listings_Cleaned_Sample.csv'
extract_df = pd.read_csv(listings_path)
prices_df = extract_df[['price','review_scores_value','number_of_reviews']]
prices_df.info()
prices_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   price                10000 non-null  object 
 1   review_scores_value  7589 non-null   float64
 2   number_of_reviews    10000 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 234.5+ KB


  extract_df = pd.read_csv(listings_path)


Unnamed: 0,price,review_scores_value,number_of_reviews
0,$25.00,,1
1,$115.00,10.0,6
2,$135.00,10.0,1
3,$69.00,9.0,14
4,$130.00,9.0,22


In [3]:
# remove $ symbol from price column and then update data type for further transformation
prices_df['price'] = prices_df['price'].str.replace('$','')
prices_df['price'] = prices_df['price'].replace(',','',regex=True)
prices_df['price'] = prices_df['price'].astype(float)
prices_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   price                10000 non-null  float64
 1   review_scores_value  7589 non-null   float64
 2   number_of_reviews    10000 non-null  int64  
dtypes: float64(2), int64(1)
memory usage: 234.5 KB


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prices_df['price'] = prices_df['price'].str.replace('$','')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prices_df['price'] = prices_df['price'].replace(',','',regex=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prices_df['price'] = prices_df['price'].astype(float)


In [4]:
# aggregate tracked columns to show the average rating of each price
# sum the number of reviews for each price
clean_prices = prices_df.groupby('price').agg({'review_scores_value':['mean'], 
                                                'number_of_reviews':['sum']}).reset_index()
# join the sum and mean so that the columns are one dimensional
clean_prices.columns = clean_prices.columns.map('_'.join)
clean_prices

Unnamed: 0,price_,review_scores_value_mean,number_of_reviews_sum
0,10.0,,0
1,12.0,,0
2,15.0,9.250000,272
3,16.0,8.666667,21
4,17.0,10.000000,13
...,...,...,...
489,5000.0,10.000000,6
490,5500.0,,0
491,6600.0,,0
492,7500.0,10.000000,2


In [5]:
clean_prices = clean_prices.rename(columns={'price_':'price',
                             'review_scores_value_mean':'avg_review_per_price'})

In [6]:
# create a new column that identifies if the price is groups cheap, affordable, expensive, or extravagent
# create empty price rating list
affordability = []

# if the value fits the condition then append to list
for i in clean_prices['price']:
    if i <= 50.0:
        affordability.append('Cheap $0-49')
    elif i > 50.0 and i <= 200.0:
        affordability.append('affordable $51-200')
    elif i > 200.0 and i <= 500.0:
        affordability.append('expensive $200-500')
    elif i > 500.0:
        affordability.append('very expensive $500+')
    else:
        affordability.append('NA')
    
len(affordability)


494

In [7]:
# add affordability column to prices data frame
prices_2 = clean_prices.assign(affordability=affordability)
prices_2.head()

Unnamed: 0,price,avg_review_per_price,number_of_reviews_sum,affordability
0,10.0,,0,Cheap $0-49
1,12.0,,0,Cheap $0-49
2,15.0,9.25,272,Cheap $0-49
3,16.0,8.666667,21,Cheap $0-49
4,17.0,10.0,13,Cheap $0-49


In [8]:
# aggregate the affordability and the average rating and total ratings
# this is an average of average with regards to the sum of reviews per grouping NOT the number of reviews
# per price point... bring this up in office hours please
afford_rating_df = prices_2.groupby('affordability').agg({'avg_review_per_price':['mean'], 
                                                'number_of_reviews_sum':['sum']}).reset_index()
afford_rating_df.columns = afford_rating_df.columns.map('_'.join)

afford_rating_df

Unnamed: 0,affordability_,avg_review_per_price_mean,number_of_reviews_sum_sum
0,Cheap $0-49,9.260476,19591
1,affordable $51-200,9.506454,166320
2,expensive $200-500,9.457092,19648
3,very expensive $500+,9.537335,2426


In [9]:
afford_rating_df = afford_rating_df.rename(columns={'affordability_':'affordability',
                                'avg_price_review_mean':'affordability_reivew_mean',
                                 'number_of_reviews_sum_sum':'total_reviews_per_affordability'
                                })
afford_rating_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   affordability                    4 non-null      object 
 1   avg_review_per_price_mean        4 non-null      float64
 2   total_reviews_per_affordability  4 non-null      int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 224.0+ bytes


In [10]:
# export to CSV
afford_rating_df.to_csv('data/affordability_vs_rating.csv', index=False)


In [11]:
# use locale module to reset prices column to match original dataframe
locale.setlocale(locale.LC_ALL,'')
locale.getlocale()
# locale.currency(10.0, grouping=True)
# for loop to convert prices
formatted_price = []
for i in prices_2['price']:
    i = locale.currency(i, grouping=True )
    formatted_price.append(i)
    
prices_2['price'] = formatted_price
prices_2['price'] = prices_2['price'].astype(object)
prices_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 494 entries, 0 to 493
Data columns (total 4 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   price                  494 non-null    object 
 1   avg_review_per_price   419 non-null    float64
 2   number_of_reviews_sum  494 non-null    int64  
 3   affordability          494 non-null    object 
dtypes: float64(1), int64(1), object(2)
memory usage: 15.6+ KB


In [12]:
# export to CSV
prices_2.to_csv('data/pricing_vs_reviews.csv', index=False)


In [13]:
# merge prices_2 df with original data on price and keep the affordabiltiy column
new_df = pd.merge(extract_df, prices_2, how='left', on='price')
# new_df.info()

In [14]:
new_df = new_df.drop(columns=['avg_review_per_price','number_of_reviews_sum','Unnamed: 0'])
# new_df.info()

In [15]:
# drop columns that we will not use as a group
# new_df = new_df.drop(columns=['notes', 'transit','space','description', 'access', 'interaction', 'neighbourhood', 'market', 
#                               'country_code', 'country', 'weekly_price', 'monthly_price', 'instant_bookable', 
#                               'cancellation_policy', 'require_guest_profile_picture', 
#                               'require_guest_phone_verification','extra people','maximum_nights','minimum_nights'])
# new_df
# space string
# description string
# notes string
# transit string
# access string
# interaction string
# house_rules string
# street string
# smart_location string
# square_feet int
# weekly_price int
# monthly_price int
# security_deposit int
# cleaning_fee int
# guests_included int
# extra_people int
# minimum_nights int
# maximum_nights int
# instant_bookable string

In [16]:
new_df = new_df[['id','host_id','price','affordability']]
new_df

Unnamed: 0,id,host_id,price,affordability
0,15461810,9464908,$25.00,Cheap $0-49
1,13416156,48449293,$115.00,affordable $51-200
2,9136734,45839570,$135.00,affordable $51-200
3,3378135,171168,$69.00,affordable $51-200
4,4403665,8258320,$130.00,affordable $51-200
...,...,...,...,...
9995,6004509,1283043,$118.00,affordable $51-200
9996,14844618,11294711,$150.00,affordable $51-200
9997,4296506,22304742,$49.00,Cheap $0-49
9998,7572083,39698521,$140.00,affordable $51-200


In [17]:
# export to CSV

new_df.to_csv('data/final_listings_cleaned.csv',index=False)

In [18]:
listings_df = pd.read_csv('data/final_listings_cleaned.csv')
listings_df

Unnamed: 0,id,host_id,price,affordability
0,15461810,9464908,$25.00,Cheap $0-49
1,13416156,48449293,$115.00,affordable $51-200
2,9136734,45839570,$135.00,affordable $51-200
3,3378135,171168,$69.00,affordable $51-200
4,4403665,8258320,$130.00,affordable $51-200
...,...,...,...,...
9995,6004509,1283043,$118.00,affordable $51-200
9996,14844618,11294711,$150.00,affordable $51-200
9997,4296506,22304742,$49.00,Cheap $0-49
9998,7572083,39698521,$140.00,affordable $51-200


In [23]:
reviews = pd.read_csv('data/Reviews.csv')
reviews = reviews.drop(columns=['Unnamed: 0'])

KeyError: "['Unnamed: 0'] not found in axis"

In [22]:
reviews.to_csv('data/Reviews.csv', index=False)

In [24]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   host_id                      10000 non-null  int64  
 1   number_of_reviews            10000 non-null  int64  
 2   first_review                 7709 non-null   object 
 3   last_review                  7713 non-null   object 
 4   review_scores_rating         7610 non-null   float64
 5   review_scores_accuracy       7599 non-null   float64
 6   review_scores_cleanliness    7597 non-null   float64
 7   review_scores_checkin        7592 non-null   float64
 8   review_scores_communication  7600 non-null   float64
 9   review_scores_location       7593 non-null   float64
 10  review_scores_value          7589 non-null   float64
 11  reviews_per_month            7709 non-null   float64
dtypes: float64(8), int64(2), object(2)
memory usage: 937.6+ KB


In [50]:
listings = pd.read_csv('data/listing_location_details.csv')
# listings = listings.drop(columns=['state'])
wack = listings.loc[listings['listing_id']==12523662,:]
wack

Unnamed: 0,listing_id,country,country_code,city,zipcode,neighborhood,latitude,longitude,market,is_location_exact,property_type,name,summary,neighborhood_overview,host_id
3301,12523662,United States,US,Rosemead,91770,South San Gabriel,34.047083,-118.091747,Los Angeles,f,House,Uphill suite / own parking 50% (B),50% off for monthly discount Room (B) 50% o...,"' Our neighborhood is much quiet, less helicop...",67670094


In [48]:
listings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   listing_id             10000 non-null  int64  
 1   country                10000 non-null  object 
 2   country_code           10000 non-null  object 
 3   city                   9997 non-null   object 
 4   zipcode                9880 non-null   object 
 5   neighborhood           10000 non-null  object 
 6   latitude               10000 non-null  float64
 7   longitude              10000 non-null  float64
 8   market                 9969 non-null   object 
 9   is_location_exact      10000 non-null  object 
 10  property_type          10000 non-null  object 
 11  name                   9998 non-null   object 
 12  summary                9748 non-null   object 
 13  neighborhood_overview  5935 non-null   object 
 14  host_id                10000 non-null  int64  
dtypes: 

In [38]:
listings.to_csv('data/listing_location_details.csv', index=False)

In [33]:
rooms = pd.read_csv('data/housing_details_data_cleaned.csv')
rooms = rooms.drop(columns=['Unnamed: 0'])
rooms.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      10000 non-null  int64  
 1   host_id                 10000 non-null  int64  
 2   neighbourhood_cleansed  10000 non-null  object 
 3   property_type           10000 non-null  object 
 4   room_type               10000 non-null  object 
 5   accommodates            10000 non-null  int64  
 6   bathrooms               9970 non-null   float64
 7   bedrooms                9987 non-null   float64
 8   beds                    9980 non-null   float64
 9   bed_type                10000 non-null  object 
 10  amenities               10000 non-null  object 
dtypes: float64(3), int64(3), object(5)
memory usage: 859.5+ KB


In [36]:
rooms.to_csv('data/housing_details_data_cleaned.csv', index=False)

