In [80]:
# importing libaries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# import seaborn as sns
from datetime import datetime

In [81]:
ls

01_airbnb_exploratory_data_analysis.ipynb


## Load Datasets

In [82]:
# load csv
df_1 = pd.read_csv('../data/airnb.csv')
df_2 = pd.read_csv('../data/airnb_desert.csv', encoding="cp1252")
df_3 = pd.read_csv('../data/airnb_luxe.csv', encoding="cp1252")

## Inspecting Dataset

In [83]:
print(f"1. airnb.csv columns: {list(df_1.columns)}")
print(f"2. airnb_desert.csv columns: {list(df_2.columns)}")
print(f"3. airnb_luxe.csv columns: {list(df_3.columns)}")

1. airnb.csv columns: ['Title', 'Detail', 'Date', 'Price(in dollar)', 'Offer price(in dollar)', 'Review and rating', 'Number of bed']
2. airnb_desert.csv columns: ['Desert name', 'Date', 'Price(In dollar)', 'Details', 'Rating']
3. airnb_luxe.csv columns: ['Luxe name', 'Date', 'Price(In dollar)', 'Distance']


In [84]:
print(f"1. airnb.csv size: rows {df_1.shape[0]} and {df_1.shape[1]} columns")
print(f"2. airnb_desert.csv size: rows {df_2.shape[0]} and {df_2.shape[1]} columns")
print(f"3. airnb_luxe.csv size: rows {df_3.shape[0]} and {df_3.shape[1]} columns")

1. airnb.csv size: rows 953 and 7 columns
2. airnb_desert.csv size: rows 280 and 5 columns
3. airnb_luxe.csv size: rows 280 and 4 columns


In [85]:
df_1.dtypes

Title                     object
Detail                    object
Date                      object
Price(in dollar)          object
Offer price(in dollar)    object
Review and rating         object
Number of bed             object
dtype: object

In [86]:
df_2.dtypes

Desert name         object
Date                object
Price(In dollar)    object
Details             object
Rating              object
dtype: object

In [87]:
df_3.dtypes

Luxe name           object
Date                object
Price(In dollar)    object
Distance            object
dtype: object

## Handeling Null Values

In [88]:
# create a function to find columns with nulls
def null_columns(dataframe):
    has_null_list = []
    for i in range(len(dataframe.columns)):
        is_null = bool(dataframe[dataframe.columns[i]].isnull().any()) # checking to see if column has any null values
        if is_null == True:
            column_name = dataframe.columns[i]
            has_null_list.append(column_name)
    return has_null_list

In [89]:
# Apply function and print out results
first_function = null_columns(df_1)
second_function = null_columns(df_2)
third_function = null_columns(df_3)
print(first_function)
print(second_function)
print(third_function)

['Offer price(in dollar)']
[]
[]


In [90]:
# Sizes of csv files
df1_shape = df_1.shape[0]
df2_shape = df_2.shape[0]

In [91]:
# Size of null values
null_shape_1 = df_1[df_1['Offer price(in dollar)'].isnull()].shape[0]
null_shape_2 = df_1[df_1['Review and rating'].isnull()].shape[0]
null_shape_2_1 = df_2[df_2['Rating'].isnull()].shape[0]

In [92]:
# Print Message
print(f" Null values in the first csv file are {round(null_shape_1/df1_shape,2)*100}% of the total file with null being in '{first_function[0]}' column")
# print(f" Null values in the first csv file are {round(null_shape_2/df2_shape,2)*100}% of the total file with null being in '{first_function[1]}' column")
# print(f" Null values in the second csv file are {round(null_shape_2_1/df2_shape,2)*100}% of the total file with null being in '{second_function[0]}' column")

 Null values in the first csv file are 83.0% of the total file with null being in 'Offer price(in dollar)' column


## Begin Cleaning First File

In [93]:
# Drop null values
df_clean = df_1.dropna(subset = ['Offer price(in dollar)']).copy()

In [94]:
df_clean['Offer price(in dollar)'] = df_clean['Offer price(in dollar)'].apply(lambda x: str(x).replace(",","")).astype(float)

In [95]:
df_clean['Price(in dollar)'] = df_clean['Price(in dollar)'].apply(lambda x: str(x).replace(",","")).astype(float)

In [96]:
df_clean['Title'] = df_clean['Title'].astype(str)

In [97]:
df_clean.head()

Unnamed: 0,Title,Detail,Date,Price(in dollar),Offer price(in dollar),Review and rating,Number of bed
0,"Chalet in Skykomish, Washington, US",Sky Haus - A-Frame Cabin,Jun 11 - 16,306.0,229.0,4.85 (531),4 beds
1,"Cabin in Hancock, New York, US",The Catskill A-Frame - Mid-Century Modern Cabin,Jun 6 - 11,485.0,170.0,4.77 (146),4 beds
2,"Cabin in West Farmington, Ohio, US",The Triangle: A-Frame Cabin for your city retreat,Jul 9 - 14,119.0,522.0,4.91 (515),4 beds
3,"Home in Blue Ridge, Georgia, US",*Summer Sizzle* 5 Min to Blue Ridge* Pets* Hot...,Jun 11 - 16,192.0,348.0,4.94 (88),5 beds
4,"Treehouse in Grandview, Texas, US",Luxury Treehouse Couples Getaway w/ Peaceful V...,Jun 4 - 9,232.0,196.0,4.99 (222),1 queen bed


In [98]:
df_clean.dtypes

Title                      object
Detail                     object
Date                       object
Price(in dollar)          float64
Offer price(in dollar)    float64
Review and rating          object
Number of bed              object
dtype: object

In [99]:
df_clean.groupby("Date")["Date"].count()

Date
Aug 1 - 6         1
Aug 14 - 19       1
Aug 18 - 23       1
Aug 19 - 24       1
Aug 20 - 25       3
                 ..
Sep 2 - 7         1
Sep 26 - Oct 1    1
Sep 29 - Oct 4    1
Sep 3 - 10        1
Sep 5 - 10        1
Name: Date, Length: 72, dtype: int64

In [100]:
df_clean[['Start Date','End Date']] = df_clean["Date"].str.split(' - ',expand = True)

In [101]:
# df_clean = df_clean.drop(['Date Range 1','Date Range 2'],axis =1)

In [102]:
df_clean

Unnamed: 0,Title,Detail,Date,Price(in dollar),Offer price(in dollar),Review and rating,Number of bed,Start Date,End Date
0,"Chalet in Skykomish, Washington, US",Sky Haus - A-Frame Cabin,Jun 11 - 16,306.0,229.0,4.85 (531),4 beds,Jun 11,16
1,"Cabin in Hancock, New York, US",The Catskill A-Frame - Mid-Century Modern Cabin,Jun 6 - 11,485.0,170.0,4.77 (146),4 beds,Jun 6,11
2,"Cabin in West Farmington, Ohio, US",The Triangle: A-Frame Cabin for your city retreat,Jul 9 - 14,119.0,522.0,4.91 (515),4 beds,Jul 9,14
3,"Home in Blue Ridge, Georgia, US",*Summer Sizzle* 5 Min to Blue Ridge* Pets* Hot...,Jun 11 - 16,192.0,348.0,4.94 (88),5 beds,Jun 11,16
4,"Treehouse in Grandview, Texas, US",Luxury Treehouse Couples Getaway w/ Peaceful V...,Jun 4 - 9,232.0,196.0,4.99 (222),1 queen bed,Jun 4,9
...,...,...,...,...,...,...,...,...,...
828,Apartment in Houston,King Suite/Sky Lounge/Luxurious Experience.,May 1 - 6,108.0,148.0,4.85 (55),2 beds,May 1,6
829,Cabin in Sevierville,Few days left in May,May 1 - 6,286.0,112.0,4.98 (263),1 king bed,May 1,6
830,Cabin in Fountain Green,"Cozy Private Mountain Cabin on 1,000 acre prop...",May 1 - 6,186.0,103.0,4.96 (78),4 beds,May 1,6
882,Dome in El Prado,Geodesic Earth Dome,May 1 - 6,91.0,161.0,4.9 (953),2 beds,May 1,6


In [103]:
df_clean["Start Month"] = df_clean["Start Date"].str.split(" ",expand = True)[0]

In [104]:
df_clean.groupby(["Start Month"]).count()

Unnamed: 0_level_0,Title,Detail,Date,Price(in dollar),Offer price(in dollar),Review and rating,Number of bed,Start Date,End Date
Start Month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Aug,12,12,12,12,12,12,12,12,12
Jan,1,1,1,1,1,1,1,1,1
Jul,25,25,25,25,25,25,25,25,25
Jun,101,101,101,101,101,101,101,101,101
May,12,12,12,12,12,12,12,12,12
Nov,1,1,1,1,1,1,1,1,1
Oct,4,4,4,4,4,4,4,4,4
Sep,10,10,10,10,10,10,10,10,10


In [105]:
# create month dictionary 
month_dict = {
    'Jan': 'January',
    'Feb': '',
    'Mar': '',
    'Apr': '',
    'May': 'May',
    'Jun': 'June',
    'Jul': 'July',
    'Aug': 'August',
    'Sep': 'September',
    'Oct': 'October',
    'Nov': 'November',
    'Dec': 'December',
}

In [106]:
df_clean["Start Month"] =df_clean["Start Month"].map(month_dict)

In [107]:
# df_clean = df_clean.drop(['Month'],axis =1)

In [108]:
df_clean["End Month"] = df_clean['End Date'].apply(lambda x: str(x).split(" ")[0] if str(x).split(" ")[0] in ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'] else '')

In [109]:
df_clean["End Month"] = df_clean["End Month"].map(month_dict)

In [110]:
df_clean['End Month'] = df_clean['End Month'].fillna(df_clean['Start Month'])

In [111]:
df_clean['Clean End Date'] = df_clean['End Date'].apply(lambda x: str(x).split(" ")[1] if str(x).split(" ")[0] in [['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']] else str(x).split(" ")[0])

In [112]:
df_clean['End Date'] = df_clean['Clean End Date']
df_clean['Start Date'] = df_clean['Start Date'].apply(lambda x: str(x).split(" ")[1])

In [113]:
df_clean = df_clean.drop(['Clean End Date'], axis = 1)

In [114]:
df_clean['Date Range'] = df_clean['Date']

In [115]:
df_clean.columns

Index(['Title', 'Detail', 'Date', 'Price(in dollar)', 'Offer price(in dollar)',
       'Review and rating', 'Number of bed', 'Start Date', 'End Date',
       'Start Month', 'End Month', 'Date Range'],
      dtype='object')

In [116]:
df_clean = df_clean.drop(['Date'], axis = 1)

In [117]:
df_clean.columns
new_column_order = ['Title', 'Detail', 'Date Range','Start Month','Start Date','End Month','End Date','Price(in dollar)', 'Offer price(in dollar)','Review and rating', 'Number of bed']

In [118]:
df_clean = df_clean[new_column_order]

In [119]:
df_clean.dtypes

Title                      object
Detail                     object
Date Range                 object
Start Month                object
Start Date                 object
End Month                  object
End Date                   object
Price(in dollar)          float64
Offer price(in dollar)    float64
Review and rating          object
Number of bed              object
dtype: object

## Begin Cleaning Second File

In [120]:
df_2.head()

Unnamed: 0,Desert name,Date,Price(In dollar),Details,Rating
0,"Mhamid, Morocco",May 1 – 29,479.0,Near Sahara Desert,4.79
1,"Aqaba City, Jordan",May 1 – 29,2168.0,Near Arabian Desert,4.92
2,"Tamesluht, Morocco",May 1 – 29,17752.0,"9,404 kilometers away",4.95
3,"Al Bairat, Egypt",May 1 – 29,1982.0,Near Sahara Desert,4.88
4,"Tamesluht, Morocco",May 1 – 29,17752.0,"9,404 kilometers away",4.87


In [121]:
df_2.dtypes

Desert name         object
Date                object
Price(In dollar)    object
Details             object
Rating              object
dtype: object

In [122]:
df_2['Date Range'] = df_2['Date']

In [123]:
df_2['Price(In dollar)'] = df_2['Price(In dollar)'].apply(lambda x: str(x).replace(',',''))

In [124]:
df_2['Price(In dollar)'] = df_2['Price(In dollar)'].astype(float)

In [125]:
df_2.dtypes

Desert name          object
Date                 object
Price(In dollar)    float64
Details              object
Rating               object
Date Range           object
dtype: object

In [126]:
df_2['Start Month'] = df_2['Date'].str.split(" ",expand = True)[0]
df_2['Start Date'] = df_2['Date'].str.split(" ",expand = True)[1]

In [127]:
df_2.head()

Unnamed: 0,Desert name,Date,Price(In dollar),Details,Rating,Date Range,Start Month,Start Date
0,"Mhamid, Morocco",May 1 – 29,479.0,Near Sahara Desert,4.79,May 1 – 29,May,1
1,"Aqaba City, Jordan",May 1 – 29,2168.0,Near Arabian Desert,4.92,May 1 – 29,May,1
2,"Tamesluht, Morocco",May 1 – 29,17752.0,"9,404 kilometers away",4.95,May 1 – 29,May,1
3,"Al Bairat, Egypt",May 1 – 29,1982.0,Near Sahara Desert,4.88,May 1 – 29,May,1
4,"Tamesluht, Morocco",May 1 – 29,17752.0,"9,404 kilometers away",4.87,May 1 – 29,May,1


In [128]:
df_2.groupby(['Date Range']).count()

Unnamed: 0_level_0,Desert name,Date,Price(In dollar),Details,Rating,Start Month,Start Date
Date Range,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
May 1 – 29,277,277,277,277,277,277,277
May 19 – Jun 16,1,1,1,1,1,1,1
May 5 – Jun 2,2,2,2,2,2,2,2


In [129]:
# df_2['End Date Range'] = 
df_2['End Date Range'] = df_2['Date'].str.split(" – ")

In [130]:
# df_2.head()
df_2[['Start Date','End Date']] = df_2["Date"].str.split(" – ",expand = True)

In [131]:
df_2['End Month'] = df_2['End Date'].apply(lambda x: str(x).split(" ")[0] if str(x).split(" ")[0] in ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'] else '')

In [132]:
df_2['End Month'] = df_2['End Month'].fillna(df_2['Start Month'])

In [133]:
df_2.head()

Unnamed: 0,Desert name,Date,Price(In dollar),Details,Rating,Date Range,Start Month,Start Date,End Date Range,End Date,End Month
0,"Mhamid, Morocco",May 1 – 29,479.0,Near Sahara Desert,4.79,May 1 – 29,May,May 1,"[May 1, 29]",29,
1,"Aqaba City, Jordan",May 1 – 29,2168.0,Near Arabian Desert,4.92,May 1 – 29,May,May 1,"[May 1, 29]",29,
2,"Tamesluht, Morocco",May 1 – 29,17752.0,"9,404 kilometers away",4.95,May 1 – 29,May,May 1,"[May 1, 29]",29,
3,"Al Bairat, Egypt",May 1 – 29,1982.0,Near Sahara Desert,4.88,May 1 – 29,May,May 1,"[May 1, 29]",29,
4,"Tamesluht, Morocco",May 1 – 29,17752.0,"9,404 kilometers away",4.87,May 1 – 29,May,May 1,"[May 1, 29]",29,


In [134]:
df_2 = df_2.drop(['End Date Range'], axis = 1)

In [136]:
df_2['End Month'] = df_2['End Month'].fillna(df_clean['Start Month'])

In [147]:
# df_2.groupby(['End Month']).count().reset_index()[['End Month']].dtypes
df_2.groupby(['End Month']).count().reset_index()

Unnamed: 0,End Month,Desert name,Date,Price(In dollar),Details,Rating,Date Range,Start Month,Start Date,End Date
0,,277,277,277,277,277,277,277,277,277
1,Jun,3,3,3,3,3,3,3,3,3


In [149]:
df_2['End Month'] = df_2['End Month'].map(month_dict)

In [150]:
df_2.head()

Unnamed: 0,Desert name,Date,Price(In dollar),Details,Rating,Date Range,Start Month,Start Date,End Date,End Month
0,"Mhamid, Morocco",May 1 – 29,479.0,Near Sahara Desert,4.79,May 1 – 29,May,May 1,29,
1,"Aqaba City, Jordan",May 1 – 29,2168.0,Near Arabian Desert,4.92,May 1 – 29,May,May 1,29,
2,"Tamesluht, Morocco",May 1 – 29,17752.0,"9,404 kilometers away",4.95,May 1 – 29,May,May 1,29,
3,"Al Bairat, Egypt",May 1 – 29,1982.0,Near Sahara Desert,4.88,May 1 – 29,May,May 1,29,
4,"Tamesluht, Morocco",May 1 – 29,17752.0,"9,404 kilometers away",4.87,May 1 – 29,May,May 1,29,


In [151]:
df_2['End Month'] = df_2['End Month'].fillna(df_2['Start Month'])

In [153]:
df_2 = df_2.drop(['Date'],axis = 1)

In [155]:
df_2.dtypes

Desert name          object
Price(In dollar)    float64
Details              object
Rating               object
Date Range           object
Start Month          object
Start Date           object
End Date             object
End Month            object
dtype: object

In [160]:
df_2.groupby(['Rating']).count().reset_index()[['Rating']]

Unnamed: 0,Rating
0,3.5
1,3.67
2,4.0
3,4.11
4,4.15
5,4.17
6,4.2
7,4.23
8,4.25
9,4.3


In [162]:
df_2_replaced = df_2.replace({None: np.nan})

In [163]:
df_2_replaced.head()

Unnamed: 0,Desert name,Price(In dollar),Details,Rating,Date Range,Start Month,Start Date,End Date,End Month
0,"Mhamid, Morocco",479.0,Near Sahara Desert,4.79,May 1 – 29,May,May 1,29,May
1,"Aqaba City, Jordan",2168.0,Near Arabian Desert,4.92,May 1 – 29,May,May 1,29,May
2,"Tamesluht, Morocco",17752.0,"9,404 kilometers away",4.95,May 1 – 29,May,May 1,29,May
3,"Al Bairat, Egypt",1982.0,Near Sahara Desert,4.88,May 1 – 29,May,May 1,29,May
4,"Tamesluht, Morocco",17752.0,"9,404 kilometers away",4.87,May 1 – 29,May,May 1,29,May


In [182]:
# df_2_replaced['Rating'] = df_2_replaced['Rating'].astype(float)

In [175]:
df_2_replaced['Rating'] = df_2_replaced['Rating'].apply(lambda x: np.nan if str(x) == 'None' else x)

In [178]:
df_2_replaced['Rating'] = df_2_replaced['Rating'].astype(float)

In [180]:
df_2_replaced.dtypes

Desert name          object
Price(In dollar)    float64
Details              object
Rating              float64
Date Range           object
Start Month          object
Start Date           object
End Date             object
End Month            object
dtype: object

In [185]:
df_2_replaced['Start Date'] = df_2_replaced['Start Date'].str.split(' ' ,expand = True)[1].head()

In [187]:
df_2_replaced.columns

Index(['Desert name', 'Price(In dollar)', 'Details', 'Rating', 'Date Range',
       'Start Month', 'Start Date', 'End Date', 'End Month'],
      dtype='object')

In [188]:
column_order = ['Desert name','Details','Rating','Price(In dollar)','Date Range','Start Month','Start Date','End Month','End Date']

In [190]:
df_2 = df_2_replaced[column_order]

In [195]:
df_2[df_2.duplicated()]

Unnamed: 0,Desert name,Details,Rating,Price(In dollar),Date Range,Start Month,Start Date,End Month,End Date
184,"Essaouira, Morocco","9,536 kilometers away",,5386.0,May 1 – 29,May,,May,29
200,"Essaouira, Morocco","9,536 kilometers away",,5386.0,May 1 – 29,May,,May,29
217,"Essaouira, Morocco","9,536 kilometers away",,5386.0,May 1 – 29,May,,May,29
266,"Tarajalejo, Spain","10,083 kilometers away",,1898.0,May 1 – 29,May,,May,29


In [207]:
df_2 = df_2.drop_duplicates()
    # subset = None, keep = 'first', inplace = True, ignore_index = False)

In [210]:
df_2

Unnamed: 0,Desert name,Details,Rating,Price(In dollar),Date Range,Start Month,Start Date,End Month,End Date
0,"Mhamid, Morocco",Near Sahara Desert,4.79,479.0,May 1 – 29,May,1,May,29
1,"Aqaba City, Jordan",Near Arabian Desert,4.92,2168.0,May 1 – 29,May,1,May,29
2,"Tamesluht, Morocco","9,404 kilometers away",4.95,17752.0,May 1 – 29,May,1,May,29
3,"Al Bairat, Egypt",Near Sahara Desert,4.88,1982.0,May 1 – 29,May,1,May,29
4,"Tamesluht, Morocco","9,404 kilometers away",4.87,17752.0,May 1 – 29,May,1,May,29
...,...,...,...,...,...,...,...,...,...
275,"Haría Lanzarote, Spain","9,981 kilometers away",,5117.0,May 1 – 29,May,,May,29
276,"Tefia , Spain","10,058 kilometers away",,2326.0,May 1 – 29,May,,May,29
277,"El Pozo de los Frailes, Spain","8,668 kilometers away",,2994.0,May 1 – 29,May,,May,29
278,"Tarajalejo, Spain","10,083 kilometers away",,1616.0,May 1 – 29,May,,May,29


## Begin Cleaning 3rd File

In [249]:
df_3.head()

Unnamed: 0,Luxe name,Date,Price(In dollar),Distance
0,"Koh Samui, Thailand",May 1 – 29,89600.0,"1,880 kilometers away"
1,"Koh Samui, Thailand",May 1 – 29,78459.0,"1,880 kilometers away"
2,"Koh Samui, Thailand",May 1 – 29,53200.0,"1,881 kilometers away"
3,"Koh Samui, Thailand",May 1 – 29,35000.0,"1,880 kilometers away"
4,"Nathon, Thailand",May 1 – 29,19656.0,"1,872 kilometers away"


In [250]:
df_3.duplicated().sum()

17

In [251]:
df_3[df_3.duplicated()].sort_values(by = 'Luxe name')

Unnamed: 0,Luxe name,Date,Price(In dollar),Distance
16,"Koh Samui, Thailand",May 1 – 29,42199.0,"1,880 kilometers away"
49,"Koh Samui, Thailand",May 1 – 29,25970.0,"1,880 kilometers away"
48,"Koh Samui, Thailand",May 1 – 29,25970.0,"1,880 kilometers away"
47,"Koh Samui, Thailand",May 1 – 29,25970.0,"1,880 kilometers away"
45,"Koh Samui, Thailand",May 1 – 29,36848.0,"1,880 kilometers away"
41,"Koh Samui, Thailand",May 1 – 29,25970.0,"1,880 kilometers away"
46,"Koh Samui, Thailand",May 1 – 29,25970.0,"1,880 kilometers away"
34,"Koh Samui, Thailand",May 1 – 29,42199.0,"1,880 kilometers away"
29,"Koh Samui, Thailand",May 1 – 29,42199.0,"1,880 kilometers away"
27,"Koh Samui, Thailand",May 1 – 29,49804.0,"1,880 kilometers away"


In [252]:
df_3_no_dupes = df_3.drop_duplicates()

In [253]:
df_3_no_dupes['Start Date'] = df_3_no_dupes['Date'].str.split(' – ', expand = True)[0]
df_3_no_dupes['End Date'] = df_3_no_dupes['Date'].str.split(' – ', expand = True)[1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_3_no_dupes['Start Date'] = df_3_no_dupes['Date'].str.split(' – ', expand = True)[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_3_no_dupes['End Date'] = df_3_no_dupes['Date'].str.split(' – ', expand = True)[1]


In [254]:
df_3_no_dupes['Start Month'] = df_3_no_dupes['Start Date'].str.split(' ', expand = True)[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_3_no_dupes['Start Month'] = df_3_no_dupes['Start Date'].str.split(' ', expand = True)[0]


In [255]:
df_3_no_dupes

Unnamed: 0,Luxe name,Date,Price(In dollar),Distance,Start Date,End Date,Start Month
0,"Koh Samui, Thailand",May 1 – 29,89600.00,"1,880 kilometers away",May 1,29,May
1,"Koh Samui, Thailand",May 1 – 29,78459.00,"1,880 kilometers away",May 1,29,May
2,"Koh Samui, Thailand",May 1 – 29,53200.00,"1,881 kilometers away",May 1,29,May
3,"Koh Samui, Thailand",May 1 – 29,35000.00,"1,880 kilometers away",May 1,29,May
4,"Nathon, Thailand",May 1 – 29,19656.00,"1,872 kilometers away",May 1,29,May
...,...,...,...,...,...,...,...
275,"Doni Štoj, Montenegro",May 1 – 29,25535.00,"6,750 kilometers away",May 1,29,May
276,"Unawatuna, Sri Lanka",May 1 – 29,50400.00,"2,249 kilometers away",May 1,29,May
277,"La Place Belgath, Mauritius",May 1 – 29,55577.00,"6,035 kilometers away",May 1,29,May
278,"Castelfiorentino, Italy",May 4 – Jun 1,46594.00,"7,390 kilometers away",May 4,Jun 1,May


In [256]:
df_3_no_dupes['End Month'] = df_3_no_dupes['End Date'].apply(lambda x: str(x).split(" ")[0] if str(x).split(" ")[0] in ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'] else '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_3_no_dupes['End Month'] = df_3_no_dupes['End Date'].apply(lambda x: str(x).split(" ")[0] if str(x).split(" ")[0] in ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'] else '')


In [257]:
df_3_no_dupes['End Month'] = df_3_no_dupes['End Month'].map(month_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_3_no_dupes['End Month'] = df_3_no_dupes['End Month'].map(month_dict)


In [258]:
df_3_no_dupes['End Month'] = df_3_no_dupes['End Month'].fillna(df_3_no_dupes['Start Month'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_3_no_dupes['End Month'] = df_3_no_dupes['End Month'].fillna(df_3_no_dupes['Start Month'])


In [259]:
df_3_no_dupes['Start Date'] = df_3_no_dupes['Start Date'].str.split(' ', expand = True)[1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_3_no_dupes['Start Date'] = df_3_no_dupes['Start Date'].str.split(' ', expand = True)[1]


In [260]:
df_3_no_dupes['Price(In dollar)'] = df_3_no_dupes['Price(In dollar)'].str.replace(',','')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_3_no_dupes['Price(In dollar)'] = df_3_no_dupes['Price(In dollar)'].str.replace(',','')


In [261]:
df_3_no_dupes['Price(In dollar)'] = df_3_no_dupes['Price(In dollar)'].astype(float)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_3_no_dupes['Price(In dollar)'] = df_3_no_dupes['Price(In dollar)'].astype(float)


In [262]:
df_3_no_dupes.head()

Unnamed: 0,Luxe name,Date,Price(In dollar),Distance,Start Date,End Date,Start Month,End Month
0,"Koh Samui, Thailand",May 1 – 29,89600.0,"1,880 kilometers away",1,29,May,May
1,"Koh Samui, Thailand",May 1 – 29,78459.0,"1,880 kilometers away",1,29,May,May
2,"Koh Samui, Thailand",May 1 – 29,53200.0,"1,881 kilometers away",1,29,May,May
3,"Koh Samui, Thailand",May 1 – 29,35000.0,"1,880 kilometers away",1,29,May,May
4,"Nathon, Thailand",May 1 – 29,19656.0,"1,872 kilometers away",1,29,May,May


In [263]:
df_3_no_dupes['Start Date'] = df_3_no_dupes['Start Date'].astype(int)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_3_no_dupes['Start Date'] = df_3_no_dupes['Start Date'].astype(int)


In [265]:
df_3_no_dupes.groupby('End Date').count()

Unnamed: 0_level_0,Luxe name,Date,Price(In dollar),Distance,Start Date,Start Month,End Month
End Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
29,212,212,212,212,212,212,212
30,1,1,1,1,1,1,1
Jun 1,35,35,35,35,35,35,35
Jun 13,1,1,1,1,1,1,1
Jun 17,1,1,1,1,1,1,1
Jun 2,8,8,8,8,8,8,8
Jun 21,1,1,1,1,1,1,1
Jun 3,2,2,2,2,2,2,2
Jun 8,1,1,1,1,1,1,1
Jun 9,1,1,1,1,1,1,1


In [266]:
df_3_no_dupes['End Date'] = df_3_no_dupes['End Date'].apply(lambda x: str(x).split(" ")[1] if str(x).split(" ")[0] in ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'] else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_3_no_dupes['End Date'] = df_3_no_dupes['End Date'].apply(lambda x: str(x).split(" ")[1] if str(x).split(" ")[0] in ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec'] else x)


In [267]:
df_3_no_dupes.head()

Unnamed: 0,Luxe name,Date,Price(In dollar),Distance,Start Date,End Date,Start Month,End Month
0,"Koh Samui, Thailand",May 1 – 29,89600.0,"1,880 kilometers away",1,29,May,May
1,"Koh Samui, Thailand",May 1 – 29,78459.0,"1,880 kilometers away",1,29,May,May
2,"Koh Samui, Thailand",May 1 – 29,53200.0,"1,881 kilometers away",1,29,May,May
3,"Koh Samui, Thailand",May 1 – 29,35000.0,"1,880 kilometers away",1,29,May,May
4,"Nathon, Thailand",May 1 – 29,19656.0,"1,872 kilometers away",1,29,May,May


In [270]:
df_3_no_dupes['End Date'] = df_3_no_dupes['End Date'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_3_no_dupes['End Date'] = df_3_no_dupes['End Date'].astype(int)


In [271]:
df_3_no_dupes.dtypes

Luxe name            object
Date                 object
Price(In dollar)    float64
Distance             object
Start Date            int64
End Date              int64
Start Month          object
End Month            object
dtype: object

In [273]:
df_3_renamed = df_3_no_dupes.rename(columns = {'Date': 'Date Range','Price(In dollar)': 'Price (In Dollars)'})

In [275]:
df_3_renamed.columns

Index(['Luxe name', 'Date Range', 'Price (In Dollars)', 'Distance',
       'Start Date', 'End Date', 'Start Month', 'End Month'],
      dtype='object')

In [281]:
columns = ['Luxe name', 'Date Range', 'Start Month','Start Date','End Month','End Date','Price (In Dollars)', 'Distance',]

In [282]:
df_3_clean = df_3_renamed[columns]

In [283]:
df_3_clean.head()

Unnamed: 0,Luxe name,Date Range,Start Month,Start Date,End Month,End Date,Price (In Dollars),Distance
0,"Koh Samui, Thailand",May 1 – 29,May,1,May,29,89600.0,"1,880 kilometers away"
1,"Koh Samui, Thailand",May 1 – 29,May,1,May,29,78459.0,"1,880 kilometers away"
2,"Koh Samui, Thailand",May 1 – 29,May,1,May,29,53200.0,"1,881 kilometers away"
3,"Koh Samui, Thailand",May 1 – 29,May,1,May,29,35000.0,"1,880 kilometers away"
4,"Nathon, Thailand",May 1 – 29,May,1,May,29,19656.0,"1,872 kilometers away"


## Export To New CSV Files

In [None]:
# df_1 = pd.read_csv('../data/airnb.csv')
# df_2 = pd.read_csv('../data/airnb_desert.csv', encoding="cp1252")
# df_3 = pd.read_csv('../data/airnb_luxe.csv', encoding="cp1252")

In [284]:
df_3_clean.to_csv('../data/airnb_luxe_clean.csv')

In [285]:
df_2.to_csv('../data/airnb_desert_clean.csv')

In [286]:
df_clean.to_csv('../data/airnb_clean.csv')