# Assignment 1


In [None]:
# Imports
from pathlib import Path
import pandas as pd
import tarfile
import urllib.request
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import numpy as np
import seaborn as sns
import scipy.stats as stats
from sklearn.neighbors import KNeighborsClassifier
import matplotlib
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split

# Quick look at the data

In [None]:
# Load the raw listings; the path is resolved relative to the working directory.
listings_path = Path('listings.csv')
df = pd.read_csv(listings_path)

In [None]:
# Print column names, non-null counts and dtypes of the frame loaded from the CSV.
df.info()

# 1
Removal of columns

In [None]:
# Columns that the rest of the notebook works with; every other column is dropped.
cols_to_have = {
    'id',
    'name',
    'host_id',
    'host_name',
    'neighbourhood_cleansed',
    'latitude',
    'longitude',
    'room_type',
    'price',
    'minimum_nights',
    'number_of_reviews',
    'last_review',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365'
}

# Work out what to remove from the frame itself, keeping the original column order.
col_vals = df.columns.values
col_to_remove = [col for col in col_vals if col not in cols_to_have]

# drop() returns a new frame, so the result has to be assigned back.
df = df.drop(columns=col_to_remove)

# Show the remaining columns and their summary statistics.
display(df.columns.values)
df.describe()

# 2

Next we have to handle missing values. Remove all rows where `number_of_reviews = 0`. If there are still missing values, remove the rows that contain them so you have a data set with no missing values.

In [None]:
# Keep only listings with at least one review, then discard every row that
# still has a missing value in any column.
has_reviews = df['number_of_reviews'] != 0
df = df.loc[has_reviews].dropna()

# Summary statistics, the first 20 rows and the distinct neighbourhood names.
display(
    df.describe(),
    df.head(20),
    df['neighbourhood_cleansed'].unique(),
)

# 3

Fix the `neighbourhood_cleansed` values (some are missing 'æ ø å'), and if necessary change the price to DKK.

In [None]:
exchange_rate = 6.96  # USD -> DKK exchange rate at the time of writing


def usd_to_dkk(price_usd):
    """Convert a price string such as ``'$1,234.00'`` to a float amount in DKK.

    The dollar sign and the thousands separators are removed before the number
    is multiplied by the module-level ``exchange_rate``.
    """
    price_usd = float(price_usd.strip('$').replace(',', ''))
    return price_usd * exchange_rate


# A single pass over the column is enough. The previous row-by-row loop wrote
# floats into a column initialised with the integer 0 and was then overwritten
# by this very assignment.
df['price_DKK'] = df['price'].apply(usd_to_dkk)
df.head()

In [None]:
# Spelling found in the data -> spelling with the Danish letters restored.
neighbourhood_fixes = {
    'Nrrebro': 'Nørrebro',
    'sterbro': 'Østerbro',
    'Amager st': 'Amager Øst',
    'Brnshj-Husum': 'Brønshøj-Husum',
    'Vanlse': 'Vanløse',
}

# Only exact matches of the keys are replaced; all other names are left untouched.
df['neighbourhood_cleansed'] = df['neighbourhood_cleansed'].replace(neighbourhood_fixes)

df.head()

# 4

Create a fitting word cloud based on the `name` column. Feel free to remove non-descriptive stop words (e.g. since this is about Copenhagen, perhaps the word 'Copenhagen' is redundant).

In [None]:
# Build the text from every listing name. str(df['name'].values) is the repr of
# a NumPy array, which NumPy abbreviates with "..." for long arrays, so only a
# handful of names would reach the word cloud.
listing_names_text = ' '.join(df['name'].astype(str))

# NOTE(review): passing `stopwords` replaces wordcloud's built-in stop-word list,
# so common words such as "in" or "the" are kept - confirm this is intended.
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
    stopwords={'Copenhagen'},
).generate(listing_names_text)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

# 5

Since data science is so much fun, provide a word cloud of the names of the hosts, removing any names of non-persons. Does this more or less correspond with the distribution of names according to [Danmarks Statistik](https://www.dst.dk/da/Statistik/emner/borgere/navne/navne-i-hele-befolkningen)?

In [None]:
# Join every host name into one text. str(df['host_name'].values) is the repr
# of a NumPy array, which is abbreviated with "..." for long arrays and would
# hide almost all hosts from the word cloud.
host_names_text = ' '.join(df['host_name'].astype(str))

# collocations=False: neighbouring tokens belong to different hosts, so bigrams
# of adjacent names would be artefacts of the row order.
wc = WordCloud(
    width=800,
    height=400,
    background_color='white',
    collocations=False,
).generate(host_names_text)

plt.imshow(wc)
plt.axis('off')
plt.show()

# 6

Create a new column using bins of price. Use 11 bins, evenly distributed but with the last bin $> 10,000$.

In [None]:
# 11 evenly spaced bin edges from 0 up to 10000 * exchange_rate
# (11 edges delimit 10 intervals; anything above the last edge is overflow).
bins = np.linspace(start=0, stop=10000 * exchange_rate, num=11)

# 7

Using non-scaled versions of latitude and longitude, plot the listings data on a map. Use the newly created price bins as a color parameter. Also, create a plot (i.e. another plot) where you group the listings with regard to the neighbourhood.

In [None]:
# Plain scatter of every listing, with longitude on x and latitude on y.
df.plot.scatter(x="longitude", y="latitude", grid=True)
plt.show()

In [None]:
# np.digitize numbers the intervals between the edges 1..len(bins)-1 and puts
# every price at or above the last edge into bin len(bins), so 11 edges give
# 11 bins, the last one being the open-ended "above the last edge" bin.
which_bin = np.digitize(df['price_DKK'], bins=bins)

# Label the same bins as intervals. Adding an infinite upper edge and using
# left-closed intervals (right=False) makes pd.cut agree with np.digitize, so
# no listing is left with a missing interval.
interval_edges = np.append(bins, np.inf)
df = df.assign(
    price_bin=which_bin,
    price_bin_interval=pd.cut(df['price_DKK'], interval_edges, right=False),
)

display(df.head())

# One discrete colour per bin. pyplot.get_cmap replaces plt.cm.get_cmap, which
# has been removed from recent Matplotlib releases.
n_bins = len(interval_edges) - 1
colormap = plt.get_cmap('cool', n_bins)

plt.figure(figsize=(10, 6))
plt.scatter(
    df['longitude'],
    df['latitude'],
    c=df['price_bin'],
    cmap=colormap,
    vmin=0.5,            # centre each integer bin number on its own colour
    vmax=n_bins + 0.5,
)
plt.colorbar(label='Price Range', ticks=range(1, n_bins + 1))
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Listings coloured by price bin')
plt.show()

In [None]:
# Number of listings per neighbourhood, most common first.
neighborhood_counts = df['neighbourhood_cleansed'].value_counts()

counts_ax = neighborhood_counts.plot.bar(figsize=(10, 6))
counts_ax.set_xlabel('Neighborhood')
counts_ax.set_ylabel('Number of listings')
counts_ax.set_title('Number of listings based on neighborhood')
counts_ax.tick_params(axis='x', labelrotation=90)
plt.show()

# 8

Create boxplots where you have the neighbourhood on the x-axis and price on the y-axis. What does this tell you about the listings in Copenhagen? Keep the x-axis as is and move different variables into the y-axis to see how things are distributed between the neighborhoods to create different plots (your choice).

In [None]:
# One box plot of price_DKK per neighbourhood, laid out side by side.
dfn = df.groupby('neighbourhood_cleansed')

# Size the layout from the actual number of neighbourhoods instead of a
# hard-coded 11, so the cell keeps working if the set of groups changes.
ax = dfn.boxplot(
    column='price_DKK',
    layout=(1, dfn.ngroups),
    figsize=(18, 7),
    whis=3,              # whiskers reach 3 * IQR
    showfliers=False,    # outlier points are hidden
)
plt.tight_layout()
plt.show()


# 9

Create a bar chart of the ten hosts with the most listings. Place host id on the x-axis and the count of listings on the y-axis.

In [None]:
# Number of listings per host, largest first, limited to the top ten.
dfh = df.groupby(['host_id']).size().sort_values(ascending=False).head(10)

# Show the top-ten table rather than the whole listings frame.
display(dfh)

# Host ids are labels, not quantities: plotting them as strings keeps the bars
# in ranking order instead of being re-sorted by numeric id.
sns.barplot(x=dfh.index.astype(str), y=dfh.values)
plt.xlabel('Host id')
plt.ylabel('Number of listings')
plt.title('Top 10 hosts by number of listings')
plt.xticks(rotation=70)
plt.tight_layout()

# 10

Do a descriptive analysis of the neighborhoods. Include information about room type in the analysis as well as one other self-chosen feature. The descriptive analysis should contain mean/average, mode, median, standard deviation/variance, minimum, maximum and quartiles.
<br>


In [None]:
filtered_df = df[['id','room_type', 'neighbourhood_cleansed', 'review_scores_rating']]
grouped_data = filtered_df.groupby(['neighbourhood_cleansed', 'room_type']).describe()

# describe() already gives count, mean, std, min, the quartiles (50% is the
# median) and max. Variance and mode are requested as well, so add them.
ratings_by_group = filtered_df.groupby(['neighbourhood_cleansed', 'room_type'])['review_scores_rating']
rating_stats = grouped_data['review_scores_rating'].assign(
    variance=ratings_by_group.var(),
    # mode() can return several values on a tie; the smallest one is reported.
    mode=ratings_by_group.agg(lambda ratings: ratings.mode().iloc[0]),
)
display(rating_stats)

mean_ratings = rating_stats['mean'].reset_index()

# One label per (neighbourhood, room type) combination.
x_labels = mean_ratings['neighbourhood_cleansed'] + ' - ' + mean_ratings['room_type']

plt.figure(figsize=(14, 6))
plt.bar(x_labels, mean_ratings['mean'])
plt.xlabel('Neighborhood - Room Type')
plt.ylabel('Mean review score ratings')
plt.title('Mean Review Scores Rating by Neighbourhood and Room Type')
plt.xticks(rotation=90)

plt.show()

# 11

Supply a list of the top 10 highest rated listings and visualize them on a map.

In [None]:
import folium

# Rank by rating. Many listings share the same rating, so ties are broken by
# the number of reviews to make the top ten deterministic.
df_sorted = df.sort_values(by=['review_scores_rating', 'number_of_reviews'], ascending=False)
top_10_listings = df_sorted.head(10)

top_10_listings_filtered = top_10_listings[['id', 'name','review_scores_rating', 'latitude', 'longitude']]

display(top_10_listings_filtered)

# Centre the map on the first listing of the ranking.
best_listing = top_10_listings_filtered.iloc[0]
top_10_listings_map = folium.Map(
    location=[best_listing['latitude'], best_listing['longitude']],
    zoom_start=10,
)

# Add markers for the top 10 listings
for _, row in top_10_listings_filtered.iterrows():
    popup_text = f"{row['name']} - Rating: {row['review_scores_rating']}"
    folium.Marker(
        location=[row['latitude'], row['longitude']],
        # Listing names are free text written by hosts; parse_html=True escapes
        # them instead of injecting them into the popup as raw HTML.
        popup=folium.Popup(popup_text, parse_html=True),
    ).add_to(top_10_listings_map)

display(top_10_listings_map)

# 12
Now, use any preprocessing and feature engineering steps that you find relevant before proceeding (optional).

In [None]:
# Categorical columns that are one-hot encoded below.
features = df['room_type']
features2 = df['neighbourhood_cleansed']

# Text columns, identifiers, coordinates and the detailed review sub-scores are
# left out of the modelling frame; the two categorical columns are dropped here
# and re-added as dummy columns.
df1 = df.drop(columns=['host_name','neighbourhood_cleansed', 'name', 'price', 'room_type', 'last_review','id'
               ,'review_scores_accuracy','latitude', 'longitude', 'review_scores_cleanliness','review_scores_checkin','review_scores_communication',
               'review_scores_location','review_scores_value'])

display(df1.head())

# Append one indicator column per room type and per neighbourhood.
df1 = pd.concat([df1, pd.get_dummies(features), pd.get_dummies(features2)], axis=1)

df1.head()

In [None]:
# Summary statistics of df1 after the preprocessing step above.
display(df1.describe())

# 13

Create another new column, where the price is divided into two categories: "expensive" listings defined by all listings with a price higher than the median price, and "affordable" listings defined by all listings with a price equal to or below the median price. You can encode the affordable listings as "0" and the expensive ones as "1". All listings should now have a classification indicating either expensive listings (1) or affordable listings (0).


In [None]:
median = df1['price_DKK'].median()
std = df1['price_DKK'].std()

# Label first, from the untouched price: 1 = expensive (above the median),
# 0 = affordable (at or below the median). Labelling after the outlier step
# below would mark the most expensive listings as affordable, because their
# price has been overwritten with the median by then.
df1['price_classification'] = (df1['price_DKK'] > median).astype(int)

# Outlier smoothing: prices more than one standard deviation away from the
# median are replaced by the median. Assigning the column back avoids the
# chained `fillna(..., inplace=True)`, which may act on a temporary copy.
is_outlier = (df1['price_DKK'] - median).abs() > std
df1['price_DKK'] = df1['price_DKK'].mask(is_outlier, median)

# Keep only listings with more than 150 reviews, more than 5 available days
# per year and a positive rating.
keep_row = (
    (df1['number_of_reviews'] > 150)
    & (df1['availability_365'] > 5)
    & (df1['review_scores_rating'] > 0)
)
df1 = df1.loc[keep_row].copy()

df1.head()

# 14

Based on self-chosen features, develop a Naïve Bayes and k-Nearest Neighbor model to determine whether a rental property should be classified as 0 or 1. Remember to divide your data into training data and test data. Comment on your findings.

In [None]:
nb = GaussianNB()

y = df1['price_classification']

# The label is derived from price_DKK, so the price (and the price bins) must
# not be part of the features - otherwise the model simply reads the answer.
# NOTE(review): host_id is an identifier rather than a property of a listing;
# consider dropping it from the features as well.
df2 = df1.drop(['price_classification', 'price_bin_interval', 'price_bin', 'price_DKK'], axis=1)

# Fixed random_state keeps the train/test split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(df2, y, random_state=10)

nb.fit(X_train, Y_train)

# Mean accuracy on the held-out test set.
nb.score(X_test, Y_test)

In [None]:
# NOTE(review): the features are not scaled, so columns with large numeric
# ranges dominate the distance computation - consider standardising them.
knn = KNeighborsClassifier(n_neighbors=15)
knn.fit(X_train, Y_train)

# Score on the same DataFrame type the model was fitted with. Passing `.values`
# drops the column names and triggers scikit-learn's feature-name warning.
print(knn.score(X_test, Y_test))

In [None]:
# Number of rows in df1.
len(df1)

# 15
Try to come up with a final conclusion to the Airbnb-Copenhagen assignment.


We conclude that Naive Bayes is the better fit for the Airbnb-Copenhagen data set, reaching a high level of accuracy. Because the model is data-driven, we had to clean the data thoroughly, removing noise and outliers, before training in order to reach that accuracy.

K-Nearest Neighbor, on the other hand, did not score as high as Naive Bayes, which makes it the weaker solution for this data set. Regarding the complexity of the two algorithms, Naive Bayes is much less complex than K-Nearest Neighbor, which means that it can be trained and put to use more quickly.
