# This notebook will serve for the analysis of the Pictures and Statistics for the TFW project

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the statistics and pictures datasets from CSV files in ../data
# (paths are relative to the notebook's working directory)
df_statistics = pd.read_csv('../data/statistics_20210707.csv')
df_pictures = pd.read_csv('../data/pictures_dataset.csv')

In [None]:
# First look at the statistics dataset (first two rows)
df_statistics.head(2)

In [None]:
# First look at the pictures dataset (first two rows)
df_pictures.head(2)

In [None]:
# Shape (rows, columns) of both datasets: pictures first, then statistics
print(df_pictures.shape)
print(df_statistics.shape)

In [None]:
# Column dtypes and non-null counts of the statistics dataset
df_statistics.info()

In [None]:
# Column dtypes and non-null counts of the pictures dataset
df_pictures.info()

## Remove unnamed column in df pictures

In [None]:
# Drop the 'Unnamed: 0' column
# (presumably a row index written out when the CSV was saved - TODO confirm).
# NOTE: the frame is reassigned under the same name, so re-running this cell
# raises a KeyError because the column is already gone.
df_pictures = df_pictures.drop('Unnamed: 0', axis=1)

## Convert date features into datetime

In [None]:
# Parse the date columns and reduce them to the granularity used below:
# - 'yearmonth' (both frames) becomes a 'YYYY-MM' string
# - 'added' / 'activated' become plain date objects (time of day dropped);
#   note that `.dt.date` leaves an object-dtype column, not datetime64
df_statistics['yearmonth'] = pd.to_datetime(df_statistics['yearmonth']).dt.strftime('%Y-%m')
df_pictures['added'] = pd.to_datetime(df_pictures['added']).dt.date
df_pictures['activated'] = pd.to_datetime(df_pictures['activated']).dt.date
# The pictures' 'yearmonth' is derived from the activation date
df_pictures['yearmonth'] = pd.to_datetime(df_pictures['activated']).dt.strftime('%Y-%m')

Compare the listing IDs to see whether all properties are included or how many are missing.

In [None]:
# Number of unique listing IDs in the statistics and pictures datasets
print(df_statistics.listing_id.nunique())
print(df_pictures.listing_id.nunique())

# Houses without pictures: count the statistics listing IDs that never occur
# in the pictures dataset. Subtracting the two unique counts is only correct
# when every pictures ID also appears in the statistics, so the IDs are
# compared directly instead.
listings_without_pictures = df_statistics.loc[
    ~df_statistics['listing_id'].isin(df_pictures['listing_id']), 'listing_id'
].nunique()
print(listings_without_pictures, 'properties have no pictures.')



47 properties have no pictures.

### Create dataframe for listing ID with missing pictures

In [None]:
# New boolean column: True when the row's listing ID also occurs in the
# pictures dataset, False when the listing has no picture rows
df_statistics['in_df_picture'] = df_statistics['listing_id'].isin(df_pictures['listing_id'])

In [None]:
# Keep the statistics rows whose listing ID is absent from the pictures data,
# then count the non-null entries of every column per listing ID
missing_pictures = (
    df_statistics.loc[~df_statistics['in_df_picture']]
    .groupby('listing_id')
    .count()
    .reset_index()
)

### Check whether the listings with missing pictures are online or offline. For this we import the listings dataset.

In [None]:
# Load the listings dataset from ../data (it provides the 'state' column used below)
df_listings = pd.read_csv('../data/master_listings_20210715.csv')

In [None]:
# Attach the listing attributes to every listing ID without pictures.
# A left join keeps IDs that are unknown to the listings dataset (their
# listing columns are then NaN).
df_compare = missing_pictures.merge(df_listings, how='left', on='listing_id')

In [None]:
# State of the listing IDs with missing pictures: the distinct values first,
# then how often each one occurs (value_counts leaves out NaN by default)
print(df_compare.state.unique())
print(df_compare.state.value_counts())

Except for one listing ID, all properties with missing pictures are offline, deactivated, or not known in the listings.

In [None]:
# Show the listing(s) without pictures whose state is 'Online'
df_compare[df_compare['state'] == 'Online']

In [None]:
# Total number of inquiries for the one 'Online' listing without pictures.
# NOTE(review): the listing ID is hard-coded, presumably copied from the output
# of the previous cell - it has to be updated by hand if the data changes.
df_statistics[df_statistics['listing_id']== 'b25773bf-192e-5058-a6fc-91844396e7e1'].inquiry_count.sum()

The one property with state 'Online' and no pictures has no inquiries either. The other 46 properties with missing pictures are not 'Online'.

### Groupby the dataset statistics per month.

In [None]:
# Sum the statistics per 'yearmonth'.
# numeric_only=True restricts the aggregation to numeric/boolean columns, so
# text columns such as 'listing_id' are left out instead of being summed
# (whether a bare .sum() drops, concatenates or rejects them depends on the
# installed pandas version).
statistic_per_month = df_statistics.groupby('yearmonth').sum(numeric_only=True).reset_index()

## Distributions of the features in Pictures

First, let's have a look at the picture resolution.

In [None]:
# Histogram of the 'picture_resolution' column
df_pictures.picture_resolution.hist()

Most of the pictures have an ultra-high-definition resolution.

In [None]:
# Calculate the percentage of pictures per resolution value.
# The mean of a boolean mask is the fraction of matching rows. This replaces
# `.count()[0]`, which counted only the non-null values of whichever column
# happens to come first and relied on positional indexing of a label-indexed
# Series.
print('Percent of picture resolution values')
print('Poor resolution:', round((df_pictures['picture_resolution'] == 'Poor Resolution').mean()*100, 1))
print('High definition:', round((df_pictures['picture_resolution'] == 'High-definition').mean()*100, 1))
print('Ultra high definition:', round((df_pictures['picture_resolution'] == 'Ultra-high-definition').mean()*100, 1))

85.7 % of the pictures have the resolution 'ultra high definition', 13.0 % 'high definition', and only 1.3 % 'poor resolution'. So nearly 99 % (98.7 %) have a resolution of 'high definition' or better.

### Let's have a look at the motifs (picture subjects) and the motifs by resolution.

In [None]:
# Distribution of the 'description' values on a wide figure
fig, ax = plt.subplots(figsize=(15, 5))
sns.histplot(x="description", data=df_pictures, ax=ax);


In [None]:
# Distribution of the 'description' values, with one bar per picture
# resolution placed side by side within each description
fig, ax = plt.subplots(figsize=(15, 5))
sns.histplot(
    x="description",
    hue='picture_resolution',
    data=df_pictures,
    multiple="dodge",
    shrink=.8,
    ax=ax,
);

Create a new column with the difference between the added and activated date for the picture to plot this.

In [None]:
# Create new column: time between adding and activating a picture.
# 'added' and 'activated' were reduced to plain date objects earlier, so they
# are converted back to datetime64 before subtracting. The result is then a
# proper timedelta64 column (usable with .describe() and the .dt accessor)
# regardless of how the installed pandas handles object-dtype arithmetic.
df_pictures['diff_add_activate'] = (
    pd.to_datetime(df_pictures['activated']) - pd.to_datetime(df_pictures['added'])
)

In [None]:
# Summary statistics of the new column diff_add_activate
df_pictures.diff_add_activate.describe()

The range of the new column diff_add_activate is wide. The minimum is a negative value. 

In [None]:
# Plot the distribution of the added -> activated difference in whole days.
# The day count is recomputed from the two date columns instead of converting
# 'diff_add_activate' in place: the in-place `.dt.days` conversion fails when
# the cell is run a second time (the column is then already numeric) and
# requires the column to have a timedelta dtype.
df_pictures['diff_add_activate'] = (
    pd.to_datetime(df_pictures['activated']) - pd.to_datetime(df_pictures['added'])
).dt.days
df_pictures.boxplot(column=['diff_add_activate']);

Most of the pictures have no difference between the dates added and activated.

Which motifs have a large difference between the added and activated dates?

In [None]:
# 95th percentile of diff_add_activate, used as the threshold for "large" differences
df_pictures.diff_add_activate.quantile(0.95)

In [None]:
# Count, per description, the pictures whose added -> activated difference
# lies above the 95th percentile. The threshold is computed here rather than
# hard-coded, so it follows the data if the dataset changes.
threshold_days = df_pictures['diff_add_activate'].quantile(0.95)
df_pictures.query('diff_add_activate > @threshold_days').groupby('description')['diff_add_activate'].count()

How many pictures does each unique property have?

In [None]:
# Count the non-null 'description' entries per listing ID; the resulting
# 'description' column therefore holds a count, not a description
picture_per_id = (
    df_pictures
    .groupby('listing_id')['description']
    .count()
    .reset_index()
)

In [None]:
# Summary statistics of the per-listing counts
picture_per_id.describe()

In [None]:
# Histogram (80 bins) of the per-listing counts
picture_per_id.hist(bins=80)

In [None]:
# Boxplot of the per-listing counts (stored in the 'description' column)
picture_per_id.boxplot(column=['description'])

In [None]:
# Column names of the per-listing count frame
picture_per_id.columns

In [None]:
# Distribution of the picture activation dates on a wide figure
fig, ax = plt.subplots(figsize=(15, 5))
sns.histplot(x="activated", data=df_pictures, ax=ax);

## Merging both datasets

First, we encode the feature picture resolution.

In [None]:
# One-hot encode 'picture_resolution' and replace the original column with
# the resulting indicator columns (joined back on the row index)
one_hot = pd.get_dummies(df_pictures['picture_resolution'])
df_pictures = df_pictures.drop(columns='picture_resolution').join(one_hot)

Second, we encode the feature description.

In [None]:
# One-hot encode 'description' and replace the original column with the
# resulting indicator columns (joined back on the row index)
one_hot_2 = pd.get_dummies(df_pictures['description'])
df_pictures = df_pictures.drop(columns='description').join(one_hot_2)

In [None]:
# New shape of the pictures dataset after both one-hot encodings
df_pictures.shape

Third, we group both datasets by listing ID and yearmonth for the merge.

In [None]:
# Aggregate both frames to one row per (listing_id, yearmonth).
# numeric_only=True sums only numeric/boolean columns (including the one-hot
# indicators and the boolean 'in_df_picture' flag). The date-object columns
# 'added' and 'activated' cannot be summed meaningfully and are left out
# explicitly rather than depending on pandas' version-specific handling.
stats = df_statistics.groupby(['listing_id', 'yearmonth']).sum(numeric_only=True).reset_index()
pics = df_pictures.groupby(['listing_id', 'yearmonth']).sum(numeric_only=True).reset_index()

In [None]:
# Combine the grouped statistics and pictures on (listing_id, yearmonth).
# The outer join keeps listing-months that occur in only one of the two
# frames; the columns of the other frame are NaN for those rows.
pic_stat = stats.merge(pics, how='outer', on=['listing_id', 'yearmonth'])

In [None]:
# Shape of the merged frame
pic_stat.shape

### Calculate inquiry count, result views, expose views and phone clicks before and after activating pictures

First, create a new column that indicates whether a picture was activated or not.

In [None]:
# Function to build categories
def categorizer(a, b, c):
    """Flag whether at least one of three numeric values is present.

    Parameters
    ----------
    a, b, c : float
        Numeric values that may be NaN.

    Returns
    -------
    int
        0 if all three values are NaN, otherwise 1.
    """
    # The previous nested ifs fell through and returned None whenever `a` was
    # NaN but `b` or `c` was not; every combination now yields 0 or 1.
    all_missing = np.isnan(a) and np.isnan(b) and np.isnan(c)
    return 0 if all_missing else 1

In [None]:
# Create new column 'picture': 1 when at least one of the resolution count
# columns is present (non-NaN) for the listing-month, 0 when all are NaN.
# The check is vectorised instead of a row-wise apply, and it always yields
# 0 or 1, also for rows where only some of the three columns are NaN.
resolution_columns = ['High-definition', 'Poor Resolution', 'Ultra-high-definition']
pic_stat['picture'] = pic_stat[resolution_columns].notna().any(axis=1).astype(int)

In [None]:
# Shape after adding the 'picture' column
pic_stat.shape

In [None]:
# Restrict the merged data to 2019-01 through 2020-12 (both inclusive).
# 'yearmonth' holds 'YYYY-MM' strings, which compare in chronological order.
start_date = "2019-01"
end_date = "2020-12"

after_start_date = pic_stat["yearmonth"].ge(start_date)
before_end_date = pic_stat["yearmonth"].le(end_date)
between_two_dates = after_start_date & before_end_date
pic_stat_2019_2020 = pic_stat[between_two_dates]

In [None]:
# Shape of the 2019-2020 subset
pic_stat_2019_2020.shape

In [None]:
# Select all 2019-2020 rows of a single listing (hard-coded ID),
# presumably as a worked example for the before/after calculation below
test = pic_stat_2019_2020[pic_stat_2019_2020['listing_id'] == '0003f153-450a-5837-8a46-bd5db5b536b1']

In [None]:
# Display the rows of the selected listing
test

In [None]:
# Positional indices of the rows in which a picture was activated.
# np.where with a single condition returns a tuple of index arrays,
# one array per dimension (a 1-tuple for this 1-D column).
idx = np.where(test["picture"] == 1)

In [None]:
# For every month in which the listing had a picture activated, total the
# result views up to and including that month ("before") and strictly after
# it ("after"). `idx` is the 1-tuple returned by np.where, so its single
# entry holds the row positions.
picture_months = test.iloc[idx[0]]['yearmonth']
print(picture_months)
before = [
    test.loc[test['yearmonth'] <= month, 'result_views'].sum()
    for month in picture_months
]
after = [
    test.loc[test['yearmonth'] > month, 'result_views'].sum()
    for month in picture_months
]
print(before, after)