# This notebook will serve for the analysis of the Pictures and Statistics for the TFW project

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read both raw inputs from the shared data folder:
# the statistics export and the picture metadata.
df_statistics = pd.read_csv(filepath_or_buffer='../data/statistics_20210707.csv')
df_pictures = pd.read_csv(filepath_or_buffer='../data/pictures_dataset.csv')

In [None]:
# Preview the first two rows of the statistics data
df_statistics.head(2)

In [None]:
# Preview the first two rows of the pictures data
df_pictures.head(2)

In [None]:
# (rows, columns) of the pictures data, then of the statistics data, one per line
print(df_pictures.shape, df_statistics.shape, sep='\n')

In [None]:
# Column dtypes and non-null counts of the pictures data
df_pictures.info()

In [None]:
# Column dtypes and non-null counts of the statistics data
df_statistics.info()

## Remove unnamed column in df pictures

In [None]:
# Drop the leftover 'Unnamed: 0' column.
# errors='ignore' keeps this cell idempotent: re-running it after the column
# is already gone is a no-op instead of raising a KeyError.
df_pictures = df_pictures.drop(columns='Unnamed: 0', errors='ignore')

## Convert date features into datetime

In [None]:
# Convert the date-like columns to pandas datetimes.
df_statistics['yearmonth'] = pd.to_datetime(df_statistics['yearmonth'])
# Only the calendar day matters for 'added' and 'activated', so strip the
# time-of-day with .dt.normalize(). Unlike .dt.date this keeps the columns as
# datetime64 (not object-dtype Python dates), so the .dt accessor, vectorised
# date arithmetic and missing-value (NaT) handling keep working downstream.
# NOTE(review): assumes the CSV timestamps are timezone-naive - confirm.
df_pictures['added'] = pd.to_datetime(df_pictures['added']).dt.normalize()
df_pictures['activated'] = pd.to_datetime(df_pictures['activated']).dt.normalize()

In [None]:
# Distinct listing IDs in the statistics data, then in the pictures data (one per line)
print(
    df_statistics['listing_id'].nunique(),
    df_pictures['listing_id'].nunique(),
    sep='\n',
)

## Distributions of the features in Statistics

First, we group and sum the dataframe by yearmonth to get the complete result per month for all properties.

In [None]:
# Sum all numeric statistics per yearmonth (one row per month, all properties).
# numeric_only=True restricts the sum to numeric columns: pandas >= 2.0 no
# longer drops non-numeric columns silently and would otherwise raise (or
# concatenate strings) if the frame contains any.
statistic_per_month = (
    df_statistics
    .groupby('yearmonth')
    .sum(numeric_only=True)
    .reset_index()
)

Now, we plot line plots for the views of the properties and the clicks on the telephone number.

In [None]:
# One line per metric (result views, expose views, phone clicks) over the months,
# all drawn on the same explicitly referenced axes.
fig, ax = plt.subplots(figsize=(15, 5))
sns.lineplot(x="yearmonth", y="result_views", data=statistic_per_month, ax=ax)
sns.lineplot(x="yearmonth", y="expose_views", data=statistic_per_month, ax=ax)
sns.lineplot(x="yearmonth", y="phone_clicks", data=statistic_per_month, ax=ax)
ax.set_xlabel('Year - Month')
ax.set_ylabel('Views count')
# Plain (non-scientific) tick labels on the y axis
ax.ticklabel_format(style='plain', axis='y')
# Legend entries follow the order in which the lines were drawn
ax.legend(labels=["Result views","Expose views", "Phone clicks"])
ax.set_title('Count of result views, expose views and phone clicks per month for all properties', fontsize=14);

Second, we group and sum the dataframe by listing ID to get the complete result per property across all months.

In [None]:
# Sum all numeric statistics per listing (one row per property, all months).
# numeric_only=True is required here: 'yearmonth' was converted to datetime64
# above, and pandas >= 2.0 raises a TypeError when asked to sum datetimes
# instead of silently dropping the column as older versions did.
statistic_per_id = (
    df_statistics
    .groupby('listing_id')
    .sum(numeric_only=True)
    .reset_index()
)

Now, we plot boxplots to get an overview of the distribution of the inquiry count, the views of the properties and the clicks on the telephone number.

In [None]:
# Boxplot of the inquiry count summed per listing
statistic_per_id.boxplot(column=['inquiry_count'])

In [None]:
# Boxplots of result views, expose views and phone clicks summed per listing
statistic_per_id.boxplot(column=['result_views', 'expose_views', 'phone_clicks'])

## Distributions of the features in Pictures

First, let's have a look at the picture resolution.

In [None]:
# Histogram of the picture_resolution values across all pictures
df_pictures.picture_resolution.hist()

Most of the pictures have an ultra-high-definition resolution.

Let's have a look at the motifs and the motifs by resolution.

In [None]:
# Count of pictures per description value, drawn on an explicitly referenced axes
fig, ax = plt.subplots(figsize=(15, 5))
sns.histplot(x="description", data=df_pictures, ax=ax);


In [None]:
# Count of pictures per description value, split by picture resolution;
# the bars of the different resolutions are placed side by side ("dodge").
fig, ax = plt.subplots(figsize=(15, 5))
sns.histplot(
    x="description",
    hue='picture_resolution',
    data=df_pictures,
    multiple="dodge",
    shrink=.8,
    ax=ax,
);

Create a new column holding the difference between each picture's added and activated dates so that it can be plotted.

In [None]:
# New column: time between a picture being added and being activated.
# Both operands are passed through pd.to_datetime so the subtraction is a
# vectorised datetime64 operation that yields a proper timedelta column,
# rather than depending on element-wise arithmetic over object-dtype dates.
df_pictures['diff_add_activate'] = (
    pd.to_datetime(df_pictures['activated']) - pd.to_datetime(df_pictures['added'])
)

In [None]:
# Summary statistics of the new column diff_add_activate (activated minus added)
df_pictures.diff_add_activate.describe()

The range of the new column diff_add_activate is wide. The minimum is a negative value. 

In [None]:
# Express the added-to-activated difference in whole days and plot its distribution.
# The value is recomputed from the two date columns instead of converting
# 'diff_add_activate' in place, so the cell is idempotent: re-running it no
# longer fails on `.dt` after the column has already been turned into integers.
df_pictures['diff_add_activate'] = (
    pd.to_datetime(df_pictures['activated']) - pd.to_datetime(df_pictures['added'])
).dt.days
df_pictures.boxplot(column=['diff_add_activate']);

Which motifs have a large difference between the added and activated dates?

In [None]:
# Value below which 95 % of the day differences fall; used as the threshold for "large" differences
df_pictures['diff_add_activate'].quantile(q=0.95)

In [None]:
# Number of pictures per description whose day difference lies above the 95 % percentile.
# The threshold is computed from the data rather than hard-coded, so the filter
# stays correct when the underlying dataset changes.
slow_activation_threshold = df_pictures['diff_add_activate'].quantile(0.95)
(
    df_pictures
    .loc[df_pictures['diff_add_activate'] > slow_activation_threshold]
    .groupby('description')['diff_add_activate']
    .count()
)

How many pictures does each unique property have?

In [None]:
# Count the pictures per listing: the number of non-null 'description' entries
# for each listing_id. The count is stored in the 'description' column.
picture_per_id = (
    df_pictures
    .groupby('listing_id')['description']
    .count()
    .reset_index()
)

In [None]:
# Summary statistics of picture_per_id (listing_id and the per-listing description count)
picture_per_id.describe()

In [None]:
# Boxplot of the per-listing count stored in the 'description' column of picture_per_id
picture_per_id.boxplot(column=['description'])

## First try of merging

First, we one-hot encode the picture resolution column.

In [None]:
# Indicator (dummy) columns, one per distinct picture_resolution value
one_hot = pd.get_dummies(df_pictures['picture_resolution'])
# Replace the original column with its indicator columns; join aligns on the row index
df_pictures = df_pictures.drop(columns='picture_resolution').join(one_hot)

In [None]:
# Indicator (dummy) columns, one per distinct description value
one_hot_2 = pd.get_dummies(df_pictures['description'])
# Replace the original column with its indicator columns; join aligns on the row index
df_pictures = df_pictures.drop(columns='description').join(one_hot_2)

In [None]:
# Preview the first two rows of the statistics data
df_statistics.head(2)

In [None]:
# Preview the first two rows of the pictures data after the one-hot encoding
df_pictures.head(2)

In [None]:
# (rows, columns) of the pictures data after the one-hot encoding
df_pictures.shape

In [None]:
# Aggregate the pictures data to one row per (listing_id, activated) pair by
# summing the numeric columns (including the one-hot indicator columns).
# numeric_only=True is needed because the frame still holds date columns
# (e.g. 'added'); pandas >= 2.0 raises a TypeError when asked to sum those
# instead of silently dropping them as older versions did.
test = (
    df_pictures
    .groupby(['listing_id', 'activated'])
    .sum(numeric_only=True)
    .reset_index()
)

In [None]:
# Show the pictures data aggregated per listing_id and activated date
test

In [None]:
# Show the statistics data
df_statistics