# EDA for Picture Dataset

Initial look at data and data cleaning


## Import necessary libraries


In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import csv
import shutil
from tempfile import NamedTemporaryFile

Loading Pictures CSV file

In [None]:
import os
# Working directory as reported by the shell's PWD variable (None when unset).
ROOT = os.getenv('PWD')

In [None]:
# Load the listing-pictures CSV into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
df_pictures = pd.read_csv('../data/listing_pictures_0210707.csv')


In [None]:
# Preview the first rows to confirm the file was parsed as expected.
df_pictures.head()

## Correcting date and time formatting.

In [None]:
# Parse the two timestamp columns into pandas datetimes.
for date_col in ('added', 'activated'):
    df_pictures[date_col] = pd.to_datetime(df_pictures[date_col])

In [None]:
# Show column dtypes and non-null counts.
df_pictures.info()

In [None]:
# Summary statistics, rounded to whole numbers for readability.
df_pictures.describe().round()

In [None]:
# Count missing values per column.
df_pictures.isna().sum()

In [None]:
# Count distinct values per column.
df_pictures.nunique()

In [None]:
# List the distinct values found in the 'format' column.
df_pictures.format.unique()

## Format of picture is unlikely to be a predictor of people choosing to rent.

The format column is removed because it is not useful for this EDA.

In [None]:
# Remove the 'format' column from the DataFrame in place.
df_pictures.drop(columns=['format'], inplace=True)

In [None]:
# Preview the DataFrame again to confirm the 'format' column is gone.
df_pictures.head()

In [None]:
# Compute pairwise column correlations and draw them as a seaborn heatmap.
#
# Only numeric and boolean columns are correlated. Since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns on its own and raises
# when they are present (this frame holds the datetime columns 'added' and
# 'activated'). Selecting the columns explicitly behaves the same on older
# and newer pandas versions.
corr = df_pictures.select_dtypes(include=['number', 'bool']).corr()
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,  # fix the colour scale to the full correlation range
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
# Slant the x tick labels so long column names do not overlap; the trailing
# semicolon suppresses the notebook's echo of the returned label list.
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);

In [None]:
# Render the correlation matrix as a table shaded with the 'coolwarm' colormap.
corr.style.background_gradient(cmap='coolwarm')

In [None]:
# List the distinct values found in the 'description' column.
df_pictures.description.unique()

In [None]:
# Count how often each 'description' value occurs.
df_pictures.description.value_counts()

## Creating column defining picture quality


In [None]:
# Plot picture width against height to look at the spread of image sizes.
sns.scatterplot(data=df_pictures, x='width', y='height')

In [None]:
# Show the rows whose width exceeds 17,000 (the extreme points in the scatterplot).
df_pictures[df_pictures['width'] > 17_000]

In [None]:
def catagorizer(a, b):
    """Return a resolution label based on the sum of two picture dimensions.

    Args:
        a: First dimension (the caller passes the width).
        b: Second dimension (the caller passes the height).

    Returns:
        "Poor Resolution" when ``a + b`` is below 720 + 480,
        "High-definition" when it is between 720 + 480 and 1280 + 720
        (both inclusive), and "Ultra-high-definition" otherwise.
    """
    total = a + b
    if total < 720 + 480:
        return "Poor Resolution"
    if total <= 1280 + 720:
        return "High-definition"
    # Anything that failed both comparisons ends up here; that includes a
    # NaN sum, for which every comparison is False.
    return "Ultra-high-definition"

In [None]:
# Label every row with a resolution category derived from its width and height.
df_pictures['picture_resolution'] = df_pictures.apply(
    lambda row: catagorizer(row['width'], row['height']),
    axis='columns',
)

In [None]:
# Preview the DataFrame to check the new 'picture_resolution' column.
df_pictures.head()


## Dropping height and width

In [None]:
# The raw dimensions are now summarised by 'picture_resolution'; remove them in place.
df_pictures.drop(columns=['width', 'height'], inplace=True)
df_pictures.head()

## Saving csv file

In [None]:
# Export the cleaned DataFrame to a temporary CSV file and print its path.
#
# The data is written from df_pictures rather than re-read from the source
# CSV: the resolution column is created in memory by this notebook, so it is
# presumably absent from the file on disk. The temp file is opened in text
# mode because CSV writing needs str, not bytes.
filename = '../data/listing_pictures_0210707.csv'  # source file; not modified by this cell
fieldnames = ['listing_id', 'added', 'activated', 'description', 'start_pic_summer', 'start_pic_winter', 'Picture_resolution']

# The DataFrame column is named 'picture_resolution'; the exported header
# uses the capitalised name declared in fieldnames.
export = df_pictures.rename(columns={'picture_resolution': 'Picture_resolution'})

# delete=False keeps the file on disk after the handle is closed;
# newline='' leaves line endings under the CSV writer's control.
temp_file = NamedTemporaryFile(
    mode='w', newline='', encoding='utf-8', suffix='.csv', delete=False
)
with temp_file:
    # Raises KeyError if any column listed in fieldnames is missing.
    export.to_csv(temp_file, columns=fieldnames, index=False)
print(temp_file.name)
# NOTE(review): the export stays in the temp location. Replacing the source
# file (e.g. shutil.move(temp_file.name, filename)) would drop the raw
# width/height/format columns from disk, so it is deliberately not done here.