# EDA for Prices Dataset

Initial look at data and data cleaning


## Import necessary libraries

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.formula.api as smf
from scipy import stats

In [None]:
# Read the raw prices export into a DataFrame.
prices_csv_path = '../data/prices_20210713.csv'
df_prices = pd.read_csv(prices_csv_path)

In [None]:
df_prices.head()

## Correcting date and time formatting.

In [None]:
# Parse both date columns; values that cannot be parsed become NaT
# (errors='coerce') instead of raising.
for date_col in ('date_from', 'date_to'):
    df_prices[date_col] = pd.to_datetime(df_prices[date_col], errors='coerce')


In [None]:
df_prices.query('price_per_day_from <= 3') 

In [None]:
df_prices.head()

In [None]:
df_prices.describe().round()

In [None]:
# Correlation heatmap of the numeric columns of df_prices.
# The frame is restricted to numeric/bool columns before calling corr():
# since pandas 2.0, DataFrame.corr() no longer drops non-numeric columns
# silently and raises when the frame holds date or text columns.
corr = df_prices.select_dtypes(include=['number', 'bool']).corr()
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,  # pin the colour scale to the full correlation range
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
# Rotate the x tick labels so long column names stay readable.
# The trailing semicolon suppresses the notebook's echo of the return value.
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right');

## Translating "price per day" to "price per week", and vice versa.

In [None]:
# Fill each price from the other where it is missing:
# a missing daily price becomes weekly / 7, a missing weekly price becomes daily * 7.
daily_price = df_prices['price_per_day_from']
weekly_price = df_prices['price_per_week_from']
df_prices['filled_in_price_per_day'] = daily_price.fillna(weekly_price / 7)
df_prices['filled_in_price_per_week'] = weekly_price.fillna(daily_price * 7)
df_prices.head()

## Checking work

In [None]:

df_prices['price_per_week_from'].isna().value_counts()

In [None]:
df_prices['filled_in_price_per_week'].isna().value_counts()

The number of NaN values should be reduced, but some will remain where no price was set in either column.

## Validating whether 999 and 9999 are real values

In [None]:
df_prices['filled_in_price_per_day'].isna().value_counts()

In [None]:
# Correlation heatmap of the numeric columns of df_prices, now including
# the filled-in price columns.
# The frame is restricted to numeric/bool columns before calling corr():
# since pandas 2.0, DataFrame.corr() no longer drops non-numeric columns
# silently and raises when the frame holds date or text columns.
corr = df_prices.select_dtypes(include=['number', 'bool']).corr()
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,  # pin the colour scale to the full correlation range
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
# Rotate the x tick labels so long column names stay readable.
# The trailing semicolon suppresses the notebook's echo of the return value.
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right');

In [None]:
df_prices.query('filled_in_price_per_day == 9999 or filled_in_price_per_day == 999') 

In [None]:
df_prices.query('filled_in_price_per_day == 9999 or filled_in_price_per_day == 999').count()

In [None]:
# Fit an ordinary least squares regression of the raw weekly price on the raw
# daily price (the original columns, not the filled-in ones) and display the
# model summary.
smf.ols(formula='price_per_week_from ~ price_per_day_from', data=df_prices).fit().summary()

As only 21 entries were 999 or 9999, and these values appeared only in the weeks column, we have determined that 999 and 9999 are common placeholder values in this data set.

## Checking that there is not an abundance of placeholder values (e.g. 0 or 1).

In [None]:
# Per-column count of non-null values among rows whose filled-in daily price equals 1234.
# NOTE(review): the section heading mentions 0 or 1, but this query checks
# 1234 — confirm which placeholder value was meant to be inspected here.
df_prices.query('filled_in_price_per_day == 1234').count()

## Replacing all values of 0, 1, 999, and 9999.

In [None]:
# Treat the sentinel prices 0, 1, 999 and 9999 as missing in both filled-in
# price columns. np.nan is used because the np.NaN alias was removed in
# NumPy 2.0; a single scalar replacement covers every listed value.
for price_col in ('filled_in_price_per_day', 'filled_in_price_per_week'):
    df_prices[price_col] = df_prices[price_col].replace([0, 1, 999, 9999], np.nan)

## Searching for nonsense prices

In [None]:
# Flag rows whose daily price is at least as high as the weekly price,
# then show how many rows are flagged.
bad_data = df_prices['filled_in_price_per_day'].ge(df_prices['filled_in_price_per_week'])
bad_data.value_counts()

In [None]:
df_prices.query('filled_in_price_per_day >= 5000')

In [None]:
df_prices.head()

## Deleting bad data

In [None]:
# Collect the index labels of rows whose filled-in daily price is 10 or less...
indexNames = df_prices.index[df_prices['filled_in_price_per_day'].le(10)]
# ...and remove those rows from the DataFrame in place.
df_prices.drop(index=indexNames, inplace=True)

In [None]:
# Collect the index labels of rows whose filled-in daily price is greater than
# or equal to the filled-in weekly price...
day_not_below_week = df_prices['filled_in_price_per_day'].ge(df_prices['filled_in_price_per_week'])
indexNames2 = df_prices.index[day_not_below_week]
# ...and remove those rows from the DataFrame in place.
df_prices.drop(index=indexNames2, inplace=True)

In [None]:
# Collect the index labels of rows whose filled-in weekly price is 1,000,000 or more...
indexNames = df_prices.index[df_prices['filled_in_price_per_week'].ge(1_000_000)]
# ...and remove those rows from the DataFrame in place.
df_prices.drop(index=indexNames, inplace=True)

In [None]:
df_prices.head()

## Checking high values

In [None]:
df_prices.query('filled_in_price_per_day >= 5000')

## Looking at date information

In [None]:
pd.to_datetime(df_prices.date_from)

In [None]:
# Disabled plotting code: wrapped in a bare triple-quoted string, so the cell
# only evaluates a string literal and none of the statements inside run.
"""df_prices['month'] = pd.to_datetime(df_prices['date_from']).dt.to_period('M')
fig, ax = plt.subplots(figsize=(15,5))
sns.lineplot(data=df_prices, x="date_from", y="filled_in_price_per_day")"""

## Checking correlation between weeks and days

In [None]:
# Correlation heatmap of the numeric columns of df_prices after the bad rows
# have been dropped.
# The frame is restricted to numeric/bool columns before calling corr():
# since pandas 2.0, DataFrame.corr() no longer drops non-numeric columns
# silently and raises when the frame holds date or text columns.
corr = df_prices.select_dtypes(include=['number', 'bool']).corr()
ax = sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,  # pin the colour scale to the full correlation range
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True
)
# Rotate the x tick labels so long column names stay readable.
# The trailing semicolon suppresses the notebook's echo of the return value.
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right');

In [None]:
corr

In [None]:
sns.scatterplot(x=df_prices.filled_in_price_per_week, y=df_prices.filled_in_price_per_day)

In [None]:
df_prices.filled_in_price_per_week.max()

## Creating price_catagory column

In [None]:
#df_prices['price_catagory']= df_prices.price_per_day_from

In [None]:
def catagorizer(a, b):
    """Label which of two price values is present.

    Parameters
    ----------
    a : scalar
        The per-day price; counted as absent when it is a missing value.
    b : scalar
        The per-week price; counted as absent when it is a missing value.

    Returns
    -------
    str
        "Both" when neither value is missing, "Day" when only ``a`` is
        present, "Week" when only ``b`` is present, and "No_info" when
        both are missing.
    """
    # pd.isna is used instead of np.isnan so that None, pd.NA and NaT are
    # recognised as missing too; np.isnan raises TypeError on such inputs.
    has_day = not pd.isna(a)
    has_week = not pd.isna(b)

    if has_day and has_week:
        return "Both"
    if has_day:
        return "Day"
    if has_week:
        return "Week"
    return "No_info"


In [None]:

# Label every row by which of the two original price columns is populated.
def _label_price_row(row):
    return catagorizer(row['price_per_day_from'], row['price_per_week_from'])


df_prices['price_catagory'] = df_prices.apply(_label_price_row, axis='columns')

In [None]:
df_prices['price_catagory'].unique()

In [None]:
df_prices['price_catagory'].value_counts()

In [None]:
df_prices[df_prices['price_catagory'] == "No_info"]

## Deleting price_per_day_from and price_per_week_from

In [None]:
# Remove the original daily price column from the DataFrame in place.
df_prices.drop(columns='price_per_day_from', inplace=True)

In [None]:
# Remove the original weekly price column from the DataFrame in place.
df_prices.drop(columns='price_per_week_from', inplace=True)

In [None]:
df_prices.head()