# Boat Sales Analysis2

### This script contains the following:

#### 1. Importing Visualization Libraries and Data
#### 2. Data Manipulation and Creating New Columns for Analysis

## 1. Importing Libraries and Data

In [1]:
import quandl
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import statsmodels.api as sm 
import os
import warnings 

# Suppress every warning category for the rest of the session.
# NOTE(review): this also hides pandas deprecation warnings raised by later cells.
warnings.simplefilter("ignore")
# Use the 'fivethirtyeight' style sheet for all matplotlib figures.
plt.style.use('fivethirtyeight')

In [2]:
# Root folder of the boat project; every data file below is resolved against it.
# Set the BOAT_PROJECT_PATH environment variable to run the notebook on another
# machine; when it is unset, the original author's local folder is used.
path = os.environ.get('BOAT_PROJECT_PATH', r'/Users/mentaykoshzhanova/Desktop/boat project')

In [3]:
# Load the prepared boat listings from the project's 'Prepared Data' folder.
# index_col=False tells pandas not to use the first CSV column as the index.
clean_boat_file = os.path.join(path, '02 Data', 'Prepared Data', 'clean_boat.csv')
df = pd.read_csv(clean_boat_file, index_col=False)

In [4]:
# Preview the first rows to confirm the file loaded with the expected columns
df.head()

Unnamed: 0.1,Unnamed: 0,Boat Type,Manufacturer,Condition,Year Built,Length,Width,Material,Number of views last 7 days,Price in usd,Country
0,0,Motor Yacht,Rigiflex power boats,new boat from stock,2017,4.0,1.9,GRP,226,3704.07,Switzerland
1,1,Center console boat,Terhi power boats,new boat from stock,2020,4.0,1.5,Thermoplastic,75,3734.3,Germany
2,2,Sport Boat,Marine power boats,new boat from stock,0,3.69,1.42,Aluminium,124,4184.7,Switzerland
3,3,Sport Boat,Pioner power boats,new boat from stock,2020,3.0,1.0,GRP,64,3626.0,Denmark
4,4,Fishing Boat,Linder power boats,new boat from stock,2019,3.55,1.46,Aluminium,58,3636.93,Germany


In [5]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(7420, 11)

In [6]:
# Summary statistics of the numeric columns (count, mean, std, min, quartiles, max)
df.describe()

Unnamed: 0.1,Unnamed: 0,Year Built,Length,Width,Number of views last 7 days,Price in usd
count,7420.0,7420.0,7420.0,7420.0,7420.0,7420.0
mean,4966.000809,1884.982884,10.718402,3.367708,158.178302,247087.9
std,2905.604118,480.278326,5.044519,1.124015,162.1983,887384.9
min,0.0,0.0,1.7,0.74,15.0,3531.0
25%,2356.75,1998.0,7.13,2.51,72.0,44940.0
50%,5100.0,2008.0,9.6,3.1,112.0,95230.0
75%,7499.25,2018.0,13.11,4.1,185.0,212930.0
max,9887.0,2021.0,56.0,25.16,3263.0,33170000.0


In [7]:
# Count the listings whose 'Year Built' is 0
# (presumably a placeholder for an unknown build year - TODO confirm).
# Summing the boolean mask counts matching rows directly; the previous
# `.count()[0]` used an integer key as a position on a label-indexed Series,
# which pandas 2.x deprecates, and counted non-null cells of the first column.
(df['Year Built'] == 0).sum()

452

## 2. Data Manipulation

In [8]:
# Work on an independent (deep) copy so the loaded frame `df` stays untouched
df_clean = df.copy(deep=True)

In [9]:
# Inspect the price distribution; its min, quartiles and max are used below
# as the bin edges for the price levels (low, medium, high and very high)
df_clean['Price in usd'].describe()


count    7.420000e+03
mean     2.470879e+05
std      8.873849e+05
min      3.531000e+03
25%      4.494000e+04
50%      9.523000e+04
75%      2.129300e+05
max      3.317000e+07
Name: Price in usd, dtype: float64

In [10]:
# Splitting into columns
min = df_clean['Price in usd'].min()
first_q = df_clean['Price in usd'].quantile(0.25)
third_q = df_clean['Price in usd'].quantile(0.75)
median = df_clean['Price in usd'].median()
max = df_clean['Price in usd'].max()

In [11]:
# Creating flags
bins = [min, first_q, median, third_q, max]
labels = ['Low', 'Medium', 'High', 'Very High']
df_clean['Price Level'] = pd.cut(df_clean['Price in usd'], bins, labels = labels, include_lowest=True, right=True)

In [12]:
# Check the new 'Price Level' column: number of listings in each level
df_clean['Price Level'].value_counts()

Price Level
Medium       1889
Low          1860
Very High    1853
High         1818
Name: count, dtype: int64

In [13]:
# Inspect the distribution of weekly views; its min, quartiles and max are used
# below as the bin edges for the view levels (low, medium, high and very high)
df_clean['Number of views last 7 days'].describe()

count    7420.000000
mean      158.178302
std       162.198300
min        15.000000
25%        72.000000
50%       112.000000
75%       185.000000
max      3263.000000
Name: Number of views last 7 days, dtype: float64

In [14]:
# Splitting into columns
min = df_clean['Number of views last 7 days'].min()
first_q = df_clean['Number of views last 7 days'].quantile(0.25)
third_q = df_clean['Number of views last 7 days'].quantile(0.75)
median = df_clean['Number of views last 7 days'].median()
max = df_clean['Number of views last 7 days'].max()

In [15]:
# Creating flags
bins2 = [min, first_q, median, third_q, max]
labels2 = ['Low', 'Medium', 'High', 'Very High']
df_clean['Views Level'] = pd.cut(df_clean['Number of views last 7 days'], bins2, labels = labels2, include_lowest=True, right=True)

In [16]:
# Check the new 'Views Level' column: number of listings in each level
df_clean['Views Level'].value_counts()

Views Level
Low          1883
High         1853
Very High    1850
Medium       1834
Name: count, dtype: int64

In [17]:
# Average number of views in the last 7 days for each 'Condition' value,
# ordered from most to least viewed. Computed on the loaded frame `df`;
# every group is kept (no top-N cut-off is applied).
views = (
    df.groupby('Condition')['Number of views last 7 days']
    .mean()
    .sort_values(ascending=False)
)
views

Condition
new boat from stock,Hybrid      384.000000
Used boat,Unleaded              247.985477
Display Model,Diesel            234.263158
Used boat,Propane               224.000000
Display Model                   223.666667
new boat on order,Diesel        218.230769
Used boat,Gas                   202.222222
Used boat,Electric              196.375000
new boat on order               190.295455
Display Model,Unleaded          188.459459
Display Model,Electric          185.000000
Unleaded                        179.266667
Electric                        167.000000
new boat from stock,Electric    160.941176
new boat from stock             141.628676
Used boat,Diesel                137.286252
Used boat                       130.708437
new boat on order,Unleaded      130.195804
Diesel                          128.560000
new boat from stock,Gas         126.000000
new boat from stock,Unleaded    124.588639
new boat from stock,Diesel      122.613546
Name: Number of views last 7 days, dtype: float64

In [18]:
# Preview df_clean, which now carries the 'Price Level' and 'Views Level' columns
df_clean.head()

Unnamed: 0.1,Unnamed: 0,Boat Type,Manufacturer,Condition,Year Built,Length,Width,Material,Number of views last 7 days,Price in usd,Country,Price Level,Views Level
0,0,Motor Yacht,Rigiflex power boats,new boat from stock,2017,4.0,1.9,GRP,226,3704.07,Switzerland,Low,Very High
1,1,Center console boat,Terhi power boats,new boat from stock,2020,4.0,1.5,Thermoplastic,75,3734.3,Germany,Low,Medium
2,2,Sport Boat,Marine power boats,new boat from stock,0,3.69,1.42,Aluminium,124,4184.7,Switzerland,Low,High
3,3,Sport Boat,Pioner power boats,new boat from stock,2020,3.0,1.0,GRP,64,3626.0,Denmark,Low,Low
4,4,Fishing Boat,Linder power boats,new boat from stock,2019,3.55,1.46,Aluminium,58,3636.93,Germany,Low,Low


In [19]:
# Export the enriched frame to the 'Prepared Data' folder.
# index=False keeps the row index out of the file: a written index comes back
# as an extra 'Unnamed: 0' column on the next read, and the input file already
# carries two such leftover columns ('Unnamed: 0.1', 'Unnamed: 0').
export_file = os.path.join(path, '02 Data', 'Prepared Data', 'clean_boat2.csv')
df_clean.to_csv(export_file, index=False)