# Project: Concrete Strength Prediction

#### Import packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.core.interactiveshell import InteractiveShell
from mpl_toolkits.mplot3d import Axes3D  # noqa: F401 -- importing registers the '3d' projection

# The `inline` backend would normally be used; the `notebook` backend is
# selected here because of the 3D charts in the bivariate analysis section.
%matplotlib notebook

# Show the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

# A value of 0 turns off matplotlib's "too many open figures" warning;
# the helper functions below open a new figure for every plot.
plt.rc('figure', max_open_warning=0)

#### Set package options

In [2]:
# Seaborn theme: dark grid background with matplotlib colour shorthands
# ('b', 'g', 'r', ...) remapped to the seaborn palette.
sns.set(style='darkgrid', color_codes=True)

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Name of the colour palette kept available for later plots.
palette = 'Set2'

#### Import dataset: one copy for raw data, one for edited/imputed data

In [3]:
# Read the CSV once. `raw_data` is kept untouched as the reference copy.
raw_data = pd.read_csv('concrete.csv')

# Independent working copy for edits/imputation. Copying the frame already in
# memory avoids a second disk read and guarantees both frames start from the
# same snapshot of the file.
data = raw_data.copy()

#### Methods for use elsewhere

In [4]:
def dist(col):
    """Plot the distribution of ``col`` (density histogram plus KDE) on a new figure.

    Parameters
    ----------
    col : array-like
        Numeric values to plot; the notebook passes a single DataFrame column.
    """
    plt.figure()
    if hasattr(sns, 'histplot'):
        # `distplot` is deprecated since seaborn 0.11. A density-normalised
        # `histplot` with a KDE overlay is the documented replacement.
        sns.histplot(col, kde=True, stat='density', kde_kws={'cut': 3})
    else:
        # Older seaborn releases only ship the legacy function.
        sns.distplot(col)
    
def hist(col):
    """Draw a histogram of ``col`` on a new figure, marking its mean and median.

    The mean is drawn as a yellow vertical line and the median as a green one;
    both are named in the legend.
    """
    plt.figure()
    plt.hist(col)

    # (x position, line colour, legend label) for each reference line.
    markers = (
        (col.mean(), 'y', 'Mean'),
        (col.median(), 'g', 'Median'),
    )
    for position, colour, name in markers:
        plt.axvline(position, color=colour, linewidth=2, label=name)

    plt.legend();
    
def box(col):
    """Draw a horizontal box plot of ``col`` on a new figure.

    Parameters
    ----------
    col : array-like
        Numeric values to plot; the notebook passes a single DataFrame column.
    """
    plt.figure()
    # Pass the values by keyword. Newer seaborn releases treat the first
    # positional argument as `data` rather than `x`, which would silently
    # change how the plot is oriented.
    sns.boxplot(x=col)
    
def print_summary(col):
    """Show the standard univariate summary for ``col``.

    Draws the distribution plot, the histogram and the box plot (each on its
    own figure), then prints the ``describe()`` statistics followed by the
    number of distinct values.
    """
    # Plots first, in the same order as the helpers are defined.
    for draw in (dist, hist, box):
        draw(col)

    stats = col.describe()
    distinct = col.nunique()

    print(stats)
    print('')
    print('Unique values: ' + str(distinct))
    
def marginal_boxplot_margins(a, vertical=False, **kws):
    """Draw a box plot of ``a`` for use on a marginal axis.

    When ``vertical`` is true the values go on the y-axis with the 'Accent'
    palette; otherwise they go on the x-axis with the reversed palette.
    Any extra keyword arguments are forwarded to ``sns.boxplot``.
    """
    if vertical:
        axis_name, colours = 'y', 'Accent'
    else:
        axis_name, colours = 'x', 'Accent_r'

    sns.boxplot(palette=colours, **{axis_name: a}, **kws)

def marginal_boxplot(xcol, ycol, data=None):
    """Plot ``ycol`` against ``xcol`` with a box plot on each margin.

    The joint area is drawn with ``sns.regplot`` (semi-transparent scatter
    points plus a LOWESS trend line); the margins are drawn with
    ``marginal_boxplot_margins``.

    Parameters
    ----------
    xcol, ycol : str
        Names of the columns to put on the x- and y-axis.
    data : pandas.DataFrame, optional
        Frame that holds the columns. Defaults to the notebook-level
        ``raw_data`` so existing two-argument calls keep working.
    """
    if data is None:
        data = raw_data

    g = sns.JointGrid(data=data, x=xcol, y=ycol)
    g.plot_joint(sns.regplot, lowess=True, truncate=False, scatter_kws={'alpha': .2})
    g.plot_marginals(marginal_boxplot_margins)

## Generic data analysis

In [5]:
raw_data.head(10)

Unnamed: 0,cement,slag,ash,water,superplastic,coarseagg,fineagg,age,strength
0,141.3,212.0,0.0,203.5,0.0,971.8,748.5,28,29.89
1,168.9,42.2,124.3,158.3,10.8,1080.8,796.2,14,23.51
2,250.0,0.0,95.7,187.4,5.5,956.9,861.2,28,29.22
3,266.0,114.0,0.0,228.0,0.0,932.0,670.0,28,45.85
4,154.8,183.4,0.0,193.3,9.1,1047.4,696.7,28,18.29
5,255.0,0.0,0.0,192.0,0.0,889.8,945.0,90,21.86
6,166.8,250.2,0.0,203.5,0.0,975.6,692.6,7,15.75
7,251.4,0.0,118.3,188.5,6.4,1028.4,757.7,56,36.64
8,296.0,0.0,0.0,192.0,0.0,1085.0,765.0,28,21.65
9,155.0,184.0,143.0,194.0,9.0,880.0,699.0,28,28.99


In [6]:
raw_data.shape

(1030, 9)

In [7]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1030 entries, 0 to 1029
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   cement        1030 non-null   float64
 1   slag          1030 non-null   float64
 2   ash           1030 non-null   float64
 3   water         1030 non-null   float64
 4   superplastic  1030 non-null   float64
 5   coarseagg     1030 non-null   float64
 6   fineagg       1030 non-null   float64
 7   age           1030 non-null   int64  
 8   strength      1030 non-null   float64
dtypes: float64(8), int64(1)
memory usage: 72.5 KB


In [8]:
raw_data.describe()

Unnamed: 0,cement,slag,ash,water,superplastic,coarseagg,fineagg,age,strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


## Univariate analysis

### Cement (cement)

In [9]:
print_summary(raw_data['cement'])

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

count    1030.000000
mean      281.167864
std       104.506364
min       102.000000
25%       192.375000
50%       272.900000
75%       350.000000
max       540.000000
Name: cement, dtype: float64

Unique values: 278


Distribution of `cement` has a very slight right-hand skew.

### Blast Furnace Slag (slag)

In [10]:
print_summary(raw_data['slag'])

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

count    1030.000000
mean       73.895825
std        86.279342
min         0.000000
25%         0.000000
50%        22.000000
75%       142.950000
max       359.400000
Name: slag, dtype: float64

Unique values: 185


Distribution of `slag` has a very large right hand skew.

### Fly Ash (ash)

In [11]:
print_summary(raw_data['ash'])

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

count    1030.000000
mean       54.188350
std        63.997004
min         0.000000
25%         0.000000
50%         0.000000
75%       118.300000
max       200.100000
Name: ash, dtype: float64

Unique values: 156


Distribution of `ash` has a very strong right-hand skew; however, this is largely because most samples have no ash content.

***What does it look like if we only look at samples containing some ash?***

In [12]:
# Keep only the samples that contain some ash, selected in one `.loc` step.
print_summary(raw_data.loc[raw_data['ash'] > 0, 'ash'])

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

count    464.000000
mean     120.288793
std       33.675470
min       24.500000
25%       97.850000
50%      121.400000
75%      141.000000
max      200.100000
Name: ash, dtype: float64

Unique values: 155


If we look at the distribution of `ash` in samples with non-zero levels of `ash`, we see a moderate left-hand skew.

### Water (water)

In [13]:
print_summary(raw_data['water'])

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

count    1030.000000
mean      181.567282
std        21.354219
min       121.800000
25%       164.900000
50%       185.000000
75%       192.000000
max       247.000000
Name: water, dtype: float64

Unique values: 195


Distribution of `water` has a strong left-hand skew but also several large outliers.

### Superplasticizer (superplastic)

In [14]:
print_summary(raw_data['superplastic'])

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

count    1030.000000
mean        6.204660
std         5.973841
min         0.000000
25%         0.000000
50%         6.400000
75%        10.200000
max        32.200000
Name: superplastic, dtype: float64

Unique values: 111


Distribution of `superplastic` has a strong right skew with a few large outliers.

### Coarse Aggregate (coarseagg)

In [15]:
print_summary(raw_data['coarseagg'])

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

count    1030.000000
mean      972.918932
std        77.753954
min       801.000000
25%       932.000000
50%       968.000000
75%      1029.400000
max      1145.000000
Name: coarseagg, dtype: float64

Unique values: 284


Distribution of `coarseagg` has a slight right skew and no outliers.

### Fine Aggregate (fineagg)

In [16]:
print_summary(raw_data['fineagg'])

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

count    1030.000000
mean      773.580485
std        80.175980
min       594.000000
25%       730.950000
50%       779.500000
75%       824.000000
max       992.600000
Name: fineagg, dtype: float64

Unique values: 302


Distribution of `fineagg` does not have a noticeable skew.

### Age (age)

In [17]:
print_summary(raw_data['age'])

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

count    1030.000000
mean       45.662136
std        63.169912
min         1.000000
25%         7.000000
50%        28.000000
75%        56.000000
max       365.000000
Name: age, dtype: float64

Unique values: 14


Distribution of `age` has a right skew with several large outliers for particularly older samples.

### Compressive Strength (strength)

In [18]:
print_summary(raw_data['strength'])

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

count    1030.000000
mean       35.817961
std        16.705742
min         2.330000
25%        23.710000
50%        34.445000
75%        46.135000
max        82.600000
Name: strength, dtype: float64

Unique values: 845


Distribution of `strength` has a slight right skew with a few large outliers.

## Bivariate analyses

In [19]:
# Pairwise overview of all variables: scatter plots above the diagonal,
# KDE curves on the diagonal and 2-D KDE contours below it.
g = (
    sns.PairGrid(raw_data)
    .map_upper(sns.scatterplot)
    .map_diag(sns.kdeplot)
    .map_lower(sns.kdeplot)
);

<IPython.core.display.Javascript object>

In [33]:
# Correlation matrix of all variables, annotated with the coefficients.
# Correlations are signed, so a diverging colormap centred on 0 is used:
# negative values are blue, positive values red, and values near 0 are pale.
# A sequential colormap would make strong negative correlations look weak.
plt.figure()
sns.heatmap(raw_data.corr(), annot=True, cmap='RdBu_r', vmin=-1, vmax=1, center=0, fmt='.3f');

<IPython.core.display.Javascript object>

The strongest positive correlations seem to be between (1) strength and cement content, (2) ash content and superplastic content, and (3) superplastic content and strength.

In [21]:
marginal_boxplot('strength', 'cement')

<IPython.core.display.Javascript object>

In [22]:
marginal_boxplot('ash', 'superplastic')

<IPython.core.display.Javascript object>

In [23]:
marginal_boxplot('strength', 'superplastic')

<IPython.core.display.Javascript object>

The strongest negative correlations seem to be between (1) superplastic content and water content, (2) fine aggregate content and water content, and (3) cement content and ash content.

In [24]:
marginal_boxplot('superplastic', 'water')

<IPython.core.display.Javascript object>

In [25]:
marginal_boxplot('fineagg', 'water')

<IPython.core.display.Javascript object>

In [26]:
marginal_boxplot('cement', 'ash')

<IPython.core.display.Javascript object>

The strongest positive correlations are centered around cement content, superplastics content, and strength. Another is ash, which is positively correlated with superplastics but *negatively* correlated with strength. So what does the relationship between cement, superplastics, and ash look like?

For this we'll utilize a 3-dimensional scatterplot.

In [27]:
# 3D scatter of cement vs. superplasticizer vs. fly ash content.
# The `mpl_toolkits.mplot3d` import that enables the '3d' projection sits
# with the other imports at the top of the notebook, not in this cell.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

x = raw_data['cement']
y = raw_data['superplastic']
z = raw_data['ash']

ax.scatter(x, y, z, c='r', marker='o')

ax.set_xlabel('Cement Content')
ax.set_ylabel('Superplastics Content')
ax.set_zlabel('Ash Content')

plt.show();

<IPython.core.display.Javascript object>

In [34]:
# 3D scatter of cement vs. superplasticizer content against strength.
# All three axes are selected in this cell so that it does not depend on
# variables left behind by an earlier cell.
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

x = raw_data['cement']
y = raw_data['superplastic']
z = raw_data['strength']

ax.scatter(x, y, z, c='r', marker='o')

ax.set_xlabel('Cement Content')
ax.set_ylabel('Superplastics Content')
ax.set_zlabel('Strength')

plt.show();

<IPython.core.display.Javascript object>