In [1]:
import numpy as np
from numpy import array 
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import pydotplus
import json

In [2]:
from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
from cassandracsv import CassandraCsv

file_path= os.path.join(os.getcwd(),'concrete_strength-token.json')
with open(file_path) as f:
    file= json.load(f)
    id= file["clientId"]
    secret= file["secret"]

cloud_config= {
        'secure_connect_bundle': os.path.join(os.getcwd(), 'secure-connect-concrete-strength.zip')
                    }
auth_provider = PlainTextAuthProvider(id, secret)
cluster = Cluster(cloud=cloud_config, auth_provider=auth_provider)
session= cluster.connect()

row = session.execute("select release_version from system.local").one()
if row:
  print(row[0])
else:
  print("An error occurred.")

4.0.0.6816


In [3]:
session= cluster.connect('concrete_strength')

In [4]:
df = session.execute("SELECT * FROM concrete_strength_csv;")
df = pd.DataFrame([d for d in df])
df= df.astype(float)
df.head()

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,concrete_compressive_strength
0,212.0,0.0,124.8,159.0,7.8,1085.4,799.5,3.0,19.52
1,212.0,0.0,124.8,159.0,7.8,1085.4,799.5,14.0,31.35
2,212.0,0.0,124.8,159.0,7.8,1085.4,799.5,28.0,38.5
3,212.0,0.0,124.8,159.0,7.8,1085.4,799.5,56.0,45.08
4,212.0,0.0,124.8,159.0,7.8,1085.4,799.5,100.0,47.82


## FE

In [5]:
# removing outliers
from scipy import stats
df[(np.abs(stats.zscore(df)) < 3).all(axis=1)]

Unnamed: 0,cement,blast_furnace_slag,fly_ash,water,superplasticizer,coarse_aggregate,fine_aggregate,age,concrete_compressive_strength
0,212.0,0.0,124.8,159.0,7.8,1085.4,799.5,3.0,19.52
1,212.0,0.0,124.8,159.0,7.8,1085.4,799.5,14.0,31.35
2,212.0,0.0,124.8,159.0,7.8,1085.4,799.5,28.0,38.50
3,212.0,0.0,124.8,159.0,7.8,1085.4,799.5,56.0,45.08
4,212.0,0.0,124.8,159.0,7.8,1085.4,799.5,100.0,47.82
...,...,...,...,...,...,...,...,...,...
999,322.0,0.0,0.0,203.0,0.0,974.0,800.0,180.0,29.59
1000,322.0,0.0,116.0,196.0,10.0,818.0,813.0,28.0,31.18
1001,322.0,149.0,0.0,186.0,8.0,951.0,709.0,28.0,52.42
1002,355.0,19.0,97.0,145.0,12.3,967.0,871.0,28.0,55.45


# train_test_split

In [6]:
from sklearn.model_selection import train_test_split
## Independent and dependent features
X = df.drop(['concrete_compressive_strength'], axis=1)  
y = df["concrete_compressive_strength"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

### make the data more gaussian distribution

In [7]:
from sklearn.preprocessing import PowerTransformer
pt=PowerTransformer()
X_train_transformed_arr=pt.fit_transform(X_train)
X_test_transformed_arr=pt.transform(X_test)

In [8]:
X_train_transformed=pd.DataFrame(X_train_transformed_arr,columns=X_train.columns)
X_test_transformed=pd.DataFrame(X_test_transformed_arr, columns=X_test.columns)

In [None]:
#visualising their QQPlots
for col in X.columns:
    plt.figure(figsize=(16,5))
    plt.subplot(2,2,1)
    stats.probplot(x=X_train[col],dist='norm',plot=plt)
    plt.title(col)
    plt.subplot(2,2,2)
    stats.probplot(x=X_train_transformed[col],dist='norm',plot=plt)
    plt.title(label=col+' After Transformation')
    plt.subplot(2,2,3)
    sns.kdeplot(x=X[col])
    plt.subplot(2,2,4)
    sns.kdeplot(x=X_train_transformed[col])
    plt.show()

In [10]:
X_train.skew()

cement                0.529870
blast_furnace_slag    0.879511
fly_ash               0.459894
water                 0.048939
superplasticizer      0.935382
coarse_aggregate     -0.085194
fine_aggregate       -0.308547
age                   3.372279
dtype: float64

In [11]:
X_train_transformed.skew()

cement               -0.015202
blast_furnace_slag    0.027731
fly_ash               0.136406
water                 0.006059
superplasticizer     -0.165540
coarse_aggregate     -0.021379
fine_aggregate       -0.020941
age                  -0.000368
dtype: float64

In [12]:
X_test.skew()

cement                0.693154
blast_furnace_slag    0.758093
fly_ash               0.647808
water                -0.037637
superplasticizer      1.143953
coarse_aggregate     -0.021656
fine_aggregate       -0.058306
age                   2.810204
dtype: float64

In [13]:
X_test_transformed.skew()

cement                0.042606
blast_furnace_slag   -0.106610
fly_ash               0.318083
water                -0.077176
superplasticizer      0.066254
coarse_aggregate      0.039382
fine_aggregate        0.292078
age                   0.102223
dtype: float64

# Selecting the best features for our model:
##### using SelectKBest method with score_func as chi2

In [14]:
from sklearn.feature_selection import SelectKBest, f_regression

In [15]:
select = SelectKBest(score_func=f_regression, k=7)
fs= select.fit_transform(X_train, y_train) # fitting & transorming train data
print("After selecting best 7 features:", fs.shape)

After selecting best 7 features: (803, 7)


In [16]:
filter = select.get_support()
features = array(X_train.columns)
 
print("All features:")
print(features)
print("                               ") 
print("Selected best 7:")
print(features[filter])

All features:
['cement' 'blast_furnace_slag' 'fly_ash' 'water' 'superplasticizer'
 'coarse_aggregate' 'fine_aggregate' 'age']
                               
Selected best 7:
['cement' 'blast_furnace_slag' 'water' 'superplasticizer'
 'coarse_aggregate' 'fine_aggregate' 'age']


We've selected 7 best features in x data. To identify the selected features we use get_support() function and filter out them from the features name list.  The fs object contains selected x data. 

In [17]:
X_train_new= X_train[features[filter]] # creating new X_train with selected features

In [18]:
X_train_new.columns

Index(['cement', 'blast_furnace_slag', 'water', 'superplasticizer',
       'coarse_aggregate', 'fine_aggregate', 'age'],
      dtype='object')

In [19]:
X_test_new= X_test[features[filter]] # creating new X_test with selected features

In [20]:
# Define which columns should be ordinal-encoded and which should be scaled
# categorical_cols = X.select_dtypes(include='object').columns
numerical_cols = X.select_dtypes(exclude='object').columns  # all our columns are numerical
# select numerical and catagorical datas

In [21]:
from sklearn.impute import SimpleImputer ## HAndling Missing Values
# if no outlyers use mean, if outlyers are there use median or mode
# for catagorical features use most frequent value
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import LabelEncoder # Label Encoding i.e., converting nominal catagorical features to numerical features(Feature Engineering)
# handling missing values-->feature scaling-->ordinal encoding....data should be handled in this order. This is what pipelining is all about
## pipelines: pipeline is combining multiple steps one after the other
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer # to group the pipelines together

In [22]:
## Numerical Pipeline
num_pipeline=Pipeline(
    steps=[
    ('imputer',SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())

    ]

)


# combine numerical and catagorical pipeline
preprocessor=ColumnTransformer([
('num_pipeline',num_pipeline,numerical_cols)
])


In [42]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train_transformed),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test_transformed),columns=preprocessor.get_feature_names_out())
# fit_transform training data, only transform for test data & convert to dataframe

In [43]:
filter = select.get_support()
features = array(X_train.columns)
X_train= X_train[features[filter]] # creating new X_train with selected features
X_test= X_test[features[filter]] # creating new X_test with selected features

In [44]:
X_train.head()

Unnamed: 0,num_pipeline__cement,num_pipeline__blast_furnace_slag,num_pipeline__water,num_pipeline__superplasticizer,num_pipeline__coarse_aggregate,num_pipeline__fine_aggregate,num_pipeline__age
0,-0.497057,0.65746,-1.945527,1.004294,1.363547,0.210166,0.719316
1,-0.957711,1.292025,0.175127,-1.229206,0.512907,-0.759652,-1.082654
2,-0.674994,1.017828,0.175127,-1.229206,1.370343,-0.215857,0.097287
3,-1.142154,0.5553,-1.111245,0.897231,1.432922,0.255916,1.247468
4,-0.103543,-1.027014,0.745621,-1.229206,-0.07198,1.16941,1.787541


In [45]:
from sklearn.linear_model import LinearRegression, Lasso,Ridge,ElasticNet   
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error     # parameters

In [46]:
lr=LinearRegression()
las=Lasso()
rid=Ridge()
eln=ElasticNet()
dtr=DecisionTreeRegressor()
svr=SVR()
rfr=RandomForestRegressor()
gbr=GradientBoostingRegressor()
abr=AdaBoostRegressor()

model_dict={'lr':lr, 'las':las, 'rid':rid, 'eln': eln, 'dtr':dtr,'svr':svr,'rfr':rfr,'gbr':gbr,'abr':abr}

In [47]:
values=[]
names=[]
def model_train_output(name,model,X_train,X_test,y_train,y_test):
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    r2_value=r2_score(y_test,y_pred)
    names.append(name)
    values.append(r2_value)

    return names,values

In [48]:
for name, model in  model_dict.items():
    names,r2_score_val=model_train_output(name,model,X_train_transformed,X_test_transformed,y_train,y_test)

In [49]:
new_df=pd.DataFrame(names,columns=['Model'])
new_df['r2_score']=r2_score_val
new_df

Unnamed: 0,Model,r2_score
0,lr,0.834598
1,las,0.818574
2,rid,0.834638
3,eln,0.714997
4,dtr,0.887036
5,svr,0.792358
6,rfr,0.926421
7,gbr,0.908118
8,abr,0.800976


In [50]:
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square
    # function to find all the error and fitness parameters to evaluate the models

In [51]:
## Train multiple models

models={
    'lr': LinearRegression(),
    'las': Lasso(),
    'rid': Ridge(),
    'eln': ElasticNet(),
    'dtr': DecisionTreeRegressor(),
    'svr': SVR(),
    'rfr': RandomForestRegressor(),
    'gbr': GradientBoostingRegressor(),
    'abr': AdaBoostRegressor()
}
trained_model_list=[]
model_list=[]
r2_list=[]

for i in range(len(list(models))):
    model=list(models.values())[i]
    model.fit(X_train,y_train)

    #Make Predictions
    y_pred=model.predict(X_test)

    mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print('Model Training Performance')
    print("RMSE:",rmse)
    print("MAE:",mae)
    print("R2 score",r2_square*100)

    r2_list.append(r2_square)
    
    print('='*35)
    print('\n')


lr
Model Training Performance
RMSE: 6.991945654105633
MAE: 5.498283432134383
R2 score 83.08751358268431


las
Model Training Performance
RMSE: 7.2417188903157035
MAE: 5.736978056185568
R2 score 81.85760169302858


rid
Model Training Performance
RMSE: 6.989016990540932
MAE: 5.496662976313408
R2 score 83.10167862682911


eln
Model Training Performance
RMSE: 9.07125195167231
MAE: 7.584938012767427
R2 score 71.5327292559423


dtr
Model Training Performance
RMSE: 5.586214920615164
MAE: 3.754203980099502
R2 score 89.20440325495407


svr
Model Training Performance
RMSE: 7.847816270265189
MAE: 5.731265718094746
R2 score 78.69365148135607


rfr
Model Training Performance
RMSE: 4.556158740596278
MAE: 3.3112418046671404
R2 score 92.8186008407593


gbr
Model Training Performance
RMSE: 5.091968221806209
MAE: 3.714275410688825
R2 score 91.03020036455779


abr
Model Training Performance
RMSE: 7.69075438592126
MAE: 6.375786258069195
R2 score 79.53794458771476




In [52]:
model_list

['lr', 'las', 'rid', 'eln', 'dtr', 'svr', 'rfr', 'gbr', 'abr']