In [1]:
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

## Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'data-448'
bucket = s3.Bucket(bucket_name)

## Defining the file to be read from s3 bucket
file_key = 'Chapter6/Credit_Card.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading the csv file
credit = pd.read_csv(file_content_stream)
credit = credit.drop(columns = ['ID'], axis = 1)
credit.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,20000.0,2,2,1,24,2,2,-1,-1,-2,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,120000.0,2,2,2,26,-1,2,0,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,90000.0,2,2,2,34,0,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,50000.0,2,2,1,37,0,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,50000.0,1,2,1,57,-1,0,-1,0,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [19]:
credit.columns

Index(['LIMIT_BAL', 'SEX', 'EDUCATION', 'MARRIAGE', 'AGE', 'PAY_0', 'PAY_2',
       'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6', 'BILL_AMT1', 'BILL_AMT2',
       'BILL_AMT3', 'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1',
       'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
       'default.payment.next.month'],
      dtype='object')

In [20]:
credit.shape

(30000, 24)

In [21]:
credit['default.payment.next.month'].value_counts() / credit.shape[0]

0    0.7788
1    0.2212
Name: default.payment.next.month, dtype: float64

## Splitting the data

In [22]:
## Defining the input and target variables
X = credit.drop(columns = ['default.payment.next.month'], axis = 1)
Y = credit['default.payment.next.month']

## Splitting the data into train and validation
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size = 0.2, stratify = Y)

## Splitting into validation and test
X_val, X_test, Y_val, Y_test = train_test_split(X_val, Y_val, test_size = 0.5, stratify = Y_val)

## Standardizing the data

In [23]:
## Loading minmax
scaler = MinMaxScaler()

## Transforming to 0-1 the inputs 
X_train = scaler.fit_transform(X_train)
X_val = scaler.fit_transform(X_val)
X_test = scaler.fit_transform(X_test)

## Buiding the base learners

In [29]:
## Decision tree
tree = DecisionTreeClassifier(max_depth = 3).fit(X_train, Y_train)
pred_tree = tree.predict_proba(X_val)[:, 1]

## Nearest-Neighbors
knn = KNeighborsClassifier(n_neighbors = 5).fit(X_train, Y_train)
pred_knn = knn.predict_proba(X_val)[:, 1]

## support vector machine 
svm = SVC(kernel = 'rbf', probability = True).fit(X_train, Y_train)
pred_svm = svm.predict_proba(X_val)[:, 1]

## Creating the new inputs and target

In [30]:
## Creating a data-frame with the base learners predictions
X_preds = pd.DataFrame({'tree': pred_tree, 'knn': pred_knn, 'svm': pred_svm})
X_preds.head()

Unnamed: 0,tree,knn,svm
0,0.11451,0.0,0.160942
1,0.11451,0.0,0.181479
2,0.11451,0.2,0.171468
3,0.215915,0.2,0.178212
4,0.215915,0.4,0.210031


## Building the meta-learner

In [31]:
## building the meta-learner
meta_learner = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_preds, Y_val)

## Predicting on the test datasets with base learners

In [33]:
## Predicting on the test datasets
test_pred_tree = tree.predict_proba(X_test)[:, 1]
test_pred_knn = knn.predict_proba(X_test)[:, 1]
test_pred_svm = svm.predict_proba(X_test)[:, 1]

## Creating a data-frame with base learners 
X_test_preds = pd.DataFrame({'tree': test_pred_tree, 'knn': test_pred_knn, 'svm': test_pred_svm})
X_test_preds.head()

Unnamed: 0,tree,knn,svm
0,0.11451,0.0,0.173227
1,0.11451,0.2,0.191382
2,0.11451,0.0,0.171401
3,0.11451,0.2,0.13862
4,0.11451,0.2,0.186879


In [36]:
## Changing likelihoods to labels
test_pred_tree_label = np.where(test_pred_tree < 0.25, 0, 1) 
test_pred_knn_label = np.where(test_pred_knn < 0.25, 0, 1)
test_pred_svm_label = np.where(test_pred_svm < 0.25, 0, 1)

## Classification report of decision tree
print(classification_report(Y_test, test_pred_tree_label))

## Classification report of knn
print(classification_report(Y_test, test_pred_knn_label))

## Classification report of svm
print(classification_report(Y_test, test_pred_svm_label))

              precision    recall  f1-score   support

           0       0.85      0.90      0.88      2337
           1       0.57      0.46      0.51       663

    accuracy                           0.80      3000
   macro avg       0.71      0.68      0.69      3000
weighted avg       0.79      0.80      0.80      3000

              precision    recall  f1-score   support

           0       0.85      0.78      0.81      2337
           1       0.40      0.52      0.45       663

    accuracy                           0.72      3000
   macro avg       0.63      0.65      0.63      3000
weighted avg       0.75      0.72      0.73      3000

              precision    recall  f1-score   support

           0       0.85      0.92      0.88      2337
           1       0.59      0.42      0.49       663

    accuracy                           0.81      3000
   macro avg       0.72      0.67      0.68      3000
weighted avg       0.79      0.81      0.79      3000



## Predicting on the test with meta-learner

In [37]:
## using the meta-learner to make predictions on test
meta_learner_pred = meta_learner.predict_proba(X_test_preds)[:, 1]

## Changing likelihood to labels
meta_learner_pred_label = np.where(meta_learner_pred < 0.25, 0, 1)

## Classification report of meta-learner
print(classification_report(Y_test, meta_learner_pred_label))

              precision    recall  f1-score   support

           0       0.86      0.88      0.87      2337
           1       0.55      0.49      0.52       663

    accuracy                           0.80      3000
   macro avg       0.70      0.69      0.70      3000
weighted avg       0.79      0.80      0.79      3000



## Homogeneous Ensembles

In [4]:
import boto3
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

## Defining the s3 bucket
s3 = boto3.resource('s3')
bucket_name = 'data-448'
bucket = s3.Bucket(bucket_name)

## Defining the file to be read from s3 bucket
file_key = 'Chapter6/energy_data.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

## Reading the csv file
energy = pd.read_csv(file_content_stream)
energy.head()

Unnamed: 0,date,Appliances,lights,T1,RH_1,T2,RH_2,T3,RH_3,T4,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,1/11/16 17:00,60,30,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,1/11/16 17:10,60,30,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,...,17.066667,45.56,6.48,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,1/11/16 17:20,50,30,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,...,17.0,45.5,6.37,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,1/11/16 17:30,50,40,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.41039,45.41039
4,1/11/16 17:40,60,40,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,...,17.0,45.4,6.13,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097


In [5]:
energy.shape

(19735, 29)

In [6]:
## Defining the input and target variables
X = energy.drop(columns = ['date', 'Appliances', 'lights', 'rv2'], axis = 1)
Y = energy['rv2']

## Splitting the data 
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

## Standardizing the data
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X_train.columns)
X_test = pd.DataFrame(scaler.fit_transform(X_test), columns = X_test.columns)

## Defining the list to store predictions
svm_preds = list()

## Defining the training dataset
train_data = pd.concat([X_train, Y_train], axis = 1)

for i in range(0, 2):
    
    ## Sampling the training dataset
    train_data_loop = train_data.sample(frac = 0.8)
    X_train_loop = train_data_loop.drop(columns = ['rv2'], axis = 1)
    Y_train_loop = train_data_loop['rv2']
    
    ## Building the svm model
    svm_md = SVR(kernel = 'rbf').fit(X_train_loop, Y_train_loop)
    
    ## Storing predictions 
    svm_preds.append(svm_md.predict(X_test))
    
## Putting all the prediction in a data-frame 
svm_preds = pd.DataFrame(svm_preds).T

## Aggregation the predictions
svm_preds['Final_Pred'] = svm_preds.apply(np.mean, axis = 1)
print(svm_preds.head())

## Computing the mse of each model 


TypeError: cannot concatenate object of type '<class 'numpy.ndarray'>'; only Series and DataFrame objs are valid

In [12]:
RF_preds.apply(mean_squared_error, axis = 0)

TypeError: mean_squared_error() missing 1 required positional argument: 'y_pred'

In [7]:
X.head()

Unnamed: 0,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,...,RH_8,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1
0,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,17.166667,55.2,...,48.9,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433
1,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,17.166667,55.2,...,48.863333,17.066667,45.56,6.48,733.6,92.0,6.666667,59.166667,5.2,18.606195
2,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,17.166667,55.09,...,48.73,17.0,45.5,6.37,733.7,92.0,6.333333,55.333333,5.1,28.642668
3,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,17.166667,55.09,...,48.59,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.41039
4,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,17.2,55.09,...,48.59,17.0,45.4,6.13,733.9,92.0,5.666667,47.666667,4.9,10.084097


In [4]:
X_train.head()

Unnamed: 0,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
17873,24.7,32.06,22.628571,31.818571,24.0,33.29,24.29,30.5,24.16,38.29,...,23.06,30.214,9.72,759.05,53.333333,6.166667,40.0,0.6,37.509166,37.509166
13175,21.79,42.7,19.2,44.73,23.426667,40.53,21.79,39.59,20.79,44.225714,...,20.823333,44.7,6.28,751.566667,90.5,1.166667,40.0,4.9,46.024348,46.024348
19069,24.2,47.59,21.79,50.0,26.29,44.56,23.6,45.5,22.89,51.4,...,23.29,49.7,11.1,752.883333,94.166667,3.833333,61.833333,10.2,8.065671,8.065671
242,18.9975,44.3975,18.26,43.663333,19.5,43.7,19.726667,43.933333,18.2,55.79,...,17.0,45.863333,5.0,755.433333,88.0,4.333333,32.666667,3.17,45.518515,45.518515
8424,19.79,36.79,16.89,40.7,20.29,37.4,20.1,35.53,17.76,51.7,...,17.39,39.5,0.6,754.0,87.0,1.0,65.0,-1.4,44.254887,44.254887


In [17]:
Y_train.head()

8738     18.109002
9600      6.333126
16437    17.539477
11869    41.628887
4376      6.133596
Name: rv2, dtype: float64

In [8]:
train_data = pd.concat([X_train, Y_train], axis = 1)
train_data.head()

Unnamed: 0,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
9969,22.7,37.126667,19.89,40.126667,22.323333,36.663333,20.1,37.09,20.633333,40.93,...,19.463333,36.633333,5.7,761.4,91.0,1.5,42.0,4.35,28.057592,28.057592
12713,21.5,42.09,19.1,45.29,23.0,39.7,20.76,39.0,20.39,48.714,...,20.39,41.163333,7.85,752.933333,85.333333,1.833333,27.333333,5.52,23.07663,23.07663
16652,22.7,36.663333,22.696667,36.026667,23.36,35.79,22.6,35.59,19.5,53.03,...,21.304286,41.981429,11.9,755.1,75.666667,1.0,40.0,7.63,13.365055,13.365055
19262,24.0,40.7,22.426667,41.723333,25.0,39.0,23.39,41.29,23.5,69.4,...,22.79,40.723333,11.0,759.533333,78.333333,3.666667,36.333333,7.3,23.40347,23.40347
7398,21.166667,37.4,19.29,37.56,20.89,38.663333,19.79,36.23,18.26,49.5,...,18.2,40.59,2.9,743.9,90.0,6.0,23.0,1.4,13.254269,13.254269


In [6]:
RF_preds

[array([40.72336055,  9.97087157, 46.92140952, ..., 46.92140952,
        34.3999332 , 34.32557328]),
 array([46.91853058, 28.16226463, 46.91853058, ..., 40.74456152,
        15.5275558 , 40.74456152])]

In [9]:
a = pd.DataFrame(RF_preds).T
a.head()

Unnamed: 0,0,1
0,40.723361,46.918531
1,9.970872,28.162265
2,46.92141,46.918531
3,40.723361,28.162265
4,9.324744,9.260379


In [12]:
a.apply(np.mean, axis = 1)

0       43.820946
1       19.066568
2       46.919970
3       34.442813
4        9.292562
          ...    
3942     9.296962
3943    37.590856
3944    43.832986
3945    24.963745
3946    37.535067
Length: 3947, dtype: float64

In [21]:
X.head()

Unnamed: 0,T1,RH_1,T2,RH_2,T3,RH_3,T4,RH_4,T5,RH_5,...,T9,RH_9,T_out,Press_mm_hg,RH_out,Windspeed,Visibility,Tdewpoint,rv1,rv2
0,19.89,47.596667,19.2,44.79,19.79,44.73,19.0,45.566667,17.166667,55.2,...,17.033333,45.53,6.6,733.5,92.0,7.0,63.0,5.3,13.275433,13.275433
1,19.89,46.693333,19.2,44.7225,19.79,44.79,19.0,45.9925,17.166667,55.2,...,17.066667,45.56,6.48,733.6,92.0,6.666667,59.166667,5.2,18.606195,18.606195
2,19.89,46.3,19.2,44.626667,19.79,44.933333,18.926667,45.89,17.166667,55.09,...,17.0,45.5,6.37,733.7,92.0,6.333333,55.333333,5.1,28.642668,28.642668
3,19.89,46.066667,19.2,44.59,19.79,45.0,18.89,45.723333,17.166667,55.09,...,17.0,45.4,6.25,733.8,92.0,6.0,51.5,5.0,45.41039,45.41039
4,19.89,46.333333,19.2,44.53,19.79,45.0,18.89,45.53,17.2,55.09,...,17.0,45.4,6.13,733.9,92.0,5.666667,47.666667,4.9,10.084097,10.084097
