In [10]:
# Core data-handling libraries used throughout the notebook.
import pandas as pd
import numpy as np
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this presumably also hides estimator warnings (e.g. non-convergence) — confirm that is intended.
warnings.filterwarnings("ignore")
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

## Regression

In [11]:
# Read the regression data and discard the 'Unnamed: 0' and 'Date' columns.
dataset = pd.read_csv('Datasets/outlier_handled_target').drop(
    columns=['Unnamed: 0', 'Date']
)

In [12]:
# Preview the first rows of the regression dataset.
dataset.head()

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity,Total_power_consumed
0,1209.176,50.212,341733.055,5180.8,14680.933333
1,2985.47,226.006,345725.32,12375.6,27354.983333
2,2203.826,161.792,347373.64,9247.2,19028.433333
3,1666.194,150.942,348479.01,7094.0,13131.9
4,2225.748,160.998,348923.61,9313.0,20384.8


In [13]:
## Dependent and independent features

# Features: the first four columns (positional slice).
X = dataset.iloc[:, 0:4]
# Target: the Total_power_consumed column.
y = dataset['Total_power_consumed']

In [14]:
# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,Global_active_power,Global_reactive_power,Voltage,Global_intensity
0,1209.176,50.212,341733.055,5180.8
1,2985.47,226.006,345725.32,12375.6
2,2203.826,161.792,347373.64,9247.2
3,1666.194,150.942,348479.01,7094.0
4,2225.748,160.998,348923.61,9313.0


In [15]:
# Preview the first values of the target series.
y.head()

0    14680.933333
1    27354.983333
2    19028.433333
3    13131.900000
4    20384.800000
Name: Total_power_consumed, dtype: float64

In [16]:
## Splitting train and test data

from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [17]:
## Applying standard Scaler
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply
# the same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# NOTE(review): the model cells visible in this notebook fit on X_train / X_test,
# not on these scaled arrays — confirm whether the scaled versions are needed.


### RandomForestRegressor

In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

# Seed the forest so the reported scores are repeatable across runs.
rfr = RandomForestRegressor(random_state=42)
rfr.fit(X_train, y_train)

# Training fit. Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1),
# with n rows and p feature columns.
x_predicted = rfr.predict(X_train)  # model predictions for the training rows
score = r2_score(y_train, x_predicted)
n_rows, n_features = X_train.shape
print(f"Training r2 score {score}")
print(f"Training Adjusted r2 score {1 - (1-score)*(n_rows-1)/(n_rows-n_features-1)}")

# Held-out fit, same two scores on the test split.
y_pred = rfr.predict(X_test)
score = r2_score(y_test, y_pred)
n_rows, n_features = X_test.shape
print(f"Testing r2 score {score}")
print(f"Testing Adjusted r2 score {1 - (1-score)*(n_rows-1)/(n_rows-n_features-1)}")

## MAE MSE AND RMSE

mse = mean_squared_error(y_test, y_pred)  # computed once, reused for RMSE
print(f"Mean squared error {mse}")
print(f"Mean absolute error {mean_absolute_error(y_test,y_pred)}")
print(f"Root Mean squared error {np.sqrt(mse)}")

Training r2 score 0.9742143779154343
Training Adjusted r2 score 0.9741178928936024
Testing r2 score 0.8204401203700327
Testing Adjusted r2 score 0.8184111951764736
Mean squared error 5250267.134886854
Mean absolute error 1710.2045171773436
Root Mean squared error 2291.3461403478204


### Bagging Regressor

In [19]:
from sklearn.ensemble import BaggingRegressor
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

# No base learner is passed, so the library default is used (a decision tree regressor).
# Seeded so the bootstrap samples, and therefore the scores, are repeatable.
bgr = BaggingRegressor(random_state=42)
bgr.fit(X_train, y_train)

# Training fit. Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1),
# with n rows and p feature columns.
x_predicted = bgr.predict(X_train)  # model predictions for the training rows
score = r2_score(y_train, x_predicted)
n_rows, n_features = X_train.shape
print(f"Training r2 score {score}")
print(f"Training Adjusted r2 score {1 - (1-score)*(n_rows-1)/(n_rows-n_features-1)}")

# Held-out fit, same two scores on the test split.
y_pred = bgr.predict(X_test)
score = r2_score(y_test, y_pred)
n_rows, n_features = X_test.shape
print(f"Testing r2 score {score}")
print(f"Testing Adjusted r2 score {1 - (1-score)*(n_rows-1)/(n_rows-n_features-1)}")

## MAE MSE AND RMSE

mse = mean_squared_error(y_test, y_pred)  # computed once, reused for RMSE
print(f"Mean squared error {mse}")
print(f"Mean absolute error {mean_absolute_error(y_test,y_pred)}")
print(f"Root Mean squared error {np.sqrt(mse)}")

Training r2 score 0.9635738512398112
Training Adjusted r2 score 0.9634375513379957
Testing r2 score 0.8084728160876928
Testing Adjusted r2 score 0.8063086671169323
Mean squared error 5600186.86359374
Mean absolute error 1767.3294661095636
Root Mean squared error 2366.471395050813


### ExtraTreesRegressor

In [20]:
from sklearn.ensemble import ExtraTreesRegressor
# Imported here as well so the cell does not rely on an earlier cell's metric imports.
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

# Seed the randomized trees so the reported scores are repeatable across runs.
extr = ExtraTreesRegressor(random_state=42)
extr.fit(X_train, y_train)

# Training fit. Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1),
# with n rows and p feature columns.
x_predicted = extr.predict(X_train)  # model predictions for the training rows
score = r2_score(y_train, x_predicted)
n_rows, n_features = X_train.shape
print(f"Training r2 score {score}")
print(f"Training Adjusted r2 score {1 - (1-score)*(n_rows-1)/(n_rows-n_features-1)}")

# Held-out fit, same two scores on the test split.
y_pred = extr.predict(X_test)
score = r2_score(y_test, y_pred)
n_rows, n_features = X_test.shape
print(f"Testing r2 score {score}")
print(f"Testing Adjusted r2 score {1 - (1-score)*(n_rows-1)/(n_rows-n_features-1)}")

## MAE MSE AND RMSE

mse = mean_squared_error(y_test, y_pred)  # computed once, reused for RMSE
print(f"Mean squared error {mse}")
print(f"Mean absolute error {mean_absolute_error(y_test,y_pred)}")
print(f"Root Mean squared error {np.sqrt(mse)}")

Training r2 score 0.9999993014176962
Training Adjusted r2 score 0.9999992988037306
Testing r2 score 0.8178388220387196
Testing Adjusted r2 score 0.8157805036436769
Mean squared error 5326328.174608386
Mean absolute error 1752.368388115134
Root Mean squared error 2307.883917056572


### VotingRegressor


In [21]:
from sklearn.ensemble import VotingRegressor,RandomForestRegressor,BaggingRegressor,ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

# Base learners for the ensemble. The stochastic ones are seeded so the
# reported scores are repeatable. The extra-trees learner gets its own name
# (extr1, like bgr1) so it does not overwrite the fitted `extr` model from
# the ExtraTreesRegressor cell: VotingRegressor fits clones, so the objects
# created here stay unfitted.
lr = LinearRegression()
rf = RandomForestRegressor(random_state=42)
bgr1 = BaggingRegressor(random_state=42)
extr1 = ExtraTreesRegressor(random_state=42)
dtr = DecisionTreeRegressor(random_state=42)
vr = VotingRegressor([('lr',lr),('rf',rf),('bgr',bgr1),('extr',extr1),('dtr',dtr)])

vr.fit(X_train, y_train)

# Training fit. Adjusted R2 = 1 - (1 - R2) * (n - 1) / (n - p - 1),
# with n rows and p feature columns.
x_predicted = vr.predict(X_train)  # ensemble predictions for the training rows
score = r2_score(y_train, x_predicted)
n_rows, n_features = X_train.shape
print(f"Training r2 score {score}")
print(f"Training Adjusted r2 score {1 - (1-score)*(n_rows-1)/(n_rows-n_features-1)}")

# Held-out fit, same two scores on the test split.
y_pred = vr.predict(X_test)
score = r2_score(y_test, y_pred)
n_rows, n_features = X_test.shape
print(f"Testing r2 score {score}")
print(f"Testing Adjusted r2 score {1 - (1-score)*(n_rows-1)/(n_rows-n_features-1)}")

## MAE MSE AND RMSE

mse = mean_squared_error(y_test, y_pred)  # computed once, reused for RMSE
print(f"Mean squared error {mse}")
print(f"Mean absolute error {mean_absolute_error(y_test,y_pred)}")
print(f"Root Mean squared error {np.sqrt(mse)}")

Training r2 score 0.9785199522796529
Training Adjusted r2 score 0.9784395779196142
Testing r2 score 0.8261929805612122
Testing Adjusted r2 score 0.82422905943761
Mean squared error 5082055.545217754
Mean absolute error 1682.3391269873175
Root Mean squared error 2254.3414881551894


## Classification

In [22]:
# Load the classification data; this rebinds `dataset`, replacing the regression frame.
dataset=pd.read_csv('Datasets/scaled_csv')
# Preview the first rows.
dataset.head()

Unnamed: 0,age,workclass,fnlwgt,education_num,marital_status,occupation,relationship,race,sex,hours_per_week,income,capital_income,native_countryUnited-States,native_countryMexico,native_countryPhilippines,native_countryGermany,native_countryPuerto-Rico,native_countryCanada,native_countryEl-Salvador,native_countryIndia,native_countryCuba,native_countryEngland
0,0.169493,2,-1.205842,13.0,0,1,0,0,0,-0.085137,0,0.401903,1,0,0,0,0,0,0,0,0,0
1,0.887249,6,-1.11967,13.0,1,2,1,0,0,-1.587107,0,0.046414,1,0,0,0,0,0,0,0,0,0
2,0.096429,7,0.405653,9.0,2,3,0,0,0,-0.085137,0,0.046414,1,0,0,0,0,0,0,0,0,0
3,1.06058,7,0.584575,7.0,1,3,1,1,0,-0.085137,0,0.046414,1,0,0,0,0,0,0,0,0,0
4,-0.735021,7,1.460655,13.0,1,4,2,1,1,-0.085137,0,0.046414,0,0,0,0,0,0,0,0,1,0


In [23]:
# Target is the income column; every remaining column is used as a feature.
y = dataset['income']
X = dataset.drop(columns='income')

In [24]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

### RandomForestClassifier

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score

# Seed the forest so the reported metrics are repeatable across runs.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)

# Accuracy on the rows the model was trained on.
x_predicted = rfc.predict(X_train)  # model predictions for the training rows
score = accuracy_score(y_train, x_predicted)
print(f"Training accuracy score {score}")

# Held-out evaluation: accuracy, confusion matrix, precision and recall.
y_pred = rfc.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(f"Testing accuracy score {score}")
conf_mat = confusion_matrix(y_test, y_pred)
print('Confusion matrix \n',conf_mat)
print('Precision : ', precision_score(y_test,y_pred))
print('Recall : ',recall_score(y_test,y_pred))



Training accuracy score 0.9998471135029354
Testing accuracy score 0.8560432056614315
Confusion matrix 
 [[11389   815]
 [ 1504  2401]]
Precision :  0.7465796019900498
Recall :  0.6148527528809219


### Bagging Classifier

In [26]:
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score

# Seed the bagging ensemble so the bootstrap samples, and therefore the
# reported metrics, are repeatable across runs.
bgc = BaggingClassifier(random_state=42)
bgc.fit(X_train, y_train)

# Accuracy on the rows the model was trained on.
x_predicted = bgc.predict(X_train)  # model predictions for the training rows
score = accuracy_score(y_train, x_predicted)
print(f"Training accuracy score {score}")

# Held-out evaluation: accuracy, confusion matrix, precision and recall.
y_pred = bgc.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(f"Testing accuracy score {score}")
conf_mat = confusion_matrix(y_test, y_pred)
print('Confusion matrix \n',conf_mat)
print('Precision : ', precision_score(y_test,y_pred))
print('Recall : ',recall_score(y_test,y_pred))



Training accuracy score 0.988380626223092
Testing accuracy score 0.8487181078899994
Confusion matrix 
 [[11368   836]
 [ 1601  2304]]
Precision :  0.7337579617834394
Recall :  0.5900128040973112


### ExtraTrees Classifier

In [27]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score

# Seed the randomized trees so the reported metrics are repeatable across runs.
etc = ExtraTreesClassifier(random_state=42)
etc.fit(X_train, y_train)

# Accuracy on the rows the model was trained on.
x_predicted = etc.predict(X_train)  # model predictions for the training rows
score = accuracy_score(y_train, x_predicted)
print(f"Training accuracy score {score}")

# Held-out evaluation: accuracy, confusion matrix, precision and recall.
y_pred = etc.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(f"Testing accuracy score {score}")
conf_mat = confusion_matrix(y_test, y_pred)
print('Confusion matrix \n',conf_mat)
print('Precision : ', precision_score(y_test,y_pred))
print('Recall : ',recall_score(y_test,y_pred))


Training accuracy score 0.9998776908023483
Testing accuracy score 0.8383512322304302
Confusion matrix 
 [[11193  1011]
 [ 1593  2312]]
Precision :  0.6957568462232923
Recall :  0.5920614596670934


### VotingClassifier

In [28]:
from sklearn.ensemble import ExtraTreesClassifier,RandomForestClassifier,VotingClassifier,BaggingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score

# Base learners for the ensemble. The tree-based ones are seeded so the
# reported metrics are repeatable. The bagging and random-forest learners get
# their own names (bgc1, rfc1) so they do not overwrite the fitted `bgc` and
# `rfc` models from the earlier cells: VotingClassifier fits clones, so the
# objects created here stay unfitted.
# NOTE(review): warnings are silenced at the top of the notebook, so a
# LogisticRegression non-convergence warning would not be shown — verify convergence.
logr = LogisticRegression()
svc = SVC()
extc = ExtraTreesClassifier(random_state=42)
bgc1 = BaggingClassifier(random_state=42)
rfc1 = RandomForestClassifier(random_state=42)
dtc = DecisionTreeClassifier(random_state=42)

vtc = VotingClassifier([('logreg',logr),('svc',svc),('extc',extc),('bgc',bgc1),('rfc',rfc1),('dtc',dtc)])
vtc.fit(X_train, y_train)

# Accuracy on the rows the ensemble was trained on.
x_predicted = vtc.predict(X_train)  # ensemble predictions for the training rows
score = accuracy_score(y_train, x_predicted)
print(f"Training accuracy score {score}")

# Held-out evaluation: accuracy, confusion matrix, precision and recall.
y_pred = vtc.predict(X_test)
score = accuracy_score(y_test, y_pred)
print(f"Testing accuracy score {score}")
conf_mat = confusion_matrix(y_test, y_pred)
print('Confusion matrix \n',conf_mat)
print('Precision : ', precision_score(y_test,y_pred))
print('Recall : ',recall_score(y_test,y_pred))

Training accuracy score 0.990154109589041
Testing accuracy score 0.8480973368924204
Confusion matrix 
 [[11686   518]
 [ 1929  1976]]
Precision :  0.7923015236567763
Recall :  0.5060179257362356
