In [124]:
import pandas as pd
import pandas_profiling

from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
import xgboost as xgb
from sklearn.ensemble import VotingClassifier,StackingClassifier

from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [125]:
data = pd.read_csv('glass.csv')

In [126]:
data.shape

(214, 10)

In [127]:
data['Type'].value_counts()


2    76
1    70
7    29
3    17
5    13
6     9
Name: Type, dtype: int64

In [128]:
data.isna().sum()

RI      0
Na      0
Mg      0
Al      0
Si      0
K       0
Ca      0
Ba      0
Fe      0
Type    0
dtype: int64

In [129]:
#pandas_profiling.ProfileReport(data)

In [130]:
data.drop_duplicates(inplace = True)

In [131]:
data

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.52101,13.64,4.49,1.10,71.78,0.06,8.75,0.00,0.0,1
1,1.51761,13.89,3.60,1.36,72.73,0.48,7.83,0.00,0.0,1
2,1.51618,13.53,3.55,1.54,72.99,0.39,7.78,0.00,0.0,1
3,1.51766,13.21,3.69,1.29,72.61,0.57,8.22,0.00,0.0,1
4,1.51742,13.27,3.62,1.24,73.08,0.55,8.07,0.00,0.0,1
...,...,...,...,...,...,...,...,...,...,...
209,1.51623,14.14,0.00,2.88,72.61,0.08,9.18,1.06,0.0,7
210,1.51685,14.92,0.00,1.99,73.06,0.00,8.40,1.59,0.0,7
211,1.52065,14.36,0.00,2.02,73.42,0.00,8.44,1.64,0.0,7
212,1.51651,14.38,0.00,1.94,73.61,0.00,8.48,1.57,0.0,7


In [132]:
sample = RandomOverSampler()
X,y = sample.fit_resample(data.drop(columns=['Type']), data['Type'])

In [133]:
data_sample = pd.concat([X,y], axis = 1)

In [134]:
data_sample['Type'].value_counts()

1    76
2    76
3    76
5    76
6    76
7    76
Name: Type, dtype: int64

In [135]:
scale = StandardScaler()
X_scale = pd.DataFrame(scale.fit_transform(X), columns=X.columns)

In [136]:
data_sample_scale = pd.concat([X_scale,y], axis = 1)

In [137]:
data_sample_scale

Unnamed: 0,RI,Na,Mg,Al,Si,K,Ca,Ba,Fe,Type
0,1.005995,-0.037941,1.521530,-0.763869,-0.969247,-0.481384,-0.212195,-0.414563,-0.457655,1
1,-0.159261,0.212652,0.947370,-0.310299,0.036400,-0.011003,-0.841935,-0.414563,-0.457655,1
2,-0.649353,-0.148202,0.915114,0.003711,0.311630,-0.111799,-0.876160,-0.414563,-0.457655,1
3,-0.142124,-0.468960,1.005431,-0.432414,-0.090629,0.089793,-0.574980,-0.414563,-0.457655,1
4,-0.224378,-0.408818,0.960273,-0.519639,0.406902,0.067394,-0.677655,-0.414563,-0.457655,1
...,...,...,...,...,...,...,...,...,...,...
451,-0.303204,1.074692,-1.375076,0.806181,0.343387,-0.548582,-0.362785,2.393498,0.311712,7
452,-0.426584,0.884241,-1.375076,0.771291,0.629203,-0.548582,-0.369630,2.358176,0.215541,7
453,-0.652781,1.275166,-1.375076,1.277195,0.639788,-0.548582,-0.239575,0.768708,-0.457655,7
454,-0.258650,1.275166,-1.375076,0.457281,0.311630,-0.548582,-0.308025,2.322855,-0.457655,7


In [138]:
X = data_sample_scale.drop(columns=['Type'])
y = data_sample_scale['Type']

In [139]:
y.value_counts()

1    76
2    76
3    76
5    76
6    76
7    76
Name: Type, dtype: int64

In [140]:
y = y.replace({1:0,2:1,3:2,5:3,6:4,7:5},)

In [141]:
train_X, test_X, train_y, test_y = train_test_split(X,y, test_size=0.2,random_state=21)

In [142]:
model_ = {'Logistic': LogisticRegression(),
         'KNN': KNeighborsClassifier(),
         'Naive Bayes': GaussianNB(),
         'Decision Tree': DecisionTreeClassifier(),
         'RandomForest': RandomForestClassifier(),
         'AdaBoost': AdaBoostClassifier(),
         'GBM': GradientBoostingClassifier(),
         'Xgboost': xgb.XGBClassifier(),
         'Voting_hard': VotingClassifier(estimators=[('lr', LogisticRegression()), 
                                                ('rf', RandomForestClassifier()), 
                                                ('gmb', GradientBoostingClassifier())], 
                                    voting='hard'),
         'Voting_soft': VotingClassifier(estimators=[('lr', LogisticRegression()), 
                                                ('rf', RandomForestClassifier()), 
                                                ('gmb', GradientBoostingClassifier())], 
                                    voting='soft'),
         'Voting_soft_best_model': VotingClassifier(estimators=[('xgb', xgb.XGBClassifier()), 
                                                ('rf', RandomForestClassifier()), 
                                                ('gmb', GradientBoostingClassifier())], 
                                    voting='soft'),
         'Stacking': StackingClassifier(estimators=[('lr', LogisticRegression()), 
                                                ('rf', RandomForestClassifier()), 
                                                ('gmb', GradientBoostingClassifier())], 
                                     final_estimator=LogisticRegression()),
         'Stacking_best_model': StackingClassifier(estimators=[('xgb', xgb.XGBClassifier()),  
                                                ('rf', RandomForestClassifier()), 
                                                ('gmb', GradientBoostingClassifier())], 
                                     final_estimator=DecisionTreeClassifier()),
         'Stacking_best_model_best_final_estimator': StackingClassifier(estimators=[('xgb', xgb.XGBClassifier()),  
                                                ('rf', RandomForestClassifier()), 
                                                ('gmb', GradientBoostingClassifier())], 
                                     final_estimator=RandomForestClassifier()),   
          
          
        }

In [143]:
res = []
for name, model in model_.items():
    print("Model : ",name)
    model.fit(train_X,train_y)
    train_pred = model.predict(train_X)
    test_pred = model.predict(test_X)
    print("Train Accuracy : ",accuracy_score(train_y, train_pred))
    print("Test Accuracy : ",accuracy_score(test_y, test_pred))
    print("Train Confusion Matrix : ",confusion_matrix(train_y, train_pred))
    print("Test Confusion Matrix : ",confusion_matrix(test_y, test_pred))
    print("Train Classification Report : ",classification_report(train_y, train_pred))
    print("Test Classification Report : ",classification_report(test_y, test_pred))
    res.append([name,accuracy_score(train_y, train_pred), accuracy_score(test_y, test_pred)])
    

    
    
    

Model :  Logistic
Train Accuracy :  0.8461538461538461
Test Accuracy :  0.8043478260869565
Train Confusion Matrix :  [[36 12  8  0  0  0]
 [11 29 15  2  2  1]
 [ 0  1 63  0  0  0]
 [ 0  0  0 60  0  0]
 [ 0  0  0  0 61  0]
 [ 2  2  0  0  0 59]]
Test Confusion Matrix :  [[13  5  2  0  0  0]
 [ 3  7  4  2  0  0]
 [ 0  1 11  0  0  0]
 [ 0  0  0 16  0  0]
 [ 0  0  0  0 15  0]
 [ 0  0  0  1  0 12]]
Train Classification Report :                precision    recall  f1-score   support

           0       0.73      0.64      0.69        56
           1       0.66      0.48      0.56        60
           2       0.73      0.98      0.84        64
           3       0.97      1.00      0.98        60
           4       0.97      1.00      0.98        61
           5       0.98      0.94      0.96        63

    accuracy                           0.85       364
   macro avg       0.84      0.84      0.84       364
weighted avg       0.84      0.85      0.84       364

Test Classification Report :  

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train Accuracy :  1.0
Test Accuracy :  0.9021739130434783
Train Confusion Matrix :  [[56  0  0  0  0  0]
 [ 0 60  0  0  0  0]
 [ 0  0 64  0  0  0]
 [ 0  0  0 60  0  0]
 [ 0  0  0  0 61  0]
 [ 0  0  0  0  0 63]]
Test Confusion Matrix :  [[17  1  2  0  0  0]
 [ 4 11  1  0  0  0]
 [ 0  0 12  0  0  0]
 [ 0  0  0 16  0  0]
 [ 0  0  0  0 15  0]
 [ 0  1  0  0  0 12]]
Train Classification Report :                precision    recall  f1-score   support

           0       1.00      1.00      1.00        56
           1       1.00      1.00      1.00        60
           2       1.00      1.00      1.00        64
           3       1.00      1.00      1.00        60
           4       1.00      1.00      1.00        61
           5       1.00      1.00      1.00        63

    accuracy                           1.00       364
   macro avg       1.00      1.00      1.00       364
weighted avg       1.00      1.00      1.00       364

Test Classification Report :                precision    recall

Train Accuracy :  0.9862637362637363
Test Accuracy :  0.9239130434782609
Train Confusion Matrix :  [[56  0  0  0  0  0]
 [ 4 56  0  0  0  0]
 [ 0  1 63  0  0  0]
 [ 0  0  0 60  0  0]
 [ 0  0  0  0 61  0]
 [ 0  0  0  0  0 63]]
Test Confusion Matrix :  [[18  1  1  0  0  0]
 [ 3 13  0  0  0  0]
 [ 0  1 11  0  0  0]
 [ 0  0  0 16  0  0]
 [ 0  0  0  0 15  0]
 [ 0  1  0  0  0 12]]
Train Classification Report :                precision    recall  f1-score   support

           0       0.93      1.00      0.97        56
           1       0.98      0.93      0.96        60
           2       1.00      0.98      0.99        64
           3       1.00      1.00      1.00        60
           4       1.00      1.00      1.00        61
           5       1.00      1.00      1.00        63

    accuracy                           0.99       364
   macro avg       0.99      0.99      0.99       364
weighted avg       0.99      0.99      0.99       364

Test Classification Report :                prec

In [144]:
res_ = pd.DataFrame(res, columns=['Model','Train Accuracy','Test Accuracy'])

In [145]:
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X)

In [146]:
train_X, test_X, train_y, test_y = train_test_split(X_poly,y, test_size=0.2,random_state=21)

# With Polynomial Features

In [151]:
res_poly = []
for name, model in model_.items():
    print("Model : ",name)
    model.fit(train_X,train_y)
    train_pred = model.predict(train_X)
    test_pred = model.predict(test_X)
    print("Train Accuracy : ",accuracy_score(train_y, train_pred))
    print("Test Accuracy : ",accuracy_score(test_y, test_pred))
    print("Train Confusion Matrix : ",confusion_matrix(train_y, train_pred))
    print("Test Confusion Matrix : ",confusion_matrix(test_y, test_pred))
    print("Train Classification Report : ",classification_report(train_y, train_pred))
    print("Test Classification Report : ",classification_report(test_y, test_pred))
    res_poly.append([name+'_polynomial_features',accuracy_score(train_y, train_pred), accuracy_score(test_y, test_pred)])
    

    
    
    

Model :  Logistic
Train Accuracy :  0.9285714285714286
Test Accuracy :  0.8586956521739131
Train Confusion Matrix :  [[44  8  4  0  0  0]
 [ 9 46  4  1  0  0]
 [ 0  0 64  0  0  0]
 [ 0  0  0 60  0  0]
 [ 0  0  0  0 61  0]
 [ 0  0  0  0  0 63]]
Test Confusion Matrix :  [[17  0  3  0  0  0]
 [ 7  7  1  1  0  0]
 [ 0  0 12  0  0  0]
 [ 0  0  0 16  0  0]
 [ 0  0  0  0 15  0]
 [ 0  0  0  1  0 12]]
Train Classification Report :                precision    recall  f1-score   support

           0       0.83      0.79      0.81        56
           1       0.85      0.77      0.81        60
           2       0.89      1.00      0.94        64
           3       0.98      1.00      0.99        60
           4       1.00      1.00      1.00        61
           5       1.00      1.00      1.00        63

    accuracy                           0.93       364
   macro avg       0.93      0.93      0.92       364
weighted avg       0.93      0.93      0.93       364

Test Classification Report :  

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train Accuracy :  0.4835164835164835
Test Accuracy :  0.45652173913043476
Train Confusion Matrix :  [[ 3 51  2  0  0  0]
 [ 0 49  1  0  9  1]
 [ 1 57  6  0  0  0]
 [ 0  0  0  0 54  6]
 [ 0  0  0  0 61  0]
 [ 0  2  0  0  4 57]]
Test Confusion Matrix :  [[ 2 18  0  0  0  0]
 [ 0 13  0  0  3  0]
 [ 1 11  0  0  0  0]
 [ 0  0  0  0 16  0]
 [ 0  0  0  0 15  0]
 [ 0  0  0  0  1 12]]
Train Classification Report :                precision    recall  f1-score   support

           0       0.75      0.05      0.10        56
           1       0.31      0.82      0.45        60
           2       0.67      0.09      0.16        64
           3       0.00      0.00      0.00        60
           4       0.48      1.00      0.65        61
           5       0.89      0.90      0.90        63

    accuracy                           0.48       364
   macro avg       0.52      0.48      0.38       364
weighted avg       0.52      0.48      0.38       364

Test Classification Report :                pre

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Train Accuracy :  1.0
Test Accuracy :  0.9021739130434783
Train Confusion Matrix :  [[56  0  0  0  0  0]
 [ 0 60  0  0  0  0]
 [ 0  0 64  0  0  0]
 [ 0  0  0 60  0  0]
 [ 0  0  0  0 61  0]
 [ 0  0  0  0  0 63]]
Test Confusion Matrix :  [[18  1  1  0  0  0]
 [ 4 10  1  1  0  0]
 [ 0  0 12  0  0  0]
 [ 0  0  0 16  0  0]
 [ 0  0  0  0 15  0]
 [ 0  0  1  0  0 12]]
Train Classification Report :                precision    recall  f1-score   support

           0       1.00      1.00      1.00        56
           1       1.00      1.00      1.00        60
           2       1.00      1.00      1.00        64
           3       1.00      1.00      1.00        60
           4       1.00      1.00      1.00        61
           5       1.00      1.00      1.00        63

    accuracy                           1.00       364
   macro avg       1.00      1.00      1.00       364
weighted avg       1.00      1.00      1.00       364

Test Classification Report :                precision    recall

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train Accuracy :  1.0
Test Accuracy :  0.9130434782608695
Train Confusion Matrix :  [[56  0  0  0  0  0]
 [ 0 60  0  0  0  0]
 [ 0  0 64  0  0  0]
 [ 0  0  0 60  0  0]
 [ 0  0  0  0 61  0]
 [ 0  0  0  0  0 63]]
Test Confusion Matrix :  [[19  0  1  0  0  0]
 [ 5 10  0  1  0  0]
 [ 0  0 12  0  0  0]
 [ 0  0  0 16  0  0]
 [ 0  0  0  0 15  0]
 [ 0  0  0  1  0 12]]
Train Classification Report :                precision    recall  f1-score   support

           0       1.00      1.00      1.00        56
           1       1.00      1.00      1.00        60
           2       1.00      1.00      1.00        64
           3       1.00      1.00      1.00        60
           4       1.00      1.00      1.00        61
           5       1.00      1.00      1.00        63

    accuracy                           1.00       364
   macro avg       1.00      1.00      1.00       364
weighted avg       1.00      1.00      1.00       364

Test Classification Report :                precision    recall

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Train Accuracy :  1.0
Test Accuracy :  0.9021739130434783
Train Confusion Matrix :  [[56  0  0  0  0  0]
 [ 0 60  0  0  0  0]
 [ 0  0 64  0  0  0]
 [ 0  0  0 60  0  0]
 [ 0  0  0  0 61  0]
 [ 0  0  0  0  0 63]]
Test Confusion Matrix :  [[19  0  1  0  0  0]
 [ 6  9  0  1  0  0]
 [ 0  0 12  0  0  0]
 [ 0  0  0 16  0  0]
 [ 0  0  0  0 15  0]
 [ 0  0  0  1  0 12]]
Train Classification Report :                precision    recall  f1-score   support

           0       1.00      1.00      1.00        56
           1       1.00      1.00      1.00        60
           2       1.00      1.00      1.00        64
           3       1.00      1.00      1.00        60
           4       1.00      1.00      1.00        61
           5       1.00      1.00      1.00        63

    accuracy                           1.00       364
   macro avg       1.00      1.00      1.00       364
weighted avg       1.00      1.00      1.00       364

Test Classification Report :                precision    recall

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Train Accuracy :  1.0
Test Accuracy :  0.9239130434782609
Train Confusion Matrix :  [[56  0  0  0  0  0]
 [ 0 60  0  0  0  0]
 [ 0  0 64  0  0  0]
 [ 0  0  0 60  0  0]
 [ 0  0  0  0 61  0]
 [ 0  0  0  0  0 63]]
Test Confusion Matrix :  [[18  1  1  0  0  0]
 [ 3 12  0  1  0  0]
 [ 0  0 12  0  0  0]
 [ 0  0  0 16  0  0]
 [ 0  0  0  0 15  0]
 [ 0  0  0  1  0 12]]
Train Classification Report :                precision    recall  f1-score   support

           0       1.00      1.00      1.00        56
           1       1.00      1.00      1.00        60
           2       1.00      1.00      1.00        64
           3       1.00      1.00      1.00        60
           4       1.00      1.00      1.00        61
           5       1.00      1.00      1.00        63

    accuracy                           1.00       364
   macro avg       1.00      1.00      1.00       364
weighted avg       1.00      1.00      1.00       364

Test Classification Report :                precision    recall

In [152]:
poly_res = pd.DataFrame(res, columns=['Model','Train Accuracy','Test Accuracy'])

In [154]:
pd.concat([res_,poly_res])

Unnamed: 0,Model,Train Accuracy,Test Accuracy
0,Logistic,0.846154,0.804348
1,KNN,0.895604,0.847826
2,Naive Bayes,0.673077,0.608696
3,Decision Tree,1.0,0.869565
4,RandomForest,1.0,0.891304
5,AdaBoost,0.601648,0.532609
6,GBM,1.0,0.902174
7,Xgboost,1.0,0.880435
8,Voting_hard,1.0,0.869565
9,Voting_soft,1.0,0.891304


In [None]:
BigMart Sales - DecisionTree, Random forest, Boosting, stacking voting
Loan Prediction - 
Flight Prediction - 
Cross Sell prediction
Black Friday Prediction
Titanic Survival prediction
Housing Prediction - 

