In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report

In [2]:
# Creating Random Forest classifier
rf_clf = RandomForestClassifier(n_estimators=10, random_state=42)

In [3]:
# Creating Bagging classifier using Decision Tree base estimator
bagging_dt_clf = BaggingClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=10, random_state=42)

In [4]:
# Creating Bagging classifier using Naive Bayes base estimator
bagging_nb_clf = BaggingClassifier(base_estimator=GaussianNB(), n_estimators=10, random_state=42)

In [5]:
# Creating AdaBoost classifier using Decision Tree base estimator
adaboost_dt_clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), n_estimators=10, random_state=42)

In [6]:
# Creating AdaBoost classifier using Naive Bayes base estimator
adaboost_nb_clf = AdaBoostClassifier(base_estimator=GaussianNB(), n_estimators=10, random_state=42)

# Applying Models on Iris Dataset

In [7]:
from sklearn.datasets import load_iris
iris = load_iris()
iris_data = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_data['target'] = iris.target
iris_data.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [8]:
X = iris_data.drop(columns = ['target'])
y = iris_data['target']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(120, 4)
(30, 4)
(120,)
(30,)


In [9]:
rf_clf.fit(X_train,y_train)

In [10]:
y_predicted = rf_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [11]:
bagging_dt_clf.fit(X_train,y_train)



In [12]:
y_predicted = bagging_dt_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [13]:
bagging_nb_clf.fit(X_train,y_train)



In [14]:
y_predicted = bagging_nb_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [16]:
adaboost_dt_clf.fit(X_train,y_train)



In [17]:
y_predicted = adaboost_dt_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [18]:
adaboost_nb_clf.fit(X_train,y_train)



In [19]:
y_predicted = adaboost_nb_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       0.00      0.00      0.00         9
           2       0.55      1.00      0.71        11

    accuracy                           0.70        30
   macro avg       0.52      0.67      0.57        30
weighted avg       0.54      0.70      0.59        30



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Applying Models on Wine Recognition dataset

In [20]:
from sklearn.datasets import load_wine
wine = load_wine()

wine_df = pd.DataFrame(data=wine.data, columns=wine.feature_names)
wine_df['target'] = wine.target

wine_df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline,target
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0,0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0,0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0,0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0,0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0,0


In [21]:
X = wine_df.drop(columns = ['target'])
y = wine_df['target']
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(142, 13)
(36, 13)
(142,)
(36,)


In [22]:
rf_clf.fit(X_train,y_train)

In [23]:
y_predicted = rf_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       0.93      1.00      0.97        14
           1       0.93      0.93      0.93        14
           2       1.00      0.88      0.93         8

    accuracy                           0.94        36
   macro avg       0.95      0.93      0.94        36
weighted avg       0.95      0.94      0.94        36



In [24]:
bagging_dt_clf.fit(X_train,y_train)



In [25]:
y_predicted = bagging_dt_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.93      1.00      0.97        14
           2       1.00      0.88      0.93         8

    accuracy                           0.97        36
   macro avg       0.98      0.96      0.97        36
weighted avg       0.97      0.97      0.97        36



In [26]:
bagging_nb_clf.fit(X_train,y_train)



In [27]:
y_predicted = bagging_nb_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00         8

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36



In [28]:
adaboost_dt_clf.fit(X_train,y_train)



In [29]:
y_predicted = adaboost_dt_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       1.00      0.93      0.96        14
           1       0.88      1.00      0.93        14
           2       1.00      0.88      0.93         8

    accuracy                           0.94        36
   macro avg       0.96      0.93      0.94        36
weighted avg       0.95      0.94      0.94        36



In [30]:
adaboost_nb_clf.fit(X_train,y_train)



In [31]:
y_predicted = adaboost_nb_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       0.93      0.93      0.93        14
           1       0.92      0.86      0.89        14
           2       0.89      1.00      0.94         8

    accuracy                           0.92        36
   macro avg       0.91      0.93      0.92        36
weighted avg       0.92      0.92      0.92        36



# Applying Models on Breast Cancer Wisconsin (diagnostic) dataset

In [32]:
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()

cancer_df = pd.DataFrame(data=cancer.data, columns=cancer.feature_names)
cancer_df['target'] = cancer.target
cancer_df.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [33]:
X = cancer_df.drop(columns = ['target'])
y = cancer_df['target']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2, random_state=42)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(455, 30)
(114, 30)
(455,)
(114,)


In [34]:
rf_clf.fit(X_train,y_train)

In [35]:
y_predicted = rf_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       0.95      0.93      0.94        43
           1       0.96      0.97      0.97        71

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



In [36]:
bagging_dt_clf.fit(X_train,y_train)



In [37]:
y_predicted = bagging_dt_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       0.95      0.93      0.94        43
           1       0.96      0.97      0.97        71

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



In [38]:
bagging_nb_clf.fit(X_train,y_train)



In [39]:
y_predicted = bagging_nb_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       1.00      0.91      0.95        43
           1       0.95      1.00      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.95      0.96       114
weighted avg       0.97      0.96      0.96       114



In [40]:
adaboost_dt_clf.fit(X_train,y_train)



In [41]:
y_predicted = adaboost_dt_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       0.93      0.91      0.92        43
           1       0.94      0.96      0.95        71

    accuracy                           0.94       114
   macro avg       0.94      0.93      0.93       114
weighted avg       0.94      0.94      0.94       114



In [42]:
adaboost_nb_clf.fit(X_train,y_train)



In [43]:
y_predicted = adaboost_nb_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       0.63      0.98      0.76        43
           1       0.98      0.65      0.78        71

    accuracy                           0.77       114
   macro avg       0.80      0.81      0.77       114
weighted avg       0.85      0.77      0.77       114



# Applying Model on Titanic dataset

In [44]:
titanic_df = pd.read_csv('Data/titanic.csv')
titanic_df.head()

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.05,,S,0


In [45]:
titanic_df['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [46]:
titanic_df.drop(columns = ['PassengerId','Name','Ticket','Cabin'],inplace = True)
titanic_df['Sex'] = titanic_df['Sex'].map({'male':1,'female':0})
titanic_df['Embarked'] = titanic_df['Embarked'].map({'S':0,'C':1,'Q':2})
titanic_df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,3,1,22.0,1,0,7.2500,0.0,0
1,1,0,38.0,1,0,71.2833,1.0,1
2,3,0,26.0,0,0,7.9250,0.0,1
3,1,0,35.0,1,0,53.1000,0.0,1
4,3,1,35.0,0,0,8.0500,0.0,0
...,...,...,...,...,...,...,...,...
886,2,1,27.0,0,0,13.0000,0.0,0
887,1,0,19.0,0,0,30.0000,0.0,1
888,3,0,,1,2,23.4500,0.0,0
889,1,1,26.0,0,0,30.0000,1.0,1


In [47]:
titanic_df.isna().sum()

Pclass        0
Sex           0
Age         177
SibSp         0
Parch         0
Fare          0
Embarked      2
Survived      0
dtype: int64

In [48]:
titanic_df.dropna(how = 'any',inplace = True)
titanic_df.rename(columns = {'Survived':'target'},inplace = True)
titanic_df

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,target
0,3,1,22.0,1,0,7.2500,0.0,0
1,1,0,38.0,1,0,71.2833,1.0,1
2,3,0,26.0,0,0,7.9250,0.0,1
3,1,0,35.0,1,0,53.1000,0.0,1
4,3,1,35.0,0,0,8.0500,0.0,0
...,...,...,...,...,...,...,...,...
885,3,0,39.0,0,5,29.1250,2.0,0
886,2,1,27.0,0,0,13.0000,0.0,0
887,1,0,19.0,0,0,30.0000,0.0,1
889,1,1,26.0,0,0,30.0000,1.0,1


In [49]:
X = cancer_df.drop(columns = ['target'])
y = cancer_df['target']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(455, 30)
(114, 30)
(455,)
(114,)


In [50]:
rf_clf.fit(X_train,y_train)

In [51]:
y_predicted = rf_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       0.95      0.93      0.94        43
           1       0.96      0.97      0.97        71

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



In [52]:
bagging_dt_clf.fit(X_train,y_train)



In [55]:
y_predicted = bagging_dt_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       0.95      0.93      0.94        43
           1       0.96      0.97      0.97        71

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



In [56]:
bagging_nb_clf.fit(X_train,y_train)



In [57]:
y_predicted = bagging_nb_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       1.00      0.91      0.95        43
           1       0.95      1.00      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.95      0.96       114
weighted avg       0.97      0.96      0.96       114



In [58]:
adaboost_dt_clf.fit(X_train,y_train)



In [59]:
y_predicted = adaboost_dt_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       0.93      0.91      0.92        43
           1       0.94      0.96      0.95        71

    accuracy                           0.94       114
   macro avg       0.94      0.93      0.93       114
weighted avg       0.94      0.94      0.94       114



In [60]:
adaboost_nb_clf.fit(X_train,y_train)



In [61]:
y_predicted = adaboost_nb_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       0.63      0.98      0.76        43
           1       0.98      0.65      0.78        71

    accuracy                           0.77       114
   macro avg       0.80      0.81      0.77       114
weighted avg       0.85      0.77      0.77       114



# Applying Models on Salary Prediction dataset

In [62]:
salary_df = pd.read_csv('Data/salary.csv')
salary_df.dropna(how = 'any',inplace = True)
salary_df.head()

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Country,Race
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0,UK,White
1,28.0,Female,Master's,Data Analyst,3.0,65000.0,USA,Hispanic
2,45.0,Male,PhD,Senior Manager,15.0,150000.0,Canada,White
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0,USA,Hispanic
4,52.0,Male,Master's,Director,20.0,200000.0,USA,Asian


In [63]:
salary_df['Gender'].value_counts()

Male      3671
Female    3013
Other       14
Name: Gender, dtype: int64

In [64]:
gender = pd.get_dummies(salary_df['Gender'])
salary_df = pd.concat([salary_df,gender], axis = 'columns')
salary_df

Unnamed: 0,Age,Gender,Education Level,Job Title,Years of Experience,Salary,Country,Race,Female,Male,Other
0,32.0,Male,Bachelor's,Software Engineer,5.0,90000.0,UK,White,0,1,0
1,28.0,Female,Master's,Data Analyst,3.0,65000.0,USA,Hispanic,1,0,0
2,45.0,Male,PhD,Senior Manager,15.0,150000.0,Canada,White,0,1,0
3,36.0,Female,Bachelor's,Sales Associate,7.0,60000.0,USA,Hispanic,1,0,0
4,52.0,Male,Master's,Director,20.0,200000.0,USA,Asian,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...
6699,49.0,Female,PhD,Director of Marketing,20.0,200000.0,UK,Mixed,1,0,0
6700,32.0,Male,High School,Sales Associate,3.0,50000.0,Australia,Australian,0,1,0
6701,30.0,Female,Bachelor's Degree,Financial Manager,4.0,55000.0,China,Chinese,1,0,0
6702,46.0,Male,Master's Degree,Marketing Manager,14.0,140000.0,China,Korean,0,1,0


In [65]:
salary_df.drop(columns = ['Gender','Other'],inplace = True)
salary_df

Unnamed: 0,Age,Education Level,Job Title,Years of Experience,Salary,Country,Race,Female,Male
0,32.0,Bachelor's,Software Engineer,5.0,90000.0,UK,White,0,1
1,28.0,Master's,Data Analyst,3.0,65000.0,USA,Hispanic,1,0
2,45.0,PhD,Senior Manager,15.0,150000.0,Canada,White,0,1
3,36.0,Bachelor's,Sales Associate,7.0,60000.0,USA,Hispanic,1,0
4,52.0,Master's,Director,20.0,200000.0,USA,Asian,0,1
...,...,...,...,...,...,...,...,...,...
6699,49.0,PhD,Director of Marketing,20.0,200000.0,UK,Mixed,1,0
6700,32.0,High School,Sales Associate,3.0,50000.0,Australia,Australian,0,1
6701,30.0,Bachelor's Degree,Financial Manager,4.0,55000.0,China,Chinese,1,0
6702,46.0,Master's Degree,Marketing Manager,14.0,140000.0,China,Korean,0,1


In [66]:
salary_df['Education Level'].value_counts()

Bachelor's Degree    2265
Master's Degree      1572
PhD                  1368
Bachelor's            756
High School           448
Master's              288
phD                     1
Name: Education Level, dtype: int64

In [67]:
salary_df['Education Level'].replace({'phD':'PhD',"Bachelor's":"Bachelor's Degree","Master's":"Master's Degree"},inplace = True)
salary_df['Education Level'].value_counts()

Bachelor's Degree    3021
Master's Degree      1860
PhD                  1369
High School           448
Name: Education Level, dtype: int64

In [68]:
salary_df['Education Level'] = salary_df['Education Level'].map({'High School':1,"Bachelor's Degree":2,"Master's Degree":3,"PhD":4})
salary_df

Unnamed: 0,Age,Education Level,Job Title,Years of Experience,Salary,Country,Race,Female,Male
0,32.0,2,Software Engineer,5.0,90000.0,UK,White,0,1
1,28.0,3,Data Analyst,3.0,65000.0,USA,Hispanic,1,0
2,45.0,4,Senior Manager,15.0,150000.0,Canada,White,0,1
3,36.0,2,Sales Associate,7.0,60000.0,USA,Hispanic,1,0
4,52.0,3,Director,20.0,200000.0,USA,Asian,0,1
...,...,...,...,...,...,...,...,...,...
6699,49.0,4,Director of Marketing,20.0,200000.0,UK,Mixed,1,0
6700,32.0,1,Sales Associate,3.0,50000.0,Australia,Australian,0,1
6701,30.0,2,Financial Manager,4.0,55000.0,China,Chinese,1,0
6702,46.0,3,Marketing Manager,14.0,140000.0,China,Korean,0,1


In [69]:
salary_df['Job Title'].value_counts()

Software Engineer             518
Data Scientist                453
Software Engineer Manager     376
Data Analyst                  363
Senior Project Engineer       318
                             ... 
Account Manager                 1
Help Desk Analyst               1
Senior Training Specialist      1
Junior Web Designer             1
Software Project Manager        1
Name: Job Title, Length: 191, dtype: int64

In [70]:
job_titles = pd.get_dummies(salary_df['Job Title'])
job_titles.drop(columns = ['Account Manager'],inplace = True)

In [71]:
salary_df['Country'].value_counts()

USA          1359
China        1343
Australia    1336
UK           1335
Canada       1325
Name: Country, dtype: int64

In [72]:
country = pd.get_dummies(salary_df['Country'])
country.drop(columns = ['Canada'],inplace = True)
country

Unnamed: 0,Australia,China,UK,USA
0,0,0,1,0
1,0,0,0,1
2,0,0,0,0
3,0,0,0,1
4,0,0,0,1
...,...,...,...,...
6699,0,0,1,0
6700,1,0,0,0
6701,0,1,0,0
6702,0,1,0,0


In [73]:
salary_df['Race'].value_counts()

White               1962
Asian               1603
Korean               457
Australian           452
Chinese              444
Black                437
African American     354
Mixed                334
Welsh                333
Hispanic             322
Name: Race, dtype: int64

In [74]:
race = pd.get_dummies(salary_df['Race'])
race.drop(columns = ['White'],inplace = True)
race

Unnamed: 0,African American,Asian,Australian,Black,Chinese,Hispanic,Korean,Mixed,Welsh
0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0
4,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...
6699,0,0,0,0,0,0,0,1,0
6700,0,0,1,0,0,0,0,0,0
6701,0,0,0,0,1,0,0,0,0
6702,0,0,0,0,0,0,1,0,0


In [75]:
salary_df.drop(columns = ['Job Title','Country','Race'],inplace = True)
salary_df = pd.concat([salary_df,job_titles,country,race],axis = 'columns')
salary_df

Unnamed: 0,Age,Education Level,Years of Experience,Salary,Female,Male,Accountant,Administrative Assistant,Back end Developer,Business Analyst,...,USA,African American,Asian,Australian,Black,Chinese,Hispanic,Korean,Mixed,Welsh
0,32.0,2,5.0,90000.0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,28.0,3,3.0,65000.0,1,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
2,45.0,4,15.0,150000.0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,36.0,2,7.0,60000.0,1,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
4,52.0,3,20.0,200000.0,0,1,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6699,49.0,4,20.0,200000.0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
6700,32.0,1,3.0,50000.0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
6701,30.0,2,4.0,55000.0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
6702,46.0,3,14.0,140000.0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [76]:
X = salary_df.drop(columns = 'Salary')
y = salary_df['Salary']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(5358, 208)
(1340, 208)
(5358,)
(1340,)


In [77]:
rf_clf.fit(X_train,y_train)

In [78]:
y_predicted = rf_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

       550.0       0.00      0.00      0.00         1
     25000.0       0.73      0.88      0.80        25
     26000.0       0.25      0.25      0.25         4
     28000.0       1.00      1.00      1.00         3
     30000.0       0.60      0.55      0.57        11
     31000.0       0.00      0.00      0.00         3
     32000.0       0.33      0.50      0.40         2
     33000.0       0.67      1.00      0.80         2
     35000.0       0.78      0.81      0.79        26
     36000.0       1.00      0.40      0.57         5
     37000.0       0.33      1.00      0.50         1
     38000.0       0.00      0.00      0.00         1
     40000.0       0.65      0.69      0.67        29
     41000.0       0.00      0.00      0.00         1
     42000.0       0.50      1.00      0.67         1
     43000.0       0.00      0.00      0.00         2
     45000.0       0.43      0.30      0.35        20
     47000.0       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [79]:
bagging_dt_clf.fit(X_train,y_train)



In [80]:
y_predicted = bagging_dt_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

       350.0       0.00      0.00      0.00         0
       550.0       0.00      0.00      0.00         1
     25000.0       0.77      0.92      0.84        25
     26000.0       0.33      0.25      0.29         4
     28000.0       1.00      1.00      1.00         3
     30000.0       0.71      0.45      0.56        11
     31000.0       0.67      0.67      0.67         3
     32000.0       1.00      1.00      1.00         2
     33000.0       1.00      1.00      1.00         2
     35000.0       0.79      0.88      0.84        26
     36000.0       1.00      0.60      0.75         5
     37000.0       0.33      1.00      0.50         1
     38000.0       1.00      1.00      1.00         1
     40000.0       0.74      0.90      0.81        29
     41000.0       0.00      0.00      0.00         1
     42000.0       0.50      1.00      0.67         1
     43000.0       0.00      0.00      0.00         2
     45000.0       0.92    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [83]:
bagging_nb_clf.fit(X_train,y_train)



In [84]:
y_predicted = bagging_nb_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])
  jointi = np.log(self.class_prior_[i])


              precision    recall  f1-score   support

       550.0       0.00      0.00      0.00         1
     25000.0       0.61      0.76      0.68        25
     26000.0       0.23      0.75      0.35         4
     28000.0       1.00      0.33      0.50         3
     30000.0       0.16      0.27      0.20        11
     31000.0       0.50      0.67      0.57         3
     32000.0       0.00      0.00      0.00         2
     33000.0       1.00      0.50      0.67         2
     35000.0       0.53      0.65      0.59        26
     36000.0       1.00      0.80      0.89         5
     37000.0       1.00      1.00      1.00         1
     38000.0       0.00      0.00      0.00         1
     40000.0       0.40      0.14      0.21        29
     41000.0       0.00      0.00      0.00         1
     42000.0       0.00      0.00      0.00         1
     43000.0       0.00      0.00      0.00         2
     45000.0       0.20      0.15      0.17        20
     48000.0       0.27    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [85]:
adaboost_dt_clf.fit(X_train,y_train)



In [86]:
y_predicted = adaboost_dt_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

       550.0       0.00      0.00      0.00         1
     25000.0       0.77      0.96      0.86        25
     26000.0       0.50      0.25      0.33         4
     28000.0       1.00      1.00      1.00         3
     30000.0       0.50      0.45      0.48        11
     31000.0       0.50      0.33      0.40         3
     32000.0       1.00      0.50      0.67         2
     33000.0       1.00      1.00      1.00         2
     35000.0       0.78      0.96      0.86        26
     36000.0       0.75      0.60      0.67         5
     37000.0       0.00      0.00      0.00         1
     38000.0       1.00      1.00      1.00         1
     40000.0       0.88      0.76      0.81        29
     41000.0       0.00      0.00      0.00         1
     42000.0       0.50      1.00      0.67         1
     43000.0       1.00      1.00      1.00         2
     45000.0       0.81      0.65      0.72        20
     48000.0       1.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [87]:
adaboost_nb_clf.fit(X_train,y_train)



In [88]:
y_predicted = adaboost_nb_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

       550.0       0.00      0.00      0.00         1
     25000.0       0.90      0.36      0.51        25
     26000.0       0.00      0.00      0.00         4
     28000.0       0.00      0.00      0.00         3
     30000.0       0.31      0.73      0.43        11
     31000.0       0.00      0.00      0.00         3
     32000.0       1.00      0.50      0.67         2
     33000.0       0.00      0.00      0.00         2
     35000.0       0.61      0.42      0.50        26
     36000.0       0.00      0.00      0.00         5
     37000.0       0.00      0.00      0.00         1
     38000.0       0.00      0.00      0.00         1
     40000.0       0.36      0.31      0.33        29
     41000.0       0.00      0.00      0.00         1
     42000.0       0.00      0.00      0.00         1
     43000.0       0.00      0.00      0.00         2
     45000.0       0.00      0.00      0.00        20
     48000.0       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Applying Model on Sleep Health and Lifestyle Dataset

In [99]:
sleep_df = pd.read_csv('Data/Sleep_health_and_lifestyle_dataset.csv');
sleep_df.drop(columns = ['Person ID'],inplace = True)
sleep_df

Unnamed: 0,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,
1,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,
3,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,Sleep Apnea
...,...,...,...,...,...,...,...,...,...,...,...,...
369,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
370,Female,59,Nurse,8.0,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
371,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea
372,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,Sleep Apnea


In [100]:
Gender = pd.get_dummies(sleep_df['Gender'])
Gender.drop(columns = ['Male'],inplace = True)
Gender

Unnamed: 0,Female
0,0
1,0
2,0
3,0
4,0
...,...
369,1
370,1
371,1
372,1


In [101]:
Occupation = pd.get_dummies(sleep_df['Occupation'])
Occupation.drop(columns = ['Accountant'],inplace = True)
Occupation

Unnamed: 0,Doctor,Engineer,Lawyer,Manager,Nurse,Sales Representative,Salesperson,Scientist,Software Engineer,Teacher
0,0,0,0,0,0,0,0,0,1,0
1,1,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
369,0,0,0,0,1,0,0,0,0,0
370,0,0,0,0,1,0,0,0,0,0
371,0,0,0,0,1,0,0,0,0,0
372,0,0,0,0,1,0,0,0,0,0


In [102]:
BMI = pd.get_dummies(sleep_df['BMI Category'])
BMI.drop(columns = ['Normal'],inplace = True)
BMI

Unnamed: 0,Normal Weight,Obese,Overweight
0,0,0,1
1,0,0,0
2,0,0,0
3,0,1,0
4,0,1,0
...,...,...,...
369,0,0,1
370,0,0,1
371,0,0,1
372,0,0,1


In [103]:
sleep_df['Sleep Disorder'].value_counts()

None           219
Sleep Apnea     78
Insomnia        77
Name: Sleep Disorder, dtype: int64

In [104]:
sleep_df['Sleep Disorder'] = sleep_df['Sleep Disorder'].map({'None':0,'Sleep Apnea':1,'Insomnia':2})
sleep_df

Unnamed: 0,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Blood Pressure,Heart Rate,Daily Steps,Sleep Disorder
0,Male,27,Software Engineer,6.1,6,42,6,Overweight,126/83,77,4200,0
1,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,0
2,Male,28,Doctor,6.2,6,60,8,Normal,125/80,75,10000,0
3,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,1
4,Male,28,Sales Representative,5.9,4,30,8,Obese,140/90,85,3000,1
...,...,...,...,...,...,...,...,...,...,...,...,...
369,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,1
370,Female,59,Nurse,8.0,9,75,3,Overweight,140/95,68,7000,1
371,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,1
372,Female,59,Nurse,8.1,9,75,3,Overweight,140/95,68,7000,1


In [105]:
sleep_df.drop(columns = ['Gender','Occupation','BMI Category','Blood Pressure'],inplace = True)
sleep_df = pd.concat([Gender,sleep_df,Occupation,BMI],axis = 'columns')
sleep_df

Unnamed: 0,Female,Age,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,Heart Rate,Daily Steps,Sleep Disorder,Doctor,...,Manager,Nurse,Sales Representative,Salesperson,Scientist,Software Engineer,Teacher,Normal Weight,Obese,Overweight
0,0,27,6.1,6,42,6,77,4200,0,0,...,0,0,0,0,0,1,0,0,0,1
1,0,28,6.2,6,60,8,75,10000,0,1,...,0,0,0,0,0,0,0,0,0,0
2,0,28,6.2,6,60,8,75,10000,0,1,...,0,0,0,0,0,0,0,0,0,0
3,0,28,5.9,4,30,8,85,3000,1,0,...,0,0,1,0,0,0,0,0,1,0
4,0,28,5.9,4,30,8,85,3000,1,0,...,0,0,1,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
369,1,59,8.1,9,75,3,68,7000,1,0,...,0,1,0,0,0,0,0,0,0,1
370,1,59,8.0,9,75,3,68,7000,1,0,...,0,1,0,0,0,0,0,0,0,1
371,1,59,8.1,9,75,3,68,7000,1,0,...,0,1,0,0,0,0,0,0,0,1
372,1,59,8.1,9,75,3,68,7000,1,0,...,0,1,0,0,0,0,0,0,0,1


In [106]:
X = sleep_df.drop(columns = 'Sleep Disorder')
y = sleep_df['Sleep Disorder']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(299, 21)
(75, 21)
(299,)
(75,)


In [107]:
rf_clf.fit(X_train,y_train)

In [108]:
y_predicted = rf_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       0.95      0.95      0.95        43
           1       0.85      0.69      0.76        16
           2       0.68      0.81      0.74        16

    accuracy                           0.87        75
   macro avg       0.83      0.82      0.82        75
weighted avg       0.87      0.87      0.87        75



In [109]:
bagging_dt_clf.fit(X_train,y_train)



In [110]:
y_predicted = bagging_dt_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       0.95      0.98      0.97        43
           1       0.85      0.69      0.76        16
           2       0.72      0.81      0.76        16

    accuracy                           0.88        75
   macro avg       0.84      0.83      0.83        75
weighted avg       0.88      0.88      0.88        75



In [111]:
bagging_nb_clf.fit(X_train,y_train)



In [112]:
y_predicted = bagging_nb_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       0.93      0.93      0.93        43
           1       0.92      0.69      0.79        16
           2       0.70      0.88      0.78        16

    accuracy                           0.87        75
   macro avg       0.85      0.83      0.83        75
weighted avg       0.88      0.87      0.87        75



In [113]:
adaboost_dt_clf.fit(X_train,y_train)



In [114]:
y_predicted = adaboost_dt_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       0.95      0.95      0.95        43
           1       0.85      0.69      0.76        16
           2       0.68      0.81      0.74        16

    accuracy                           0.87        75
   macro avg       0.83      0.82      0.82        75
weighted avg       0.87      0.87      0.87        75



In [115]:
adaboost_nb_clf.fit(X_train,y_train)



In [116]:
y_predicted = adaboost_nb_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       0.85      0.77      0.80        43
           1       0.81      0.81      0.81        16
           2       0.45      0.56      0.50        16

    accuracy                           0.73        75
   macro avg       0.70      0.71      0.71        75
weighted avg       0.75      0.73      0.74        75



# Applying Models on Mobile Price Prediction dataset

In [118]:
mobile_df = pd.read_csv('Data/mobile_price_prediction.csv')
mobile_df

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,...,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,...,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,...,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,...,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,...,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,...,1208,1212,1411,8,2,15,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,794,1,0.5,1,0,1,2,0.8,106,6,...,1222,1890,668,13,4,19,1,1,0,0
1996,1965,1,2.6,1,0,0,39,0.2,187,4,...,915,1965,2032,11,10,16,1,1,1,2
1997,1911,0,0.9,1,1,1,36,0.7,108,8,...,868,1632,3057,9,1,5,1,1,0,3
1998,1512,0,0.9,0,4,1,46,0.1,145,5,...,336,670,869,18,10,19,1,1,1,0


In [119]:
mobile_df.isna().sum()

battery_power    0
blue             0
clock_speed      0
dual_sim         0
fc               0
four_g           0
int_memory       0
m_dep            0
mobile_wt        0
n_cores          0
pc               0
px_height        0
px_width         0
ram              0
sc_h             0
sc_w             0
talk_time        0
three_g          0
touch_screen     0
wifi             0
price_range      0
dtype: int64

In [120]:
X = mobile_df.drop(columns = 'price_range')
y = mobile_df['price_range']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1600, 20)
(400, 20)
(1600,)
(400,)


In [121]:
rf_clf.fit(X_train,y_train)

In [122]:
y_predicted = rf_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       0.86      0.89      0.87       105
           1       0.68      0.69      0.68        91
           2       0.71      0.76      0.74        92
           3       0.93      0.84      0.88       112

    accuracy                           0.80       400
   macro avg       0.80      0.79      0.79       400
weighted avg       0.81      0.80      0.80       400



In [123]:
bagging_dt_clf.fit(X_train,y_train)



In [124]:
y_predicted = bagging_dt_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       0.91      0.90      0.91       105
           1       0.80      0.86      0.83        91
           2       0.80      0.82      0.81        92
           3       0.93      0.87      0.90       112

    accuracy                           0.86       400
   macro avg       0.86      0.86      0.86       400
weighted avg       0.87      0.86      0.86       400



In [125]:
bagging_nb_clf.fit(X_train,y_train)



In [126]:
y_predicted = bagging_nb_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       0.90      0.92      0.91       105
           1       0.78      0.67      0.72        91
           2       0.65      0.77      0.70        92
           3       0.88      0.82      0.85       112

    accuracy                           0.80       400
   macro avg       0.80      0.80      0.80       400
weighted avg       0.81      0.80      0.80       400



In [127]:
adaboost_dt_clf.fit(X_train,y_train)



In [128]:
y_predicted = adaboost_dt_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       0.94      0.87      0.90       105
           1       0.72      0.85      0.78        91
           2       0.75      0.65      0.70        92
           3       0.86      0.89      0.88       112

    accuracy                           0.82       400
   macro avg       0.82      0.81      0.81       400
weighted avg       0.82      0.82      0.82       400



In [129]:
adaboost_nb_clf.fit(X_train,y_train)



In [130]:
y_predicted = adaboost_nb_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       0.91      0.29      0.43       105
           1       0.37      0.79      0.50        91
           2       0.35      0.36      0.35        92
           3       0.80      0.54      0.64       112

    accuracy                           0.49       400
   macro avg       0.61      0.49      0.48       400
weighted avg       0.63      0.49      0.49       400



# Applying Models on Loan Approval Prediction dataset

In [149]:
loan_df = pd.read_csv('Data/loan_approval_dataset.csv')
loan_df.drop(columns = ['loan_id'],inplace = True)
loan_df

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,Approved
1,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,Rejected
2,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,Rejected
3,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,Rejected
4,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,Rejected
...,...,...,...,...,...,...,...,...,...,...,...,...
4264,5,Graduate,Yes,1000000,2300000,12,317,2800000,500000,3300000,800000,Rejected
4265,0,Not Graduate,Yes,3300000,11300000,20,559,4200000,2900000,11000000,1900000,Approved
4266,2,Not Graduate,No,6500000,23900000,18,457,1200000,12400000,18100000,7300000,Rejected
4267,1,Not Graduate,No,4100000,12800000,8,780,8200000,700000,14100000,5800000,Approved


In [150]:
loan_df.columns

Index([' no_of_dependents', ' education', ' self_employed', ' income_annum',
       ' loan_amount', ' loan_term', ' cibil_score',
       ' residential_assets_value', ' commercial_assets_value',
       ' luxury_assets_value', ' bank_asset_value', ' loan_status'],
      dtype='object')

In [157]:
loan_df.columns = loan_df.columns.str.strip()

In [163]:
loan_df['loan_status'] = loan_df['loan_status'].str.strip()

In [164]:
loan_df['loan_status'] = loan_df['loan_status'].map({'Approved':1,'Rejected':0})
loan_df

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,Graduate,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,1
1,0,Not Graduate,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,0
2,3,Graduate,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,0
3,3,Graduate,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,0
4,5,Not Graduate,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,0
...,...,...,...,...,...,...,...,...,...,...,...,...
4264,5,Graduate,Yes,1000000,2300000,12,317,2800000,500000,3300000,800000,0
4265,0,Not Graduate,Yes,3300000,11300000,20,559,4200000,2900000,11000000,1900000,1
4266,2,Not Graduate,No,6500000,23900000,18,457,1200000,12400000,18100000,7300000,0
4267,1,Not Graduate,No,4100000,12800000,8,780,8200000,700000,14100000,5800000,1


In [165]:
loan_df['education'].value_counts()

Graduate        2144
Not Graduate    2125
Name: education, dtype: int64

In [168]:
education = pd.get_dummies(loan_df['education'])
education.drop(columns = 'Not Graduate',inplace = True)
education

Unnamed: 0,Graduate
0,1
1,0
2,1
3,1
4,0
...,...
4264,1
4265,0
4266,0
4267,0


In [170]:
loan_df.drop(columns = ['education'],inplace = True)
loan_df = pd.concat([loan_df,education],axis = 'columns')
loan_df

Unnamed: 0,no_of_dependents,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status,Graduate
0,2,No,9600000,29900000,12,778,2400000,17600000,22700000,8000000,1,1
1,0,Yes,4100000,12200000,8,417,2700000,2200000,8800000,3300000,0,0
2,3,No,9100000,29700000,20,506,7100000,4500000,33300000,12800000,0,1
3,3,No,8200000,30700000,8,467,18200000,3300000,23300000,7900000,0,1
4,5,Yes,9800000,24200000,20,382,12400000,8200000,29400000,5000000,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
4264,5,Yes,1000000,2300000,12,317,2800000,500000,3300000,800000,0,1
4265,0,Yes,3300000,11300000,20,559,4200000,2900000,11000000,1900000,1,0
4266,2,No,6500000,23900000,18,457,1200000,12400000,18100000,7300000,0,0
4267,1,No,4100000,12800000,8,780,8200000,700000,14100000,5800000,1,0


In [177]:
loan_df['self_employed'].value_counts()

Yes    2150
No     2119
Name: self_employed, dtype: int64

In [178]:
loan_df['self_employed'] = loan_df['self_employed'].map({'Yes':1,'No':0})

In [179]:
X = loan_df.drop(columns = 'loan_status')
y = loan_df['loan_status']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(3415, 11)
(854, 11)
(3415,)
(854,)


In [180]:
rf_clf.fit(X_train,y_train)

In [181]:
y_predicted = rf_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       0.98      0.98      0.98       318
           1       0.99      0.99      0.99       536

    accuracy                           0.98       854
   macro avg       0.98      0.98      0.98       854
weighted avg       0.98      0.98      0.98       854



In [182]:
bagging_dt_clf.fit(X_train,y_train)



In [183]:
y_predicted = bagging_dt_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       0.98      0.97      0.97       318
           1       0.98      0.99      0.98       536

    accuracy                           0.98       854
   macro avg       0.98      0.98      0.98       854
weighted avg       0.98      0.98      0.98       854



In [184]:
bagging_nb_clf.fit(X_train,y_train)



In [185]:
y_predicted = bagging_nb_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       0.91      0.42      0.58       318
           1       0.74      0.98      0.84       536

    accuracy                           0.77       854
   macro avg       0.83      0.70      0.71       854
weighted avg       0.80      0.77      0.74       854



In [186]:
adaboost_dt_clf.fit(X_train,y_train)



In [187]:
y_predicted = adaboost_dt_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       0.98      0.96      0.97       318
           1       0.98      0.99      0.98       536

    accuracy                           0.98       854
   macro avg       0.98      0.98      0.98       854
weighted avg       0.98      0.98      0.98       854



In [188]:
adaboost_nb_clf.fit(X_train,y_train)



In [189]:
y_predicted = adaboost_nb_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       0.48      0.18      0.26       318
           1       0.64      0.89      0.75       536

    accuracy                           0.62       854
   macro avg       0.56      0.53      0.50       854
weighted avg       0.58      0.62      0.57       854



# Applying Models on Pistachio Types Ditection dataset

In [190]:
pistachio_df = pd.read_csv('Data/pistachio.csv')
pistachio_df

Unnamed: 0,AREA,PERIMETER,MAJOR_AXIS,MINOR_AXIS,ECCENTRICITY,EQDIASQ,SOLIDITY,CONVEX_AREA,EXTENT,ASPECT_RATIO,ROUNDNESS,COMPACTNESS,SHAPEFACTOR_1,SHAPEFACTOR_2,SHAPEFACTOR_3,SHAPEFACTOR_4,Class
0,73107,1161.8070,442.4074,217.7261,0.8705,305.0946,0.9424,77579,0.7710,2.0319,0.6806,0.6896,0.0061,0.0030,0.4756,0.9664,Kirmizi_Pistachio
1,89272,1173.1810,460.2551,251.9546,0.8369,337.1419,0.9641,92598,0.7584,1.8267,0.8151,0.7325,0.0052,0.0028,0.5366,0.9802,Siit_Pistachio
2,60955,999.7890,386.9247,209.1255,0.8414,278.5863,0.9465,64400,0.7263,1.8502,0.7663,0.7200,0.0063,0.0034,0.5184,0.9591,Kirmizi_Pistachio
3,79537,1439.5129,466.7973,221.2136,0.8806,318.2289,0.9437,84281,0.7568,2.1102,0.4823,0.6817,0.0059,0.0028,0.4648,0.9807,Kirmizi_Pistachio
4,96395,1352.6740,515.8730,246.5945,0.8784,350.3340,0.9549,100950,0.7428,2.0920,0.6620,0.6791,0.0054,0.0026,0.4612,0.9648,Kirmizi_Pistachio
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1713,65570,2071.4451,418.0258,217.1458,0.8545,288.9400,0.8976,73054,0.5945,1.9251,0.1920,0.6912,0.0064,0.0033,0.4778,0.9197,Kirmizi_Pistachio
1714,68849,1441.2590,451.0457,205.2553,0.8905,296.0764,0.9340,73716,0.6459,2.1975,0.4165,0.6564,0.0066,0.0030,0.4309,0.9469,Kirmizi_Pistachio
1715,90270,1370.5380,428.9636,269.8232,0.7774,339.0211,0.9722,92847,0.7400,1.5898,0.6039,0.7903,0.0048,0.0030,0.6246,0.9930,Siit_Pistachio
1716,73148,1309.8430,469.0491,208.3141,0.8960,305.1801,0.9376,78014,0.6341,2.2516,0.5358,0.6506,0.0064,0.0028,0.4233,0.9532,Kirmizi_Pistachio


In [191]:
pistachio_df.isna().sum()

AREA             0
PERIMETER        0
MAJOR_AXIS       0
MINOR_AXIS       0
ECCENTRICITY     0
EQDIASQ          0
SOLIDITY         0
CONVEX_AREA      0
EXTENT           0
ASPECT_RATIO     0
ROUNDNESS        0
COMPACTNESS      0
SHAPEFACTOR_1    0
SHAPEFACTOR_2    0
SHAPEFACTOR_3    0
SHAPEFACTOR_4    0
Class            0
dtype: int64

In [193]:
pistachio_df['Class'].value_counts()

Kirmizi_Pistachio    998
Siit_Pistachio       720
Name: Class, dtype: int64

In [194]:
pistachio_df['Class'] = pistachio_df['Class'].map({'Kirmizi_Pistachio':1,'Siit_Pistachio':2})
pistachio_df

Unnamed: 0,AREA,PERIMETER,MAJOR_AXIS,MINOR_AXIS,ECCENTRICITY,EQDIASQ,SOLIDITY,CONVEX_AREA,EXTENT,ASPECT_RATIO,ROUNDNESS,COMPACTNESS,SHAPEFACTOR_1,SHAPEFACTOR_2,SHAPEFACTOR_3,SHAPEFACTOR_4,Class
0,73107,1161.8070,442.4074,217.7261,0.8705,305.0946,0.9424,77579,0.7710,2.0319,0.6806,0.6896,0.0061,0.0030,0.4756,0.9664,1
1,89272,1173.1810,460.2551,251.9546,0.8369,337.1419,0.9641,92598,0.7584,1.8267,0.8151,0.7325,0.0052,0.0028,0.5366,0.9802,2
2,60955,999.7890,386.9247,209.1255,0.8414,278.5863,0.9465,64400,0.7263,1.8502,0.7663,0.7200,0.0063,0.0034,0.5184,0.9591,1
3,79537,1439.5129,466.7973,221.2136,0.8806,318.2289,0.9437,84281,0.7568,2.1102,0.4823,0.6817,0.0059,0.0028,0.4648,0.9807,1
4,96395,1352.6740,515.8730,246.5945,0.8784,350.3340,0.9549,100950,0.7428,2.0920,0.6620,0.6791,0.0054,0.0026,0.4612,0.9648,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1713,65570,2071.4451,418.0258,217.1458,0.8545,288.9400,0.8976,73054,0.5945,1.9251,0.1920,0.6912,0.0064,0.0033,0.4778,0.9197,1
1714,68849,1441.2590,451.0457,205.2553,0.8905,296.0764,0.9340,73716,0.6459,2.1975,0.4165,0.6564,0.0066,0.0030,0.4309,0.9469,1
1715,90270,1370.5380,428.9636,269.8232,0.7774,339.0211,0.9722,92847,0.7400,1.5898,0.6039,0.7903,0.0048,0.0030,0.6246,0.9930,2
1716,73148,1309.8430,469.0491,208.3141,0.8960,305.1801,0.9376,78014,0.6341,2.2516,0.5358,0.6506,0.0064,0.0028,0.4233,0.9532,1


In [195]:
X = pistachio_df.drop(columns = 'Class')
y = pistachio_df['Class']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1374, 16)
(344, 16)
(1374,)
(344,)


In [196]:
rf_clf.fit(X_train,y_train)

In [197]:
y_predicted = rf_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           1       0.87      0.90      0.88       201
           2       0.85      0.80      0.82       143

    accuracy                           0.86       344
   macro avg       0.86      0.85      0.85       344
weighted avg       0.86      0.86      0.86       344



In [198]:
bagging_dt_clf.fit(X_train,y_train)



In [199]:
y_predicted = bagging_dt_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           1       0.88      0.91      0.90       201
           2       0.87      0.83      0.85       143

    accuracy                           0.88       344
   macro avg       0.88      0.87      0.87       344
weighted avg       0.88      0.88      0.88       344



In [200]:
bagging_nb_clf.fit(X_train,y_train)



In [201]:
y_predicted = bagging_nb_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           1       0.85      0.76      0.81       201
           2       0.71      0.82      0.76       143

    accuracy                           0.78       344
   macro avg       0.78      0.79      0.78       344
weighted avg       0.79      0.78      0.79       344



In [202]:
adaboost_dt_clf.fit(X_train,y_train)



In [203]:
y_predicted = adaboost_dt_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           1       0.84      0.86      0.85       201
           2       0.79      0.77      0.78       143

    accuracy                           0.82       344
   macro avg       0.82      0.81      0.81       344
weighted avg       0.82      0.82      0.82       344



In [204]:
adaboost_nb_clf.fit(X_train,y_train)



In [205]:
y_predicted = adaboost_nb_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           1       0.96      0.12      0.22       201
           2       0.45      0.99      0.62       143

    accuracy                           0.49       344
   macro avg       0.70      0.56      0.42       344
weighted avg       0.75      0.49      0.38       344



# Applying Model on Heart Failure Prediction dataset

In [227]:
heart_df = pd.read_csv('Data/heart.csv')
heart_df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,M,TA,110,264,0,Normal,132,N,1.2,Flat,1
914,68,M,ASY,144,193,1,Normal,141,N,3.4,Flat,1
915,57,M,ASY,130,131,0,Normal,115,Y,1.2,Flat,1
916,57,F,ATA,130,236,0,LVH,174,N,0.0,Flat,1


In [228]:
heart_df.isna().sum()

Age               0
Sex               0
ChestPainType     0
RestingBP         0
Cholesterol       0
FastingBS         0
RestingECG        0
MaxHR             0
ExerciseAngina    0
Oldpeak           0
ST_Slope          0
HeartDisease      0
dtype: int64

In [229]:
heart_df['Sex'].value_counts()

M    725
F    193
Name: Sex, dtype: int64

In [230]:
Sex = pd.get_dummies(heart_df['Sex'])
Sex.drop(columns = ['M'],inplace = True)
Sex

Unnamed: 0,F
0,0
1,1
2,0
3,1
4,0
...,...
913,0
914,0
915,0
916,1


In [231]:
heart_df['ChestPainType'].value_counts()

ASY    496
NAP    203
ATA    173
TA      46
Name: ChestPainType, dtype: int64

In [232]:
ChestPainType = pd.get_dummies(heart_df['ChestPainType'])
ChestPainType.drop(columns = ['ASY'],inplace = True)
ChestPainType

Unnamed: 0,ATA,NAP,TA
0,1,0,0
1,0,1,0
2,1,0,0
3,0,0,0
4,0,1,0
...,...,...,...
913,0,0,1
914,0,0,0
915,0,0,0
916,1,0,0


In [233]:
heart_df['RestingECG'].value_counts()

Normal    552
LVH       188
ST        178
Name: RestingECG, dtype: int64

In [234]:
RestingECG = pd.get_dummies(heart_df['RestingECG'])
RestingECG.drop(columns = ['Normal'],inplace = True)
RestingECG

Unnamed: 0,LVH,ST
0,0,0
1,0,0
2,0,1
3,0,0
4,0,0
...,...,...
913,0,0
914,0,0
915,0,0
916,1,0


In [235]:
heart_df['ExerciseAngina'].value_counts()

N    547
Y    371
Name: ExerciseAngina, dtype: int64

In [236]:
ExerciseAngina = pd.get_dummies(heart_df['ExerciseAngina'])
ExerciseAngina.drop(columns = ['Y'],inplace = True)
ExerciseAngina

Unnamed: 0,N
0,1
1,1
2,1
3,0
4,1
...,...
913,1
914,1
915,0
916,1


In [237]:
heart_df['ST_Slope'].value_counts()

Flat    460
Up      395
Down     63
Name: ST_Slope, dtype: int64

In [238]:
ST_Slope = pd.get_dummies(heart_df['ST_Slope'])
ST_Slope.drop(columns = ['Flat'],inplace = True)
ST_Slope

Unnamed: 0,Down,Up
0,0,1
1,0,0
2,0,1
3,0,0
4,0,1
...,...,...
913,0,0
914,0,0
915,0,0
916,0,0


In [240]:
heart_df.drop(columns = ['Sex','ChestPainType','ExerciseAngina','RestingECG','ST_Slope'],inplace = True)
heart_df = pd.concat([heart_df,Sex,ChestPainType,ExerciseAngina,RestingECG,ST_Slope],axis = 'columns')
heart_df

Unnamed: 0,Age,RestingBP,Cholesterol,FastingBS,MaxHR,Oldpeak,HeartDisease,F,ATA,NAP,TA,N,LVH,ST,Down,Up
0,40,140,289,0,172,0.0,0,0,1,0,0,1,0,0,0,1
1,49,160,180,0,156,1.0,1,1,0,1,0,1,0,0,0,0
2,37,130,283,0,98,0.0,0,0,1,0,0,1,0,1,0,1
3,48,138,214,0,108,1.5,1,1,0,0,0,0,0,0,0,0
4,54,150,195,0,122,0.0,0,0,0,1,0,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,110,264,0,132,1.2,1,0,0,0,1,1,0,0,0,0
914,68,144,193,1,141,3.4,1,0,0,0,0,1,0,0,0,0
915,57,130,131,0,115,1.2,1,0,0,0,0,0,0,0,0,0
916,57,130,236,0,174,0.0,1,1,1,0,0,1,1,0,0,0


In [241]:
X = heart_df.drop(columns = 'HeartDisease')
y = heart_df['HeartDisease']

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2, random_state=42)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(734, 15)
(184, 15)
(734,)
(184,)


In [242]:
rf_clf.fit(X_train,y_train)

In [243]:
y_predicted = rf_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       0.82      0.91      0.86        77
           1       0.93      0.86      0.89       107

    accuracy                           0.88       184
   macro avg       0.88      0.88      0.88       184
weighted avg       0.89      0.88      0.88       184



In [244]:
bagging_dt_clf.fit(X_train,y_train)



In [245]:
y_predicted = bagging_dt_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       0.75      0.86      0.80        77
           1       0.89      0.79      0.84       107

    accuracy                           0.82       184
   macro avg       0.82      0.83      0.82       184
weighted avg       0.83      0.82      0.82       184



In [246]:
bagging_nb_clf.fit(X_train,y_train)



In [247]:
y_predicted = bagging_nb_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       0.77      0.87      0.82        77
           1       0.90      0.81      0.85       107

    accuracy                           0.84       184
   macro avg       0.83      0.84      0.84       184
weighted avg       0.84      0.84      0.84       184



In [248]:
adaboost_dt_clf.fit(X_train,y_train)



In [249]:
y_predicted = adaboost_dt_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       0.81      0.82      0.81        77
           1       0.87      0.86      0.86       107

    accuracy                           0.84       184
   macro avg       0.84      0.84      0.84       184
weighted avg       0.84      0.84      0.84       184



In [250]:
adaboost_nb_clf.fit(X_train,y_train)



In [251]:
y_predicted = adaboost_nb_clf.predict(X_test)
cr = classification_report(y_test,y_predicted)
print(cr)

              precision    recall  f1-score   support

           0       0.40      0.64      0.49        77
           1       0.55      0.32      0.40       107

    accuracy                           0.45       184
   macro avg       0.48      0.48      0.45       184
weighted avg       0.49      0.45      0.44       184

