In [None]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score

In [None]:
# Load the placement dataset from its CSV file.
placement_csv = "/content/placementdata.csv"
data1 = pd.read_csv(placement_csv)

In [None]:
# Preview the first rows of the raw placement data as plain text.
first_rows = data1.head()
print(first_rows)

   StudentID  CGPA  Internships  Projects  Workshops/Certifications  \
0          1   7.5            1         1                         1   
1          2   8.9            0         3                         2   
2          3   7.3            1         2                         2   
3          4   7.5            1         1                         2   
4          5   8.3            1         2                         2   

   AptitudeTestScore  SoftSkillsRating ExtracurricularActivities  \
0                 65               4.4                        No   
1                 90               4.0                       Yes   
2                 82               4.8                       Yes   
3                 85               4.4                       Yes   
4                 86               4.5                       Yes   

  PlacementTraining  SSC_Marks  HSC_Marks PlacementStatus  
0                No         61         79       NotPlaced  
1               Yes         78         82   

In [None]:
# Count missing values in each column.
missing_per_column = data1.isnull().sum()
print(missing_per_column)

StudentID                    0
CGPA                         0
Internships                  0
Projects                     0
Workshops/Certifications     0
AptitudeTestScore            0
SoftSkillsRating             0
ExtracurricularActivities    0
PlacementTraining            0
SSC_Marks                    0
HSC_Marks                    0
PlacementStatus              0
dtype: int64


In [None]:
# Summary statistics as reported by DataFrame.describe().
summary_stats = data1.describe()
print(summary_stats)

         StudentID          CGPA   Internships      Projects  \
count  10000.00000  10000.000000  10000.000000  10000.000000   
mean    5000.50000      7.698010      1.049200      2.026600   
std     2886.89568      0.640131      0.665901      0.867968   
min        1.00000      6.500000      0.000000      0.000000   
25%     2500.75000      7.400000      1.000000      1.000000   
50%     5000.50000      7.700000      1.000000      2.000000   
75%     7500.25000      8.200000      1.000000      3.000000   
max    10000.00000      9.100000      2.000000      3.000000   

       Workshops/Certifications  AptitudeTestScore  SoftSkillsRating  \
count              10000.000000       10000.000000      10000.000000   
mean                   1.013200          79.449900          4.323960   
std                    0.904272           8.159997          0.411622   
min                    0.000000          60.000000          3.000000   
25%                    0.000000          73.000000          4.0

In [None]:
# Replace the three text columns with integer codes, keeping one fitted
# LabelEncoder per column under its original name.
extracurricular = LabelEncoder()
placementtraining = LabelEncoder()
placementstatus = LabelEncoder()

for column, encoder in (
    ('ExtracurricularActivities', extracurricular),
    ('PlacementTraining', placementtraining),
    ('PlacementStatus', placementstatus),
):
    data1[column] = encoder.fit_transform(data1[column])

In [None]:
# Preview again now that the text columns hold integer codes.
encoded_preview = data1.head()
print(encoded_preview)

   StudentID  CGPA  Internships  Projects  Workshops/Certifications  \
0          1   7.5            1         1                         1   
1          2   8.9            0         3                         2   
2          3   7.3            1         2                         2   
3          4   7.5            1         1                         2   
4          5   8.3            1         2                         2   

   AptitudeTestScore  SoftSkillsRating  ExtracurricularActivities  \
0                 65               4.4                          0   
1                 90               4.0                          1   
2                 82               4.8                          1   
3                 85               4.4                          1   
4                 86               4.5                          1   

   PlacementTraining  SSC_Marks  HSC_Marks  PlacementStatus  
0                  0         61         79                0  
1                  1         78   

In [None]:
# Predictors for the first experiment. StudentID, PlacementTraining, SSC_Marks
# and HSC_Marks are left out.
# NOTE(review): PlacementTraining is label-encoded in an earlier cell but is not
# used as a feature here - confirm the omission is intentional.
feature_cols = [
    'CGPA',
    'Internships',
    'Projects',
    'Workshops/Certifications',
    'AptitudeTestScore',
    'SoftSkillsRating',
    'ExtracurricularActivities',
]

# Target first, then the feature matrix; the two selections are independent.
Y1 = data1['PlacementStatus']
X1 = data1[feature_cols]

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X1, Y1, random_state=1, test_size=0.3)
X_train, X_test, Y_train, Y_test = split_parts

In [None]:
# Baseline decision tree with a very generous depth limit (max_depth=100).
# random_state is fixed because scikit-learn randomly permutes the candidate
# features at every split, so an unseeded tree can differ from run to run even
# on identical training data.
dt = DecisionTreeClassifier(max_depth=100, random_state=1)
dt.fit(X_train, Y_train)

In [None]:
# Predict on both partitions so training and testing accuracy can be compared.
dt_test_pred = dt.predict(X_test)
dt_train_pred = dt.predict(X_train)

In [None]:
# Score both partitions first, then report them together.
train_acc = accuracy_score(y_true=Y_train, y_pred=dt_train_pred)
test_acc = accuracy_score(y_true=Y_test, y_pred=dt_test_pred)

print(f"Training accuracy using decision tree: {train_acc:.4f}")
print(f"Testing accuracy using decision tree: {test_acc:.4f}")

Training accuracy using decision tree: 0.9841
Testing accuracy using decision tree: 0.7123


Overfitting occurs: training accuracy (0.9841) is far higher than testing accuracy (0.7123).

Modifying the max depth parameter

In [None]:
# Sweep max_depth to see how limiting tree depth affects the train/test gap.
print("Decision Tree with Different max_depth Values:")
for depth in [3, 5, 7, 10]:
    # A loop-local name keeps the baseline tree bound to `dt` intact; rebinding
    # `dt` here would make later cells that read `dt` silently pick up the last
    # tree of this sweep instead. random_state makes each fit reproducible.
    depth_tree = DecisionTreeClassifier(max_depth=depth, random_state=1)
    depth_tree.fit(X_train, Y_train)
    dt_train_pred = depth_tree.predict(X_train)
    dt_test_pred = depth_tree.predict(X_test)
    train_acc = accuracy_score(Y_train, dt_train_pred)
    test_acc = accuracy_score(Y_test, dt_test_pred)
    print(f"max_depth = {depth}: Training Accuracy = {train_acc:.4f}, Testing Accuracy = {test_acc:.4f}")

Decision Tree with Different max_depth Values:
max_depth = 3: Training Accuracy = 0.7774, Testing Accuracy = 0.7797
max_depth = 5: Training Accuracy = 0.7936, Testing Accuracy = 0.7793
max_depth = 7: Training Accuracy = 0.8091, Testing Accuracy = 0.7790
max_depth = 10: Training Accuracy = 0.8483, Testing Accuracy = 0.7673


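Best setting: max_depth = 3 (Training Accuracy = 0.7774, Testing Accuracy = 0.7797), which gives the highest testing accuracy of the depths tried, with training and testing accuracy nearly equal.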

Random Forest

In [None]:
# Baseline forest: 100 trees with a very generous depth limit.
# random_state seeds the bootstrap sampling and the per-split feature sampling,
# so the fitted forest (and the accuracies reported from it) are reproducible.
random1 = RandomForestClassifier(n_estimators=100, max_depth=100, random_state=1)
random1.fit(X_train, Y_train)

In [None]:
# Accuracy of the baseline forest on the rows it was trained on.
random_train_pred = random1.predict(X_train)
print(
    "Training accuracy for Random forest without feature importance: ",
    accuracy_score(y_true=Y_train, y_pred=random_train_pred),
)

Training accuracy for Random forest without feature importance:  0.9841428571428571


In [None]:
# Accuracy of the baseline forest on the held-out rows.
random_test_pred = random1.predict(X_test)
print(
    "Testing accuracy for Random forest without feature importance: ",
    accuracy_score(y_true=Y_test, y_pred=random_test_pred),
)

Testing accuracy for Random forest without feature importance:  0.774


Overfitting: training accuracy (0.9841) is far higher than testing accuracy (0.774).

In [None]:
# Sweep the number of trees (1-9) with depth capped at 4.
print("Random Forest with Different n_estimators Values")
for estimators in range(1, 10, 1):
    # Fixed random_state: without it each forest draws different bootstrap
    # samples on every run, so differences between rows of this table could be
    # seed noise rather than the effect of n_estimators.
    rf = RandomForestClassifier(n_estimators=estimators, max_depth=4, random_state=1)
    rf.fit(X_train, Y_train)
    rf_train_pred = rf.predict(X_train)
    rf_test_pred = rf.predict(X_test)
    train_acc = accuracy_score(Y_train, rf_train_pred)
    test_acc = accuracy_score(Y_test, rf_test_pred)
    print(f"n_estimators = {estimators}: Training Accuracy = {train_acc:.4f}, Testing Accuracy = {test_acc:.4f}")

Random Forest with Different n_estimators Values
n_estimators = 1: Training Accuracy = 0.7693, Testing Accuracy = 0.7803
n_estimators = 2: Training Accuracy = 0.7836, Testing Accuracy = 0.7843
n_estimators = 3: Training Accuracy = 0.7876, Testing Accuracy = 0.7937
n_estimators = 4: Training Accuracy = 0.7874, Testing Accuracy = 0.7977
n_estimators = 5: Training Accuracy = 0.7889, Testing Accuracy = 0.7997
n_estimators = 6: Training Accuracy = 0.7890, Testing Accuracy = 0.7953
n_estimators = 7: Training Accuracy = 0.7920, Testing Accuracy = 0.7813
n_estimators = 8: Training Accuracy = 0.7916, Testing Accuracy = 0.7970
n_estimators = 9: Training Accuracy = 0.7929, Testing Accuracy = 0.7897


**With Feature Importance**

Decision Tree

In [None]:
# Rank the features by the importance scores of the tree currently bound to `dt`
# (whichever fit most recently assigned that name), then list the frame's columns.
importance_by_feature = pd.Series(dt.feature_importances_, index=feature_cols)
feature_imp = importance_by_feature.sort_values(ascending=False)
print(feature_imp)
print(data1.columns)

ExtracurricularActivities    0.411554
AptitudeTestScore            0.270015
CGPA                         0.111713
SoftSkillsRating             0.094330
Workshops/Certifications     0.045163
Projects                     0.041737
Internships                  0.025489
dtype: float64
Index(['StudentID', 'CGPA', 'Internships', 'Projects',
       'Workshops/Certifications', 'AptitudeTestScore', 'SoftSkillsRating',
       'ExtracurricularActivities', 'PlacementTraining', 'SSC_Marks',
       'HSC_Marks', 'PlacementStatus'],
      dtype='object')


In [None]:
# Keep only the four highest-ranked features and redo the 70/30 split with the same seed.
fea_cols = [
    'CGPA',
    'AptitudeTestScore',
    'SoftSkillsRating',
    'ExtracurricularActivities',
]
Y = data1['PlacementStatus']
X = data1[fea_cols]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=1, test_size=0.3)

In [None]:
# Refit the deep baseline tree on the reduced feature set.
# random_state is fixed because scikit-learn randomly permutes candidate features
# at each split, so an unseeded tree is not reproducible.
dt = DecisionTreeClassifier(max_depth=100, random_state=1)
dt.fit(X_train, Y_train)

In [None]:
# Training-set accuracy of the tree fitted on the reduced feature set.
dt_train_pred = dt.predict(X_train)
train_acc = accuracy_score(y_true=Y_train, y_pred=dt_train_pred)
print(f"Training accuracy using decision tree: {train_acc:.4f}")

Training accuracy using decision tree: 0.9189


In [None]:
# Held-out accuracy of the tree fitted on the reduced feature set.
dt_test_pred = dt.predict(X_test)
test_acc = accuracy_score(y_true=Y_test, y_pred=dt_test_pred)
print(f"Testing accuracy using decision tree: {test_acc:.4f}")

Testing accuracy using decision tree: 0.7153


Overfitting: training accuracy (0.9189) is far higher than testing accuracy (0.7153).

In [None]:
# Sweep max_depth from 1 to 9 on the reduced feature set.
print("Decision Tree with Different max_depth Values:")
for depth in range(1,10,1):
    # Fixed random_state so each row of the table is reproducible; scikit-learn
    # permutes candidate features at every split when no seed is given.
    dt = DecisionTreeClassifier(max_depth=depth, random_state=1)
    dt.fit(X_train, Y_train)
    dt_train_pred = dt.predict(X_train)
    dt_test_pred = dt.predict(X_test)
    train_acc = accuracy_score(Y_train, dt_train_pred)
    test_acc = accuracy_score(Y_test, dt_test_pred)
    print(f"max_depth = {depth}: Training Accuracy = {train_acc:.4f}, Testing Accuracy = {test_acc:.4f}")

Decision Tree with Different max_depth Values:
max_depth = 1: Training Accuracy = 0.7211, Testing Accuracy = 0.7203
max_depth = 2: Training Accuracy = 0.7637, Testing Accuracy = 0.7703
max_depth = 3: Training Accuracy = 0.7774, Testing Accuracy = 0.7797
max_depth = 4: Training Accuracy = 0.7860, Testing Accuracy = 0.7873
max_depth = 5: Training Accuracy = 0.7899, Testing Accuracy = 0.7743
max_depth = 6: Training Accuracy = 0.7936, Testing Accuracy = 0.7853
max_depth = 7: Training Accuracy = 0.8013, Testing Accuracy = 0.7847
max_depth = 8: Training Accuracy = 0.8101, Testing Accuracy = 0.7773
max_depth = 9: Training Accuracy = 0.8187, Testing Accuracy = 0.7700


In [None]:
# Forest of 100 deep trees on the reduced feature set.
# random_state seeds the bootstrap sampling and per-split feature sampling so the
# fitted forest is reproducible.
random2 = RandomForestClassifier(n_estimators=100, max_depth=100, random_state=1)
random2.fit(X_train, Y_train)

In [None]:
# Training-set accuracy of the forest fitted on the reduced feature set.
random2_train_pred = random2.predict(X_train)
train_acc2 = accuracy_score(y_true=Y_train, y_pred=random2_train_pred)
print(f"Training accuracy for Random forest with feature importance: {train_acc2:.4f}")

Training accuracy for Random forest with feature importance: 0.9187


In [None]:
# Held-out accuracy of the forest fitted on the reduced feature set.
random2_test_pred = random2.predict(X_test)
test_acc2 = accuracy_score(y_true=Y_test, y_pred=random2_test_pred)
print(f"Testing accuracy for Random forest with feature importance: {test_acc2:.4f}")

Testing accuracy for Random forest with feature importance: 0.7477


Overfitting: training accuracy (0.9187) is far higher than testing accuracy (0.7477).

In [None]:
# Sweep the number of trees (1-9) with depth capped at 4 on the reduced feature set.
print("Random Forest with Different n_estimators Values")
for estimators in range(1, 10, 1):
    # Fixed random_state: otherwise every run draws different bootstrap samples
    # and the rows of this table are not comparable between runs.
    rf = RandomForestClassifier(n_estimators=estimators, max_depth=4, random_state=1)
    rf.fit(X_train, Y_train)
    rf_train_pred = rf.predict(X_train)
    rf_test_pred = rf.predict(X_test)
    train_acc = accuracy_score(Y_train, rf_train_pred)
    test_acc = accuracy_score(Y_test, rf_test_pred)
    print(f"n_estimators = {estimators}: Training Accuracy = {train_acc:.4f}, Testing Accuracy = {test_acc:.4f}")

Random Forest with Different n_estimators Values
n_estimators = 1: Training Accuracy = 0.7794, Testing Accuracy = 0.7780
n_estimators = 2: Training Accuracy = 0.7841, Testing Accuracy = 0.7787
n_estimators = 3: Training Accuracy = 0.7880, Testing Accuracy = 0.7823
n_estimators = 4: Training Accuracy = 0.7790, Testing Accuracy = 0.7787
n_estimators = 5: Training Accuracy = 0.7839, Testing Accuracy = 0.7827
n_estimators = 6: Training Accuracy = 0.7881, Testing Accuracy = 0.7790
n_estimators = 7: Training Accuracy = 0.7896, Testing Accuracy = 0.7823
n_estimators = 8: Training Accuracy = 0.7883, Testing Accuracy = 0.7817
n_estimators = 9: Training Accuracy = 0.7857, Testing Accuracy = 0.7873


**IRIS DATASET**

Without Feature Importance

In [None]:
# Load the Iris dataset from its CSV file.
iris_csv = "/content/Iris.csv"
data2 = pd.read_csv(iris_csv)

In [None]:
# Preview the first rows of the raw Iris data as plain text.
iris_preview = data2.head()
print(iris_preview)

   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa


In [None]:
# List the column labels of the Iris frame.
iris_columns = data2.columns
print(iris_columns)

Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')


In [None]:
# Summary statistics as reported by DataFrame.describe().
iris_summary = data2.describe()
print(iris_summary)

               Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm
count  150.000000     150.000000    150.000000     150.000000    150.000000
mean    75.500000       5.843333      3.054000       3.758667      1.198667
std     43.445368       0.828066      0.433594       1.764420      0.763161
min      1.000000       4.300000      2.000000       1.000000      0.100000
25%     38.250000       5.100000      2.800000       1.600000      0.300000
50%     75.500000       5.800000      3.000000       4.350000      1.300000
75%    112.750000       6.400000      3.300000       5.100000      1.800000
max    150.000000       7.900000      4.400000       6.900000      2.500000


In [None]:
# Count missing values in each Iris column.
iris_missing = data2.isnull().sum()
print(iris_missing)

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64


In [None]:
# Turn the species names into integer class codes; `le` keeps the fitted mapping.
le = LabelEncoder()
species_codes = le.fit_transform(data2['Species'])
data2['Species'] = species_codes

In [None]:
# Preview again now that Species holds integer codes.
iris_encoded_preview = data2.head()
print(iris_encoded_preview)

   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm  Species
0   1            5.1           3.5            1.4           0.2        0
1   2            4.9           3.0            1.4           0.2        0
2   3            4.7           3.2            1.3           0.2        0
3   4            4.6           3.1            1.5           0.2        0
4   5            5.0           3.6            1.4           0.2        0


In [None]:
# The four measurement columns are the predictors; Id is left out and Species is the target.
feature_cols = [
    'SepalLengthCm',
    'SepalWidthCm',
    'PetalLengthCm',
    'PetalWidthCm',
]
Y = data2['Species']
X = data2[feature_cols]

In [None]:
# Hold out 30% of the Iris rows for testing; the fixed seed keeps the split repeatable.
iris_parts = train_test_split(X, Y, random_state=1, test_size=0.3)
X_train, X_test, Y_train, Y_test = iris_parts

In [None]:
# Baseline Iris tree with a very generous depth limit.
# random_state is fixed because scikit-learn randomly permutes candidate features
# at each split, so an unseeded tree (and the feature importances read from it
# later) can change between runs.
dt2 = DecisionTreeClassifier(max_depth=100, random_state=1)
dt2.fit(X_train, Y_train)

In [None]:
# Training-set accuracy of the baseline Iris tree.
dt_train_pred2 = dt2.predict(X_train)
train_acc2 = accuracy_score(y_true=Y_train, y_pred=dt_train_pred2)
print(f"Training accuracy using decision tree: {train_acc2:.4f}")

Training accuracy using decision tree: 1.0000


In [None]:
# Held-out accuracy of the baseline Iris tree.
dt_test_pred2 = dt2.predict(X_test)
test_acc2 = accuracy_score(y_true=Y_test, y_pred=dt_test_pred2)
print(f"Testing accuracy using decision tree: {test_acc2:.4f}")

Testing accuracy using decision tree: 0.9556


In [None]:
# Sweep max_depth from 1 to 9 on the Iris split. The loop uses its own name
# (`dt3`) so the baseline tree in `dt2` is left untouched.
print("Decision Tree with Different max_depth Values")
for depth in range(1,10,1):
    # Fixed random_state so every row of the table is reproducible.
    dt3 = DecisionTreeClassifier(max_depth=depth, random_state=1)
    dt3.fit(X_train, Y_train)
    dt_train_pred = dt3.predict(X_train)
    dt_test_pred = dt3.predict(X_test)
    train_acc = accuracy_score(Y_train, dt_train_pred)
    test_acc = accuracy_score(Y_test, dt_test_pred)
    print(f"max_depth = {depth}: Training Accuracy = {train_acc:.4f}, Testing Accuracy = {test_acc:.4f}")

Decision Tree with Different max_depth Values
max_depth = 1: Training Accuracy = 0.6952, Testing Accuracy = 0.6000
max_depth = 2: Training Accuracy = 0.9619, Testing Accuracy = 0.9556
max_depth = 3: Training Accuracy = 0.9810, Testing Accuracy = 0.9556
max_depth = 4: Training Accuracy = 1.0000, Testing Accuracy = 0.9556
max_depth = 5: Training Accuracy = 1.0000, Testing Accuracy = 0.9556
max_depth = 6: Training Accuracy = 1.0000, Testing Accuracy = 0.9556
max_depth = 7: Training Accuracy = 1.0000, Testing Accuracy = 0.9556
max_depth = 8: Training Accuracy = 1.0000, Testing Accuracy = 0.9556
max_depth = 9: Training Accuracy = 1.0000, Testing Accuracy = 0.9556


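Max depth = 2 is sufficient: testing accuracy stays at 0.9556 for every depth from 2 to 9, and depth 2 has the smallest gap to training accuracy (0.9619).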

Random Forest

In [None]:
# Baseline Iris forest: 100 trees with a very generous depth limit.
# random_state seeds the bootstrap sampling and per-split feature sampling so the
# fitted forest is reproducible.
random3 = RandomForestClassifier(n_estimators=100, max_depth=100, random_state=1)
random3.fit(X_train, Y_train)

In [None]:
# Training-set accuracy of the baseline Iris forest.
random_train_pred3 = random3.predict(X_train)
train_acc3 = accuracy_score(y_true=Y_train, y_pred=random_train_pred3)
print(f"Training accuracy for Random forest without feature importance: {train_acc3:.4f}")

Training accuracy for Random forest without feature importance: 1.0000


In [None]:
# Held-out accuracy of the baseline Iris forest.
random_test_pred3 = random3.predict(X_test)
test_acc3 = accuracy_score(y_true=Y_test, y_pred=random_test_pred3)
print(f"Testing accuracy for Random forest without feature importance: {test_acc3:.4f}")

Testing accuracy for Random forest without feature importance: 0.9556


In [None]:
# Sweep the number of trees (1-9) with depth capped at 5 on the Iris split.
# The heading previously began with "\R", which is not a valid Python escape
# sequence: it printed a stray backslash and raises a SyntaxWarning on recent
# Python versions. It now matches the heading used by the other sweeps.
print("Random Forest with Different n_estimators Values")
for estimators in range (1,10,1):
    # Fixed random_state: otherwise every run draws different bootstrap samples
    # and the rows of this table are not comparable between runs.
    rf3 = RandomForestClassifier(n_estimators=estimators, max_depth=5, random_state=1)
    rf3.fit(X_train, Y_train)
    rf_train_pred = rf3.predict(X_train)
    rf_test_pred = rf3.predict(X_test)
    train_acc = accuracy_score(Y_train, rf_train_pred)
    test_acc = accuracy_score(Y_test, rf_test_pred)
    print(f"n_estimators = {estimators}: Training Accuracy = {train_acc:.4f}, Testing Accuracy = {test_acc:.4f}")

\Random Forest with Different n_estimators Values
n_estimators = 1: Training Accuracy = 0.9905, Testing Accuracy = 0.9556
n_estimators = 2: Training Accuracy = 0.9714, Testing Accuracy = 0.9556
n_estimators = 3: Training Accuracy = 0.9905, Testing Accuracy = 0.9556
n_estimators = 4: Training Accuracy = 0.9619, Testing Accuracy = 0.9556
n_estimators = 5: Training Accuracy = 1.0000, Testing Accuracy = 0.9556
n_estimators = 6: Training Accuracy = 0.9905, Testing Accuracy = 0.9556
n_estimators = 7: Training Accuracy = 1.0000, Testing Accuracy = 0.9556
n_estimators = 8: Training Accuracy = 0.9905, Testing Accuracy = 0.9556
n_estimators = 9: Training Accuracy = 0.9905, Testing Accuracy = 0.9556


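Selected: n_estimators = 4 (Training Accuracy = 0.9619, Testing Accuracy = 0.9556). Testing accuracy is 0.9556 for every setting, and this one has the smallest gap between training and testing accuracy.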


WITH FEATURE IMPORTANCE

In [None]:
# Rank the four Iris measurements by the importance scores of the tree bound to
# `dt2`, then list the frame's columns.
iris_feature_names = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
iris_importances = pd.Series(dt2.feature_importances_, index=iris_feature_names)
feature_imp = iris_importances.sort_values(ascending=False)
print(feature_imp)
print(data2.columns)

PetalWidthCm     0.893892
PetalLengthCm    0.063170
SepalWidthCm     0.021469
SepalLengthCm    0.021469
dtype: float64
Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')


DECISION TREE

In [None]:
# NOTE(review): this selects the same four columns as the earlier Iris split, so
# no feature is actually dropped in this "with feature importance" section -
# confirm whether a reduced column list was intended.
selected_cols = ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']
Y = data2['Species']
X = data2[selected_cols]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=1, test_size=0.3)

In [None]:
# Refit the deep Iris tree on the current split.
# random_state is fixed because scikit-learn randomly permutes candidate features
# at each split, so an unseeded tree is not reproducible.
dt2 = DecisionTreeClassifier(max_depth=100, random_state=1)
dt2.fit(X_train, Y_train)

In [None]:
# Training-set accuracy of the refitted Iris tree.
dt_train_pred2 = dt2.predict(X_train)
train_acc2 = accuracy_score(y_true=Y_train, y_pred=dt_train_pred2)
print(f"Training accuracy using decision tree: {train_acc2:.4f}")

Training accuracy using decision tree: 1.0000


In [None]:
# Held-out accuracy of the refitted Iris tree.
dt_test_pred2 = dt2.predict(X_test)
test_acc2 = accuracy_score(y_true=Y_test, y_pred=dt_test_pred2)
print(f"Testing accuracy using decision tree: {test_acc2:.4f}")

Testing accuracy using decision tree: 0.9556


In [None]:
# Sweep max_depth from 1 to 9 on the current Iris split.
print("Decision Tree with Different max_depth Values")
for depth in range(1,10,1):
    # Fixed random_state so every row of the table is reproducible; scikit-learn
    # permutes candidate features at every split when no seed is given.
    dt = DecisionTreeClassifier(max_depth=depth, random_state=1)
    dt.fit(X_train, Y_train)
    dt_train_pred = dt.predict(X_train)
    dt_test_pred = dt.predict(X_test)
    train_acc = accuracy_score(Y_train, dt_train_pred)
    test_acc = accuracy_score(Y_test, dt_test_pred)
    print(f"max_depth = {depth}: Training Accuracy = {train_acc:.4f}, Testing Accuracy = {test_acc:.4f}")

Decision Tree with Different max_depth Values
max_depth = 1: Training Accuracy = 0.6952, Testing Accuracy = 0.6000
max_depth = 2: Training Accuracy = 0.9619, Testing Accuracy = 0.9556
max_depth = 3: Training Accuracy = 0.9810, Testing Accuracy = 0.9556
max_depth = 4: Training Accuracy = 1.0000, Testing Accuracy = 0.9556
max_depth = 5: Training Accuracy = 1.0000, Testing Accuracy = 0.9556
max_depth = 6: Training Accuracy = 1.0000, Testing Accuracy = 0.9556
max_depth = 7: Training Accuracy = 1.0000, Testing Accuracy = 0.9556
max_depth = 8: Training Accuracy = 1.0000, Testing Accuracy = 0.9556
max_depth = 9: Training Accuracy = 1.0000, Testing Accuracy = 0.9556


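Selected: max_depth = 2 (Training Accuracy = 0.9619, Testing Accuracy = 0.9556). Testing accuracy does not improve beyond depth 2, and this depth has the smallest gap between training and testing accuracy.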


RANDOM FOREST

In [None]:
# Forest of 100 deep trees on the current Iris split.
# random_state seeds the bootstrap sampling and per-split feature sampling so the
# fitted forest is reproducible.
random4 = RandomForestClassifier(n_estimators=100, max_depth=100, random_state=1)
random4.fit(X_train, Y_train)

In [None]:
# Training-set accuracy of the `random4` forest.
random_train_pred4 = random4.predict(X_train)
train_acc4 = accuracy_score(y_true=Y_train, y_pred=random_train_pred4)
print(f"Training accuracy for Random forest with feature importance: {train_acc4:.4f}")

Training accuracy for Random forest with feature importance: 1.0000


In [None]:
# Held-out accuracy of the `random4` forest.
random_test_pred4 = random4.predict(X_test)
test_acc4 = accuracy_score(y_true=Y_test, y_pred=random_test_pred4)
print(f"Testing accuracy for Random forest with feature importance: {test_acc4:.4f}")

Testing accuracy for Random forest with feature importance: 0.9556


In [None]:
# Sweep the number of trees (1-9) with depth capped at 5 on the current Iris split.
print("Random Forest with Different n_estimators Values")
for estimators in range (1,10,1):
    # Fixed random_state: otherwise every run draws different bootstrap samples
    # and the rows of this table are not comparable between runs.
    rf = RandomForestClassifier(n_estimators=estimators, max_depth=5, random_state=1)
    rf.fit(X_train, Y_train)
    rf_train_pred = rf.predict(X_train)
    rf_test_pred = rf.predict(X_test)
    train_acc = accuracy_score(Y_train, rf_train_pred)
    test_acc = accuracy_score(Y_test, rf_test_pred)
    print(f"n_estimators = {estimators}: Training Accuracy = {train_acc:.4f}, Testing Accuracy = {test_acc:.4f}")

Random Forest with Different n_estimators Values
n_estimators = 1: Training Accuracy = 0.9714, Testing Accuracy = 0.9333
n_estimators = 2: Training Accuracy = 0.9810, Testing Accuracy = 0.9556
n_estimators = 3: Training Accuracy = 0.9905, Testing Accuracy = 0.9556
n_estimators = 4: Training Accuracy = 0.9810, Testing Accuracy = 0.9556
n_estimators = 5: Training Accuracy = 0.9905, Testing Accuracy = 0.9556
n_estimators = 6: Training Accuracy = 0.9905, Testing Accuracy = 0.9556
n_estimators = 7: Training Accuracy = 1.0000, Testing Accuracy = 0.9556
n_estimators = 8: Training Accuracy = 1.0000, Testing Accuracy = 0.9556
n_estimators = 9: Training Accuracy = 0.9905, Testing Accuracy = 0.9556
