## Import Libraries

In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


## Loading Dataset

In [2]:
df=pd.read_csv("MLData.csv")
columns=["Timestamp","Level","Class","Scale","Gender","Age","Residence","RelationshipStatus","FinanceState","CopeWithInstitute","RelationWithFamily","Pressure","AcademicResult","LivingPlace","SupportedBy","SocialMediaIn6","InferiorityComplex","MealSatisfaction","Health","OtherPositiveActivity","SleepTime"]
df.columns=columns
df = df.sample(frac=1).reset_index(drop=True)

In [4]:
Class_Status = df["Class"]
Class_Scale = df["Scale"]
df.drop(["Class","Scale","Timestamp"],axis=1,inplace=True)

## Encoder

In [5]:
columns=["Level","Gender","Residence","RelationshipStatus","FinanceState","CopeWithInstitute","RelationWithFamily","Pressure","AcademicResult","LivingPlace","SupportedBy","SocialMediaIn6","InferiorityComplex","MealSatisfaction","Health","OtherPositiveActivity"]
df_Enc = pd.get_dummies(df,columns = columns)

## Own Implementation

In [6]:
x_train, x_test, y_train, y_test= train_test_split(df_Enc, Class_Status, test_size= 0.1, random_state=1) 

In [8]:
from DecisionTree import DecisionTree 
dtf=DecisionTree()
dtf.fit(x_train,y_train)
y_pred = dtf.predict(x_train)
train_acc = accuracy_score(y_train, y_pred)
print("Training Accuracy: {}%".format(train_acc*100))

y_pred = dtf.predict(x_test)
train_acc = accuracy_score(y_test, y_pred)
print("Testing Accuracy: {}%".format(train_acc*100))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["label"]=y_train


Training Accuracy: 95.4954954954955%
Testing Accuracy: 57.89473684210527%


## Scikit Learn Decision Tree Classifier

In [9]:
Class_Status_enc=np.array(Class_Status)
Class_Scale_enc=np.array(Class_Scale)
Class_Status_enc = pd.get_dummies(Class_Status_enc)

In [10]:
x_train, x_test, y_train, y_test= train_test_split(df_Enc, Class_Status, test_size= 0.1, random_state=1) 

In [11]:
dtf = DecisionTreeClassifier(criterion = "entropy")

dtf.fit(x_train,y_train)

y_pred = dtf.predict(x_train)
train_acc = accuracy_score(y_train, y_pred)
print("Training Accuracy: {}%".format(train_acc*100))

y_pred = dtf.predict(x_test)
train_acc = accuracy_score(y_test, y_pred)
print("Testing Accuracy: {}%".format(train_acc*100))

Training Accuracy: 95.4954954954955%
Testing Accuracy: 52.63157894736842%


## Cross Validation

In [39]:
from sklearn.model_selection import cross_validate
dtf = DecisionTreeClassifier(criterion = "entropy")
cv_results = cross_validate(dtf, df_Enc, Class_Status_enc, cv=3)
sorted(cv_results.keys())
print(cv_results["test_score"])
print("Average Test Acc = {}".format(np.average(cv_results['test_score'])))

[0.54032258 0.45967742 0.5203252 ]
Average Test Acc = 0.5067750677506776
