In [1]:
# Decision Tree Algorithm ====>

# Decision Trees are non-parametric supervised learning models used for classification and regression
# tasks. They predict the value of a target variable by learning simple decision rules inferred from 
# the data features.

# How Decision Trees Work ===>>>>>>

# Tree Structure:

# Decision Trees represent decisions and their possible consequences in a tree-like structure.
# Each internal node of the tree represents a decision based on a feature.
# Each leaf node represents the outcome (class label or value) after following the decisions from 
# the root to that node.




# Feature Selection: At each node, the algorithm selects the feature that best splits the data 
# into subsets that are more homogeneous with respect to the target variable.

# For classification: Common metrics include Gini impurity and Information Gain (using entropy).
# For regression: Mean Squared Error (MSE) or Mean Absolute Error (MAE) are often used.
# Splitting: Once a feature is selected, the dataset is partitioned into subsets based on the 
# feature's values.

# Recursive Partitioning: The splitting process is applied recursively to each subset until one 
# of the stopping criteria is met (e.g., maximum depth of the tree, minimum number of samples per 
# leaf node, etc.).

# Stopping Criteria:

# Maximum Depth: Limit the depth of the tree to avoid overfitting.
# Minimum Samples per Leaf: Stop splitting nodes when the number of samples in a node falls below 
# a threshold.
# No Further Gain: If further splits do not provide significant improvement in homogeneity as 
# measured by the chosen metric.
# Prediction:

# Traversal: To classify a new instance or predict a value, traverse the tree from the root to 
# a leaf node based on the feature values of the instance.
# Output: The predicted class label or value at the leaf node reached by traversal is the output.

In [2]:
import numpy as np 
import pandas as pd 

In [3]:
df = pd.read_csv("C:\\Users\\saurabh\\Desktop\\Newdat\\covid_toy.csv")

In [4]:
df.head()

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,Male,103.0,Mild,Kolkata,No
1,27,Male,100.0,Mild,Delhi,Yes
2,42,Male,101.0,Mild,Delhi,No
3,31,Female,98.0,Mild,Kolkata,No
4,65,Female,101.0,Mild,Mumbai,No


In [5]:
df = df.dropna() 

In [6]:
from sklearn.preprocessing import LabelEncoder

In [7]:
lb = LabelEncoder() 

In [9]:
df['gender'] = lb.fit_transform(df['gender'])
df['cough'] = lb.fit_transform(df['cough'])
df['city'] = lb.fit_transform(df['city'])
df['has_covid'] = lb.fit_transform(df['has_covid'])

In [10]:
df.head(2)

Unnamed: 0,age,gender,fever,cough,city,has_covid
0,60,1,103.0,0,2,0
1,27,1,100.0,0,1,1


In [11]:
x = df.drop(columns = ['has_covid'] , axis = 1) ## Input column
y = df['has_covid']  ## Target column

In [12]:
from sklearn.model_selection import train_test_split 

In [14]:
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size = 0.2 , random_state = 42)

In [16]:
from sklearn.tree import DecisionTreeClassifier

In [17]:
dt = DecisionTreeClassifier()

In [18]:
dt.fit(x_train , y_train) 

DecisionTreeClassifier()

In [19]:
y_pred = dt.predict(x_test) 

In [20]:
from sklearn.metrics import accuracy_score

In [21]:
accuracy_score(y_test , y_pred)

0.3888888888888889

In [22]:
df = pd.read_csv("C:\\Users\\saurabh\\Desktop\\Newdat\\Social_Network_Ads.csv")

In [23]:
df.head(2) 

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19,19000,0
1,15810944,Male,35,20000,0


In [24]:
lb = LabelEncoder() 

In [25]:
df['Gender'] = lb.fit_transform(df['Gender'])

In [26]:
df.head(2) 

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,1,19,19000,0
1,15810944,1,35,20000,0


In [27]:
df=  df.dropna() 

In [29]:
x = df.drop(columns = ['Purchased'] , axis = 1)
y = df['Purchased']

In [31]:
from sklearn.model_selection import train_test_split 

In [32]:
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size = 0.2 , random_state = 42)

In [33]:
from sklearn.tree import DecisionTreeClassifier

In [34]:
dt = DecisionTreeClassifier() 

In [35]:
dt.fit(x_train , y_train) 

DecisionTreeClassifier()

In [36]:
y_pred = dt.predict(x_test) 

In [37]:
from sklearn.metrics import accuracy_score 

In [38]:
accuracy_score(y_test , y_pred) 

0.8875

In [39]:
df = pd.read_csv("C:\\Users\\saurabh\\Desktop\\Newdat\\tips.csv")

In [40]:
df.head(2) 

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3


In [41]:
lb = LabelEncoder()

In [43]:
df['sex'] = lb.fit_transform(df['sex'])
df['smoker'] = lb.fit_transform(df['smoker'])
df['day'] = lb.fit_transform(df['day'])
df['time'] = lb.fit_transform(df['time'])

In [44]:
df.head(2) 

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,0,0,2,0,2
1,10.34,1.66,1,0,2,0,3


In [45]:
df = df.dropna() 

In [46]:
x = df.drop(columns = ['total_bill'] , axis = 1)  ### input data
y = df['total_bill']  ### target data 

In [47]:
from sklearn.model_selection import train_test_split 

In [50]:
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size = 0.2, random_state = 42)

In [53]:
from sklearn.tree import DecisionTreeRegressor

In [54]:
dt = DecisionTreeRegressor()

In [55]:
dt.fit(x_train , y_train) 

DecisionTreeRegressor()

In [57]:
y_pred = dt.predict(x_test) 

In [58]:
from sklearn.metrics import r2_score 

In [59]:
r2_score(y_test , y_pred)

0.4748110708996849

In [60]:
df = pd.read_csv("C:\\Users\\saurabh\\Desktop\\Newdat\\insurance.csv")

In [62]:
df.head(3)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462


In [63]:
df = df.dropna() 

In [64]:
lb = LabelEncoder() 

In [65]:
df['sex'] = lb.fit_transform(df['sex'])
df['smoker'] = lb.fit_transform(df['smoker'])
df['region'] = lb.fit_transform(df['region'])

In [66]:
df.head(2) 

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523


In [67]:
x = df.drop(columns = ['charges'] , axis = 1)
y = df['charges']

In [68]:
from sklearn.model_selection import train_test_split 

In [69]:
x_train , x_test , y_train , y_test = train_test_split(x,y,test_size = 0.2 , random_state = 42)

In [71]:
from sklearn.tree import DecisionTreeRegressor 

In [72]:
dt = DecisionTreeRegressor()

In [73]:
dt.fit(x_train , y_train) 

DecisionTreeRegressor()

In [74]:
y_pred = dt.predict(x_test) 

In [75]:
from sklearn.metrics import r2_score

In [76]:
r2_score(y_test , y_pred)

0.7139786380849354