# PRODIGY INFOTECH

# Build a decision tree classifier to predict whether a customer will purchase a product or service based on their demographic and behavioral data. 

# Import libraries

In [18]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Data Pre-processing

In [19]:
df=pd.read_csv('diamonds.csv')

In [20]:
df

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.20,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
49995,0.72,Ideal,D,SI1,60.8,57.0,2757,5.75,5.76,3.50
49996,0.72,Good,D,SI1,63.1,55.0,2757,5.69,5.75,3.61
49997,0.70,Very Good,D,SI1,62.8,60.0,2757,5.66,5.68,3.56
49998,0.86,Premium,H,SI2,61.0,58.0,2757,6.15,6.12,3.74


In [21]:
df.shape

(50000, 10)

In [22]:
print(list(df.columns))

['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y', 'z']


In [23]:
df.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,0.799444,61.753006,57.45783,3944.80544,5.734403,5.737956,3.541056
std,0.475173,1.431088,2.232092,3997.938105,1.123077,1.145579,0.707065
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,951.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2410.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5351.0,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


In [24]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    50000 non-null  float64
 1   cut      50000 non-null  object 
 2   color    50000 non-null  object 
 3   clarity  50000 non-null  object 
 4   depth    50000 non-null  float64
 5   table    50000 non-null  float64
 6   price    50000 non-null  int64  
 7   x        50000 non-null  float64
 8   y        50000 non-null  float64
 9   z        50000 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 3.8+ MB


# Handling Missing Values

In [25]:
df.isnull().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

In [26]:
df['color'].value_counts()

G    10452
E     9085
F     8864
H     7711
D     6224
I     5058
J     2606
Name: color, dtype: int64

In [27]:
df['carat'].unique()

array([0.23, 0.21, 0.29, 0.31, 0.24, 0.26, 0.22, 0.3 , 0.2 , 0.32, 0.33,
       0.25, 0.35, 0.42, 0.38, 0.7 , 0.86, 0.71, 0.78, 0.96, 0.73, 0.8 ,
       0.75, 0.74, 0.81, 0.59, 0.9 , 0.91, 0.61, 0.77, 0.63, 0.76, 0.64,
       0.72, 0.79, 0.58, 1.17, 0.83, 0.54, 0.98, 0.52, 1.01, 0.53, 0.51,
       1.05, 0.55, 0.87, 1.  , 0.57, 0.82, 0.6 , 1.04, 0.93, 1.2 , 0.99,
       0.34, 0.43, 0.36, 0.95, 0.89, 1.02, 0.97, 0.56, 0.85, 0.92, 1.27,
       0.84, 1.12, 1.03, 0.62, 0.66, 1.22, 1.08, 0.5 , 1.19, 0.39, 0.65,
       0.68, 1.24, 1.5 , 0.27, 0.41, 1.13, 1.06, 0.69, 0.88, 0.4 , 1.14,
       0.94, 1.29, 1.52, 1.16, 1.21, 1.23, 1.09, 0.67, 1.11, 1.1 , 1.18,
       1.15, 1.25, 1.07, 1.28, 0.28, 0.37, 1.31, 1.51, 1.26, 1.39, 1.35,
       1.3 , 1.32, 1.41, 1.36, 1.34, 1.44, 1.54, 1.45, 1.38, 1.33, 1.74,
       1.64, 1.47, 1.4 , 1.55, 1.95, 2.  , 1.37, 1.83, 1.62, 1.57, 1.69,
       2.06, 1.72, 1.66, 2.14, 1.49, 1.46, 2.15, 1.96, 2.22, 1.7 , 1.85,
       2.01, 2.27, 1.68, 1.56, 1.65, 1.82, 2.03, 1.

# Prepare the data

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [29]:
#step 1  Separate features and target variable
X = df.drop(columns=['carat'])
y = df['carat']

In [30]:
# Step 2: One-hot encode categorical features
X_encoded = pd.get_dummies(X)

In [31]:
# Step 3: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)


In [32]:
# Step 4: Create and train the decision tree classifier# Step 3: Create and train the decision tree regressor
regressor = DecisionTreeRegressor(random_state=42)
regressor.fit(X_train, y_train)


DecisionTreeRegressor(random_state=42)

In [33]:
# Step 5: Make predictions on the test set
y_pred = regressor.predict(X_test)

In [34]:
# Step 6: Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:.2f}")
print(f"R-squared: {r2:.2f}")

Mean Squared Error: 0.00
R-squared: 0.99


In [35]:
X_test[0:10]

Unnamed: 0,depth,table,price,x,y,z,cut_Fair,cut_Good,cut_Ideal,cut_Premium,...,color_I,color_J,clarity_I1,clarity_IF,clarity_SI1,clarity_SI2,clarity_VS1,clarity_VS2,clarity_VVS1,clarity_VVS2
33553,61.2,57.2,931,5.44,5.49,3.34,0,1,0,0,...,0,0,1,0,0,0,0,0,0,0
9427,64.1,60.0,4744,6.11,6.06,3.9,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
199,62.2,56.0,2781,5.83,5.88,3.64,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
12447,60.7,58.0,5531,6.72,6.75,4.09,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
39489,63.1,55.0,505,4.33,4.36,2.74,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
42724,60.6,59.0,1743,5.33,5.29,3.22,0,0,0,1,...,1,0,0,0,0,0,0,0,1,0
10822,61.7,56.0,596,4.48,4.53,2.78,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
49498,62.7,57.0,2668,5.82,5.86,3.66,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4144,64.5,58.0,3627,6.0,5.96,3.86,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
36958,61.3,56.0,1103,4.82,4.81,2.95,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1


In [36]:
y_test[0:15]

33553    0.61
9427     0.91
199      0.77
12447    1.13
39489    0.32
42724    0.56
10822    0.34
49498    0.76
4144     0.90
36958    0.42
43106    0.51
38695    0.43
6188     0.27
1414     0.74
18471    1.40
Name: carat, dtype: float64