# 导入所需的库
导入必要的库，包括 pandas、sklearn 等。

In [None]:
# Importing required libraries

# pandas for data manipulation and analysis
import pandas as pd


# matplotlib for data visualization
import matplotlib.pyplot as plt


# train_test_split for splitting the data into training and testing sets
from sklearn.model_selection import train_test_split

# DecisionTreeClassifier for decision tree classification
from sklearn.tree import DecisionTreeClassifier

# metrics module (provides accuracy_score) for evaluating the model
from sklearn import metrics



# 加载糖尿病数据集
使用pandas或者从sklearn数据集中加载糖尿病数据集。

In [None]:
# Read the diabetes data from a local CSV file into a DataFrame.

# Column labels applied to the CSV columns, in file order.
# 'label' is the column later used as the prediction target.
col_names = [
    'pregnant', 'glucose', 'bp', 'skin',
    'insulin', 'bmi', 'pedigree', 'age',
    'label',
]

# header=None tells pandas that every row of the file is data, and `names`
# supplies the column labels.
# NOTE(review): this assumes diabetes.csv has no header row -- confirm. If the
# file does start with a header line, that line is loaded as the first record.
diabetes_df = pd.read_csv(
    "diabetes.csv",
    header=None,
    names=col_names,
)

# Preview the first 5 rows (rendered because it is the cell's last expression).
diabetes_df.head()

# 探索性数据分析
显示数据框中的行数和列数。
显示每列的数据类型。
显示每列缺失值的数量。
显示每列的唯一值数量。
显示每列的基本统计信息。

In [None]:
# Exploratory Data Analysis
#
# A notebook cell only renders the value of its final expression, so each
# intermediate summary is printed explicitly; a bare expression in the middle
# of the cell would be evaluated and silently discarded.

# Number of rows and columns, as a (rows, columns) tuple.
print("Shape (rows, columns):", diabetes_df.shape)

# Data type of each column.
print("\nData types:")
print(diabetes_df.dtypes)

# Number of missing values in each column.
print("\nMissing values per column:")
print(diabetes_df.isnull().sum())

# Number of unique values in each column.
print("\nUnique values per column:")
print(diabetes_df.nunique())

# Summary statistics of the dataframe; kept as the last expression so the
# notebook renders it as a formatted table.
diabetes_df.describe()



# 特征选择

选择您想要用于预测的特征。您可以使用所有特征或特征的子集。

In [None]:
# Feature selection: separate the predictor columns from the target column.

# Columns fed to the model as inputs ('skin' is left out of this list).
feature_cols = [
    'pregnant', 'insulin', 'bmi', 'age',
    'glucose', 'bp', 'pedigree',
]

X = diabetes_df[feature_cols]  # feature matrix
y = diabetes_df['label']       # target variable


# 数据分割
将数据分割为训练集和测试集。训练集将用于训练模型，而测试集将用于评估模型。

In [None]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# random_state is fixed so the same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)


# 构建决策树模型
使用训练集构建决策树模型。

In [None]:
# Building Decision Tree Model

# Create the classifier with a fixed random_state. Without it the fitted tree
# (and therefore the reported accuracy and the plotted tree) can differ from
# one run to the next, even though the train/test split itself is seeded.
clf = DecisionTreeClassifier(random_state=1)

# Train the classifier on the training split (fit() updates clf in place).
clf.fit(X_train, y_train)

# Predict the response for the held-out test split.
y_pred = clf.predict(X_test)


# 模型评估
使用测试集对模型进行评估。

In [None]:
# Model evaluation on the held-out test split.

# Accuracy: the fraction of test rows whose predicted label matches the true one.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


# 可视化决策树

使用 graphviz 和 pydotplus 库来可视化决策树。注意：除了下面的 Python 包之外，还需要在系统中安装 Graphviz 软件本身（提供 `dot` 可执行文件），并确保它位于 PATH 中，否则无法生成图片。

```bash
pip install graphviz
pip install pydotplus
```

In [None]:
# Render the fitted decision tree with Graphviz, save it as a PNG and show it inline.
from sklearn.tree import export_graphviz
from IPython.display import Image
import pydotplus

# out_file=None makes export_graphviz return the DOT source as a string.
# This replaces the former sklearn.externals.six.StringIO buffer: that module
# has been removed from scikit-learn, so importing it raises ImportError.
dot_data = export_graphviz(
    clf,
    out_file=None,
    filled=True,
    rounded=True,
    special_characters=True,
    feature_names=feature_cols,
    # NOTE(review): assumes the target has exactly two classes whose sorted
    # order is 0 then 1 -- confirm against the data.
    class_names=['0', '1'],
)

# Build the graph from the DOT source, write it to disk, then display it.
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_png('diabetes.png')
Image(graph.create_png())