# SQLiteによるSQL練習
Pandasによるデータ操作を参考にてSQLを習得

In [1]:
import sqlite3
import numpy as np
import pandas as pd
from sklearn import datasets, tree
from dtreeviz.trees import dtreeviz
from dtreeplt import dtreeplt

In [2]:
df_iris = pd.read_csv('./data/csv/iris.csv')

## 使用方法
- DBを指定（sample.db）
- DataFrameのテーブルを用意（df_iris）
- to_sqlでCREATE & INSERT
- read_sql_queryでDataFrameに格納

In [3]:
conn = sqlite3.connect('./sample.db')
c = conn.cursor()

In [4]:
df_iris.to_sql('iris', conn, if_exists='replace', index=False)

In [5]:
sql = '''
select * 
    from iris 
'''

df = pd.read_sql_query(sql, conn)

In [6]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [7]:
c.execute("select * from sqlite_master where type='table'")
fetch = c.fetchall()
[print(i) for i in fetch]

('table', 'iris', 'iris', 2, 'CREATE TABLE "iris" (\n"sepal_length" REAL,\n  "sepal_width" REAL,\n  "petal_length" REAL,\n  "petal_width" REAL,\n  "species" TEXT\n)')


[None]

## to_flat_indexの挙動確認

In [8]:
df_tmp = df.groupby('species').agg(['min', 'max', 'sum'])
df_tmp.columns = df_tmp.columns.to_flat_index()
df_tmp

Unnamed: 0_level_0,"(sepal_length, min)","(sepal_length, max)","(sepal_length, sum)","(sepal_width, min)","(sepal_width, max)","(sepal_width, sum)","(petal_length, min)","(petal_length, max)","(petal_length, sum)","(petal_width, min)","(petal_width, max)","(petal_width, sum)"
species,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
setosa,4.3,5.8,250.3,2.3,4.4,171.4,1.0,1.9,73.1,0.1,0.6,12.3
versicolor,4.9,7.0,296.8,2.0,3.4,138.5,3.0,5.1,213.0,1.0,1.8,66.3
virginica,4.9,7.9,329.4,2.2,3.8,148.7,4.5,6.9,277.6,1.4,2.5,101.3


In [9]:
conn.close()

## 決定木
- dtreeviz (https://qiita.com/calderarie/items/e4321bff95ac3042601b)

In [10]:
model = tree.DecisionTreeClassifier(max_depth=3)
model.fit(np.array(df.iloc[:,0:4]), np.array(df.loc[:,['species']]))

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [11]:
iris = load_iris()
viz = dtreeviz(
    model,
    df.iloc[:,0:4].values,
    iris.target, # 文字列ラベルではなく数値ラベル
    target_name='variety',
    feature_names=list(df.iloc[:,0:4].columns),
    class_names=list(df['species'].unique()),
    # fancy=False # 数値情報が協調されるため定量把握しやすい
    # orientation='LR' # 横向きにしたい場合は'LR'を指定する
    # histtype='bar' # barにするとわかりやすい
    # show_node_labels=True # ノードが指定されるのでコミュニケーション取りやすい
) 

display(viz)

NameError: name 'load_iris' is not defined

In [None]:
X = iris.data[29]

viz = dtreeviz(
    model,
    df.iloc[:,0:4].values,
    iris.target,
    target_name='variety',
    feature_names=list(df.iloc[:,0:4].columns),
    class_names=list(df['species'].unique()),   
    X=X,
) 

display(viz)