In [54]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [55]:
# Apply seaborn's "whitegrid" theme to any plots drawn later in the session.
sns.set_style('whitegrid')

In [56]:
# Load the preterm / very-preterm birth table from a CSV expected in the current
# working directory (path is relative).
data = pd.read_csv('preterm-and-very-preterm-births-by-raceethnicity-2010-2018.csv')

In [57]:
# Inspect column dtypes and non-null counts right after loading.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162 entries, 0 to 161
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Year            162 non-null    int64  
 1   Race/Ethnicity  162 non-null    object 
 2   Birth Type      162 non-null    object 
 3   Total Births    162 non-null    int64  
 4   Events          162 non-null    int64  
 5   Percent         162 non-null    float64
 6   Upper 95% CI    162 non-null    float64
 7   Lower 95% CI    162 non-null    float64
dtypes: float64(3), int64(3), object(2)
memory usage: 10.2+ KB


In [58]:
# Preview the first five rows of the raw table.
data.head()

Unnamed: 0,Year,Race/Ethnicity,Birth Type,Total Births,Events,Percent,Upper 95% CI,Lower 95% CI
0,2010,African-American,Preterm Births,27634,3441,12.5,12.8,12.1
1,2010,African-American,Very Preterm Births,27634,731,2.6,2.8,2.5
2,2011,African-American,Preterm Births,26991,3241,12.0,12.4,11.6
3,2011,African-American,Very Preterm Births,26991,729,2.7,2.9,2.5
4,2012,African-American,Preterm Births,26446,3226,12.2,12.6,11.8


In [59]:
# Count rows per Race/Ethnicity label.
# NOTE(review): the labels include "TOTAL" (presumably an aggregate of the other groups)
# and both "White" and "White/Other" (presumably the same group renamed in some years) —
# confirm against the data dictionary before treating them as independent categories.
data['Race/Ethnicity'].value_counts()

American Indian     18
Multi-Race          18
Asian               18
Pacific Islander    18
TOTAL               18
African-American    18
Unknown             18
Hispanic            18
White               14
White/Other          4
Name: Race/Ethnicity, dtype: int64

In [60]:
# Check how many rows carry each Birth Type label (this column is the model target later on).
data['Birth Type'].value_counts()

Very Preterm Births    81
Preterm Births         81
Name: Birth Type, dtype: int64

In [61]:
# Count missing values per column.
data.isna().sum()

Year              0
Race/Ethnicity    0
Birth Type        0
Total Births      0
Events            0
Percent           0
Upper 95% CI      0
Lower 95% CI      0
dtype: int64

In [62]:
# Capture the distinct Birth Type text labels before the column is integer-encoded.
# NOTE(review): `birth` is not referenced again in the visible cells.
birth = data["Birth Type"].unique()

In [63]:
# Encode the target as integers: 0 = "Preterm Births", 1 = "Very Preterm Births".
birth_type_codes = {"Preterm Births": 0, "Very Preterm Births": 1}

# Guard so the cell is safe to re-run: pushing an already-encoded (numeric) column
# through the dict would match nothing and replace every value with NaN.
if not pd.api.types.is_numeric_dtype(data["Birth Type"]):
    birth_type_encoded = data["Birth Type"].map(birth_type_codes)
    # .map() silently yields NaN for labels missing from the dict; fail loudly instead.
    if birth_type_encoded.isna().any():
        unknown_labels = data.loc[birth_type_encoded.isna(), "Birth Type"].unique().tolist()
        raise ValueError(f"Unexpected 'Birth Type' labels: {unknown_labels}")
    data["Birth Type"] = birth_type_encoded.astype("int64")

In [64]:
# Re-check dtypes: "Birth Type" should now be an integer column with no nulls.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162 entries, 0 to 161
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Year            162 non-null    int64  
 1   Race/Ethnicity  162 non-null    object 
 2   Birth Type      162 non-null    int64  
 3   Total Births    162 non-null    int64  
 4   Events          162 non-null    int64  
 5   Percent         162 non-null    float64
 6   Upper 95% CI    162 non-null    float64
 7   Lower 95% CI    162 non-null    float64
dtypes: float64(3), int64(4), object(1)
memory usage: 10.2+ KB


In [65]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Encoder used to turn the Race/Ethnicity text labels into integer codes.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target values (y);
# OrdinalEncoder / OneHotEncoder are the feature-side equivalents — consider switching.
label = LabelEncoder()

In [66]:
# Overwrite the Race/Ethnicity text labels with integer codes, one per distinct label.
# The fitted `label` encoder keeps the code -> label mapping for later lookup.
data["Race/Ethnicity"] = label.fit_transform(data["Race/Ethnicity"])

In [67]:
# Confirm every column is numeric now that both text columns have been encoded.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 162 entries, 0 to 161
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Year            162 non-null    int64  
 1   Race/Ethnicity  162 non-null    int32  
 2   Birth Type      162 non-null    int64  
 3   Total Births    162 non-null    int64  
 4   Events          162 non-null    int64  
 5   Percent         162 non-null    float64
 6   Upper 95% CI    162 non-null    float64
 7   Lower 95% CI    162 non-null    float64
dtypes: float64(3), int32(1), int64(4)
memory usage: 9.6 KB


In [68]:
# Summary statistics for all (now numeric) columns.
data.describe()

Unnamed: 0,Year,Race/Ethnicity,Birth Type,Total Births,Events,Percent,Upper 95% CI,Lower 95% CI
count,162.0,162.0,162.0,162.0,162.0,162.0,162.0,162.0
mean,2014.0,4.024691,0.5,108951.679012,5347.037037,5.545062,5.693827,5.438889
std,2.589995,2.632691,0.50155,154352.75804,10129.296371,4.141564,4.240938,4.113442
min,2010.0,0.0,0.0,1487.0,14.0,0.8,0.5,0.7
25%,2012.0,2.0,0.0,9286.0,190.25,1.3,1.3,1.3
50%,2014.0,4.0,0.5,25160.0,1097.5,5.3,5.15,5.35
75%,2016.0,6.0,1.0,138373.0,5566.0,8.975,8.875,8.675
max,2018.0,9.0,1.0,508748.0,43527.0,12.7,13.4,12.7


In [69]:
# Scaler instance; it is fitted on the features further down the notebook.
sc = StandardScaler()

# Feature matrix: every column of `data` except the "Birth Type" target.
feature_columns = [
    "Year",
    "Race/Ethnicity",
    "Total Births",
    "Events",
    "Percent",
    "Upper 95% CI",
    "Lower 95% CI",
]
X = data[feature_columns]

In [70]:
# Display the feature frame (pandas renders a truncated preview).
X

Unnamed: 0,Year,Race/Ethnicity,Total Births,Events,Percent,Upper 95% CI,Lower 95% CI
0,2010,0,27634,3441,12.5,12.8,12.1
1,2010,0,27634,731,2.6,2.8,2.5
2,2011,0,26991,3241,12.0,12.4,11.6
3,2011,0,26991,729,2.7,2.9,2.5
4,2012,0,26446,3226,12.2,12.6,11.8
...,...,...,...,...,...,...,...
157,2018,9,122346,1169,1.0,1.0,0.9
158,2018,7,16244,1797,11.1,11.6,10.6
159,2018,7,16244,346,2.1,2.4,1.9
160,2018,6,453926,39778,8.8,8.8,8.7


In [71]:
# Target vector: the integer-encoded Birth Type column.
Y = data["Birth Type"]

In [72]:
# Standardise each feature column using statistics computed from all rows of X.
X_scaled = sc.fit_transform(X)

In [73]:
# Confirm the scaled array kept every row and all seven feature columns.
X_scaled.shape

(162, 7)

In [74]:
from sklearn.model_selection import train_test_split

In [75]:
# Split the *unscaled* features first, then fit the scaler on the training rows only,
# so no statistics from the held-out rows leak into preprocessing.
# A fixed random_state makes the split (and every downstream score) reproducible;
# stratify=Y keeps the two birth-type classes balanced in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, shuffle=True, random_state=42, stratify=Y
)
X_train = sc.fit_transform(X_train)  # learn mean/std from the training rows
X_test = sc.transform(X_test)        # reuse the training statistics on the test rows

In [76]:
# Sanity-check the size of the training feature matrix.
X_train.shape

(121, 7)

In [77]:
# Sanity-check the size of the held-out feature matrix.
X_test.shape

(41, 7)

In [78]:
# Target lengths should match the row counts of the corresponding feature matrices.
y_test.shape, y_train.shape

((41,), (121,))

In [79]:
from sklearn.tree import DecisionTreeClassifier

In [80]:
# Shallow decision tree (depth capped at 3).
# random_state pins the per-split feature permutation so repeated runs build the same tree.
dc = DecisionTreeClassifier(max_depth=3, random_state=42)

In [81]:
# Fit the tree on the training partition; fit() returns the estimator, which the cell displays.
dc.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=3)

In [82]:
# Hard class predictions (0 or 1) for the held-out rows.
pred = dc.predict(X_test)

In [83]:
# Mean accuracy on the held-out rows.
# NOTE(review): a perfect score looks like a consequence of the feature set — the sample
# rows show "Percent"/"Events" (and the CI bounds) differing sharply between the two birth
# types, so they may encode the target almost directly. Confirm this is the intended task.
dc.score(X_test, y_test)

1.0

In [84]:
from sklearn.metrics import roc_auc_score

In [85]:
# ROC AUC is a ranking metric, so score it with the predicted probability of the
# positive class (label 1, "Very Preterm Births") rather than the hard 0/1 predictions,
# which would collapse the ROC curve to a single operating point.
roc_auc_score(y_test, dc.predict_proba(X_test)[:, 1])

1.0

In [86]:
from joblib import dump

In [87]:
# Persist the fitted tree to "model.pkl" in the working directory using joblib.
# NOTE(review): only the classifier is saved; the fitted StandardScaler (`sc`) and
# LabelEncoder (`label`) that produced its inputs are not, so new raw data could not be
# preprocessed the same way from this file alone — consider saving them as well.
dump(dc, "model.pkl")

['model.pkl']