In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier

In [3]:
df = pd.read_csv('PimaIndians.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   pregnant   392 non-null    int64  
 1   glucose    392 non-null    int64  
 2   diastolic  392 non-null    int64  
 3   triceps    392 non-null    int64  
 4   insulin    392 non-null    int64  
 5   bmi        392 non-null    float64
 6   family     392 non-null    float64
 7   age        392 non-null    int64  
 8   test       392 non-null    object 
dtypes: float64(2), int64(6), object(1)
memory usage: 27.7+ KB


In [15]:
X = df.drop("test", axis = 1)
y = df['test']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)
scaler = StandardScaler()
lr = LogisticRegression()

X_train_std = scaler.fit_transform(X_train)
lr.fit(X_train_std, y_train)
X_test_std = scaler.transform(X_test)

y_pred = lr.predict(X_test_std)

# Prints accuracy metrics and feature coefficients
print("{0:.1%} accuracy on test set.".format(accuracy_score(y_test, y_pred))) 
print(dict(zip(X.columns, abs(lr.coef_[0]).round(2))))

#藉由相關係數的程度選擇較為重要的特徵來提升模型的精準度

78.5% accuracy on test set.
{'pregnant': 0.07, 'glucose': 1.25, 'diastolic': 0.0, 'triceps': 0.08, 'insulin': 0.14, 'bmi': 0.58, 'family': 0.28, 'age': 0.44}


In [24]:
#藉由RFE來遞迴尋找自己設定的特徵個數
rfe = RFE(estimator=LogisticRegression(max_iter = 1000), n_features_to_select=3, verbose=1)

# Fits the eliminator to the data
rfe.fit(X_train, y_train)

# Print the features and their ranking (high = dropped early on)
print(dict(zip(X.columns, rfe.ranking_)))

# Print the features that are not eliminated
print(X.columns[rfe.support_])

# Calculates the test set accuracy
acc = accuracy_score(y_test, rfe.predict(X_test))
print("{0:.1%} accuracy on test set.".format(acc)) 

Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
{'pregnant': 3, 'glucose': 2, 'diastolic': 6, 'triceps': 4, 'insulin': 5, 'bmi': 1, 'family': 1, 'age': 1}
Index(['bmi', 'family', 'age'], dtype='object')
73.4% accuracy on test set.


In [20]:
#接下來利用樹方面的模型來選取特徵
rf = RandomForestClassifier(random_state=0)
rf.fit(X_train,y_train)

# Calculate the accuracy
acc = accuracy_score(y_test, rf.predict(X_test))

# Print the importances per feature
print(dict(zip(X.columns, rf.feature_importances_.round(2))))

# Print accuracy
print("{0:.1%} accuracy on test set.".format(acc))

# Create a mask for features importances above the threshold
mask = rf.feature_importances_ > 0.15

# Apply the mask to the feature dataset X
reduced_X = X.loc[:,mask]

# prints out the selected column names
print(reduced_X.columns)

{'pregnant': 0.08, 'glucose': 0.26, 'diastolic': 0.08, 'triceps': 0.08, 'insulin': 0.13, 'bmi': 0.13, 'family': 0.1, 'age': 0.14}
74.7% accuracy on test set.
Index(['glucose'], dtype='object')


In [21]:
#接下來一樣利用遞迴的方式來選取特徵，step設成2代表每次刪減2個特徵
rfe = RFE(estimator=RandomForestClassifier(), n_features_to_select=2, step = 2, verbose=1)

# Fit the model to the training data
rfe.fit(X_train, y_train)

# Create a mask
mask = rfe.support_

# Apply the mask to the feature dataset X and print the result
reduced_X = X.loc[:, mask]
print(reduced_X.columns)

Fitting estimator with 8 features.
Fitting estimator with 6 features.
Fitting estimator with 4 features.
Index(['glucose', 'insulin'], dtype='object')
