In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
data = pd.read_stata('lendingclub_train.dta')
df = data.sample(frac=0.01)  # 데이터의 1%를 샘플링

# 샘플 데이터셋 저장
df.to_stata('lendingclub_sample.dta')

In [2]:
df.head()

Unnamed: 0,index,loan_amnt,funded_amnt,funded_amnt_inv,int_rate,installment,annual_inc,dti,delinq_2yrs,fico_range_low,...,mths_since_recent_revol_delinq2,mths_since_recent_revol_delinq3,mths_since_recent_revol_delinq4,mths_since_recent_revol_delinq5,mths_since_recent_revol_delinq6,mths_since_recent_revol_delinq7,mths_since_recent_revol_delinq8,mths_since_recent_revol_delinq9,mths_since_recent_revol_delinq10,mths_since_recent_revol_delinq11
594991,347779,15000,15000,15000.0,0.1274,503.54,90000.0,5.41,1,690,...,0,0,0,0,0,0,0,0,0,0
604879,1020130,12325,12325,12325.0,0.1561,297.18,65000.0,18.85,0,700,...,0,0,0,0,0,0,0,0,1,0
47185,32903,8500,8500,8500.0,0.1367,289.15,33400.0,34.03,0,670,...,0,0,0,0,0,0,0,0,0,0
819022,525963,5500,5500,5500.0,0.1212,183.0,54000.0,3.18,0,690,...,0,0,0,0,0,0,0,0,0,0
438829,487986,15000,15000,15000.0,0.1016,485.14,115000.0,18.01,0,750,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# 독립 변수 및 종속 변수 설정
X = df.drop('depvar', axis=1)
y = df['depvar']

# 데이터 표준화 (Standardization)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 훈련 데이터와 검증 데이터, 테스트 데이터로 분할
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.2, random_state=121)
X_valid, X_test, y_valid, y_test = train_test_split(X_temp, y_temp, test_size=0.2, random_state=121)

# L1 정규화를 적용한 로지스틱 회귀 모델 생성
logreg_model = LogisticRegression(penalty='l2', solver='lbfgs', fit_intercept=True, max_iter=200)

# 훈련 데이터로 모델 훈련
logreg_model.fit(X_train, y_train)

# 검증 데이터에 대한 예측 수행
y_valid_pred = logreg_model.predict(X_valid)

# 정확도 점수 계산
valid_accuracy = accuracy_score(y_valid, y_valid_pred)
print(f'Validation Accuracy: {valid_accuracy}')

# 테스트 데이터에 대한 예측 수행
y_test_pred = logreg_model.predict(X_test)

# 정확도 점수 계산
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f'Test Accuracy: {test_accuracy}')

Validation Accuracy: 0.9871336669049321
Test Accuracy: 0.9885714285714285
