In [None]:
import pandas as pd
import numpy as np
from datetime import datetime
from dateutil import parser

def load_data_from_csv(suspect_path="꾼.csv", nonsuspect_path="꾼아님.csv"):
    """
    Load the two labelled CSV files and stack them into one DataFrame.

    Parameters
    ----------
    suspect_path : str or path-like, optional
        CSV whose rows are labelled ``true_label = 1``.
        Defaults to "꾼.csv" in the current working directory.
    nonsuspect_path : str or path-like, optional
        CSV whose rows are labelled ``true_label = 0``.
        Defaults to "꾼아님.csv" in the current working directory.

    Returns
    -------
    pandas.DataFrame
        Rows of the first file followed by rows of the second file, with an
        added ``true_label`` column and a fresh 0..n-1 index.
    """
    df_suspect = pd.read_csv(suspect_path)
    df_suspect["true_label"] = 1

    df_nonsuspect = pd.read_csv(nonsuspect_path)
    df_nonsuspect["true_label"] = 0

    # ignore_index=True so the combined frame does not carry duplicate
    # row labels from the two source files.
    return pd.concat([df_suspect, df_nonsuspect], ignore_index=True)

def preprocess_credits_data(df):
    """
    Clean the raw credit rows in place and return the same DataFrame.

    - Date columns (openedAt, dueAt, lastRepaymentAt) are parsed to datetime;
      unparseable values become NaT.
    - Amount columns (loanAmount, repaymentAmount) are parsed to numbers;
      unparseable or missing values become 0.
    - Rows whose openedAt is missing after parsing are dropped.

    Columns that are not present in ``df`` are skipped.
    """
    # Parse date-like columns; errors='coerce' maps bad values to NaT.
    for date_col in ("openedAt", "dueAt", "lastRepaymentAt"):
        if date_col not in df.columns:
            continue
        df[date_col] = pd.to_datetime(df[date_col], errors="coerce")

    # Parse amount columns; anything non-numeric ends up as 0.
    for amount_col in ("loanAmount", "repaymentAmount"):
        if amount_col not in df.columns:
            continue
        parsed = pd.to_numeric(df[amount_col], errors="coerce")
        df[amount_col] = parsed.fillna(0)

    # A loan without an opening date cannot be placed on the timeline.
    if "openedAt" in df.columns:
        df.dropna(subset=["openedAt"], inplace=True)

    return df

def calculate_user_features(user_df, reference_date=None):
    """
    Compute the per-user feature dict from one user's loan history.

    Parameters
    ----------
    user_df : pandas.DataFrame
        All loan rows of a single userId (one row per loan).
        Required columns: userId, openedAt, dueAt, lastRepaymentAt
        (datetime dtype), loanAmount, repaymentAmount (numeric).
        Optional columns: status, identifiedInstitution, accountNumber,
        loanDescription. A feature whose optional column is absent is 0.
        ``user_df`` itself is not modified.
    reference_date : datetime-like, optional
        Anchor for the "last 6 months" / "last 3 months" windows.
        Defaults to the current time (``pd.to_datetime("today")``); pass a
        fixed value to get reproducible results.

    Returns
    -------
    dict
        Keys, in order: userId, earlyRepaymentCount, earlyRepaymentRatio,
        loanAmtIncreaseMaxRatio, limitIncreaseLostCount,
        nUniqueInstitutions_6m, repayMismatchRatio, repay2loanRatio,
        manyAccounts3m, abnormalRateCount, repaymentStd, quickNewLoanCount,
        repeatedRepay, closeGapCount_30, loanStd, mandatoryEarlyRatio,
        shortLoanCount, usageConcentration, emptyDescCount, performingRatio,
        overlapCount, seasonRatio_12or1.
    """
    user_id = user_df["userId"].iloc[0]  # every row belongs to the same user

    # Work on a chronologically ordered copy; the shift()-based features
    # below compare each loan with the previous / next one in time.
    loans = user_df.sort_values("openedAt").reset_index(drop=True)
    n_loans = len(loans)

    # One anchor for both look-back windows so they are mutually consistent.
    if reference_date is None:
        now = pd.to_datetime("today")
    else:
        now = pd.Timestamp(reference_date)

    opened_at = loans["openedAt"]
    due_at = loans["dueAt"]
    repaid_at = loans["lastRepaymentAt"]
    loan_amount = loans["loanAmount"]
    repay_amount = loans["repaymentAmount"]

    # ------------------------------------------------------------------
    # (1) Early repayment: count and share of loans repaid before dueAt,
    #     among loans that have both a due date and a repayment date.
    # ------------------------------------------------------------------
    has_both_dates = repaid_at.notnull() & due_at.notnull()
    repaid_case_count = int(has_both_dates.sum())
    early_count = int((has_both_dates & (repaid_at < due_at)).sum())
    early_ratio = early_count / repaid_case_count if repaid_case_count > 0 else 0.0

    # ------------------------------------------------------------------
    # (2) Loan amount growth: largest ratio of a loan to the previous one.
    #     A previous amount of 0 is treated as missing to avoid inf.
    # ------------------------------------------------------------------
    prev_amount = loan_amount.shift(1).replace(0, np.nan)
    amount_ratio = loan_amount / prev_amount
    max_increase_ratio = amount_ratio.max(skipna=True)
    if pd.isna(max_increase_ratio):
        max_increase_ratio = 1.0  # default when no ratio can be computed

    # ------------------------------------------------------------------
    # (3) Loss after an increase (simplified): loans at least 1.2x the
    #     previous amount whose status is 'lost'.
    # ------------------------------------------------------------------
    if "status" in loans.columns:
        lost_after_increase = int(
            ((amount_ratio >= 1.2) & (loans["status"] == "lost")).sum()
        )
    else:
        lost_after_increase = 0

    # ------------------------------------------------------------------
    # (4) Institution diversity: distinct institutions among loans opened
    #     within the 6 months before the reference date.
    # ------------------------------------------------------------------
    if "identifiedInstitution" in loans.columns:
        six_months_ago = now - pd.DateOffset(months=6)
        n_institutions_6m = loans.loc[
            opened_at >= six_months_ago, "identifiedInstitution"
        ].nunique()
    else:
        n_institutions_6m = 0

    # ------------------------------------------------------------------
    # (5) Repayment/due mismatch: share of loans (with both dates) whose
    #     repayment date is more than 7 days away from the due date.
    # ------------------------------------------------------------------
    mismatch_days = (due_at - repaid_at).dt.days.abs()
    mismatch_count = int(((mismatch_days > 7) & has_both_dates).sum())
    mismatch_ratio = mismatch_count / repaid_case_count if repaid_case_count > 0 else 0

    # ------------------------------------------------------------------
    # (6) Repayment-to-loan ratio: sum(repaymentAmount) / sum(loanAmount).
    # ------------------------------------------------------------------
    total_loan = loan_amount.sum()
    total_repay = repay_amount.sum()
    repay_to_loan = total_repay / total_loan if total_loan > 0 else 0

    # ------------------------------------------------------------------
    # (7) Many accounts in a short period: number of distinct
    #     accountNumber values among loans opened in the last 3 months.
    #     len(unique()) counts a missing account number as one value.
    # ------------------------------------------------------------------
    if "accountNumber" in loans.columns:
        three_months_ago = now - pd.DateOffset(months=3)
        accounts_3m = len(
            loans.loc[opened_at >= three_months_ago, "accountNumber"].unique()
        )
    else:
        accounts_3m = 0

    # ------------------------------------------------------------------
    # (8) Abnormal rate proxy (no interest rate in the data): loans with
    #     repaymentAmount / loanAmount > 3 or < 0.5. Rows with a
    #     non-positive loanAmount are excluded.
    # ------------------------------------------------------------------
    repay_rate = repay_amount / loan_amount
    abnormal_rate_count = int(
        ((loan_amount > 0) & ((repay_rate > 3) | (repay_rate < 0.5))).sum()
    )

    # ------------------------------------------------------------------
    # (9) Inconsistency of repayment amounts: sample standard deviation.
    # ------------------------------------------------------------------
    repay_std = repay_amount.std() if n_loans > 1 else 0
    if pd.isna(repay_std):
        repay_std = 0

    # ------------------------------------------------------------------
    # (10)/(12) Gap between a loan's last repayment and the opening of
    #     the next loan: count of gaps <= 7 days and <= 30 days.
    #     Missing gaps compare as False and are therefore not counted.
    # ------------------------------------------------------------------
    gap_days = (opened_at.shift(-1) - repaid_at).dt.days
    quick_new_loan_count = int((gap_days <= 7).sum())
    close_gap_count_30 = int((gap_days <= 30).sum())

    # ------------------------------------------------------------------
    # (11) Repeated identical repayment: 1 if any repaymentAmount value
    #     occurs 3 or more times, else 0.
    # ------------------------------------------------------------------
    repeated_repay = int((repay_amount.value_counts() >= 3).any())

    # ------------------------------------------------------------------
    # (13) Variability of loan amounts: sample standard deviation.
    # ------------------------------------------------------------------
    loan_std = loan_amount.std() if n_loans > 1 else 0
    if pd.isna(loan_std):
        loan_std = 0

    # ------------------------------------------------------------------
    # (15) Very short loans: lastRepaymentAt - openedAt in [0, 10) days.
    # ------------------------------------------------------------------
    duration_days = (repaid_at - opened_at).dt.days
    short_loan_count = int(((duration_days < 10) & (duration_days >= 0)).sum())

    # ------------------------------------------------------------------
    # (16) Institution concentration: most frequent institution's count
    #     divided by the number of loans. value_counts() is empty when
    #     every institution is missing; use 0 instead of letting
    #     max() of an empty Series produce NaN.
    # ------------------------------------------------------------------
    if "identifiedInstitution" in loans.columns:
        institution_counts = loans["identifiedInstitution"].value_counts()
        top_count = institution_counts.max() if not institution_counts.empty else 0
        usage_concentration = top_count / n_loans if n_loans > 0 else 0
    else:
        usage_concentration = 0

    # ------------------------------------------------------------------
    # (17) Missing loan purpose: loanDescription null or empty string.
    # ------------------------------------------------------------------
    if "loanDescription" in loans.columns:
        description = loans["loanDescription"]
        empty_desc_count = int((description.isnull() | (description == "")).sum())
    else:
        empty_desc_count = 0

    # ------------------------------------------------------------------
    # (18) Share of loans whose status is 'performing'.
    # ------------------------------------------------------------------
    if "status" in loans.columns:
        performing_count = int((loans["status"] == "performing").sum())
        performing_ratio = performing_count / n_loans if n_loans > 0 else 0
    else:
        performing_ratio = 0

    # ------------------------------------------------------------------
    # (19) Overlap (simplified): consecutive loans where the next loan
    #     opens before the current loan's due date. Comparisons against
    #     NaT are False, so pairs with a missing date are not counted.
    # ------------------------------------------------------------------
    overlap_count = int((opened_at.shift(-1) < due_at).sum())

    # ------------------------------------------------------------------
    # (20) Year-end / new-year seasonality: share of loans opened in
    #     December or January.
    # ------------------------------------------------------------------
    in_season_count = int(opened_at.dt.month.isin([12, 1]).sum())
    season_ratio = in_season_count / n_loans if n_loans > 0 else 0

    return {
        "userId": user_id,
        "earlyRepaymentCount": early_count,
        "earlyRepaymentRatio": early_ratio,
        "loanAmtIncreaseMaxRatio": max_increase_ratio,
        "limitIncreaseLostCount": lost_after_increase,
        "nUniqueInstitutions_6m": n_institutions_6m,
        "repayMismatchRatio": mismatch_ratio,
        "repay2loanRatio": repay_to_loan,
        "manyAccounts3m": accounts_3m,
        "abnormalRateCount": abnormal_rate_count,
        "repaymentStd": repay_std,
        "quickNewLoanCount": quick_new_loan_count,
        "repeatedRepay": repeated_repay,
        "closeGapCount_30": close_gap_count_30,
        "loanStd": loan_std,
        # (14) Same quantity as earlyRepaymentRatio, kept under its own key.
        "mandatoryEarlyRatio": early_ratio,
        "shortLoanCount": short_loan_count,
        "usageConcentration": usage_concentration,
        "emptyDescCount": empty_desc_count,
        "performingRatio": performing_ratio,
        "overlapCount": overlap_count,
        "seasonRatio_12or1": season_ratio,
    }


def build_feature_table(df):
    """
    Build the per-user feature table.

    Groups ``df`` by userId, runs calculate_user_features() on each user's
    rows, and returns a DataFrame with one row per user and one column per
    feature key.
    """
    # Each group yields one feature dict; a list of dicts becomes a frame.
    per_user_rows = [
        calculate_user_features(user_rows)
        for _, user_rows in df.groupby("userId")
    ]
    return pd.DataFrame(per_user_rows)


if __name__ == "__main__":
    # Steps 1-2: load the labelled CSV files and clean the merged frame.
    df_all = preprocess_credits_data(load_data_from_csv())

    # Step 3: derive one feature row per user.
    feature_df = build_feature_table(df_all)

    # Step 4: preview the result and list the generated feature columns.
    print(feature_df.head(10))
    print("\n== Feature Columns ==")
    print(list(feature_df.columns))
