In [124]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [125]:
import pandas as pd
df_train = pd.read_csv("/content/drive/MyDrive/LGaimers/train.csv") # 학습용 데이터
df_test = pd.read_csv("/content/drive/MyDrive/LGaimers/submission.csv") # 테스트 데이터(제출파일의 데이터)

In [126]:
df_original = pd.DataFrame(df_train['expected_timeline'])

# 괜찮은건 일단 빼놓자 보기 편하게
conditions = ['less than 3 months', '3 months ~ 6 months', 'more than a year',
              '9 months ~ 1 year', '6 months ~ 9 months', 'less than 6 months']
df = df_original[~df_original.isin(conditions)]
df.dropna(inplace = True)
# merge하기 위한 new_index
df['new_index'] = df.index
df = df.reset_index(drop=True)

In [127]:
def categorize_timeline_refined(description):
    description = description.lower()

    # Explicit matches and interpretations based on train.csv insights
    if 'less than 3 months' in description or 'less_than_3_months' in description:
        return 'less than 3 months'
    elif any(x in description for x in ['3 months ~ 6 months', '3_months_~_6_months']):
        return '3 months ~ 6 months'
    elif 'more than a year' in description or 'more then 3 months' in description:
        return 'more than a year'
    elif any(x in description for x in ['9 months - 1 year', '9_months_~_1_year']):
        return '9 months ~ 1 year'
    elif 'etc.' in description or 'being followed up' in description:
        return 'Uncategorized'
    elif any(keyword in description for keyword in ['less than 3 months', 'less_than_3_months', '< 3 months', 'short term', 'immediate']):
        return 'less than 3 months'
    elif any(keyword in description for keyword in ['3 months ~ 6 months', '3 to 6 months', '3-6 months', 'quarterly']):
        return '3 months ~ 6 months'
    elif any(keyword in description for keyword in ['more than a year', '> 1 year', 'long term','more_than_a_year']):
        return 'more than a year'
    elif any(keyword in description for keyword in ['9 months ~ 1 year', '9-12 months', '9 to 12 months']):
        return '9 months ~ 1 year'
    elif any(keyword in description for keyword in ['6 months ~ 9 months', '6-9 months', '6 to 9 months','6_months_~_9_months']):
        return '6 months ~ 9 months'
    return 'Uncategorized'

# Re-apply the refined categorization function to the dfset
df['category_refined'] = df['expected_timeline'].apply(categorize_timeline_refined)

df = df[['new_index','expected_timeline'	,'category_refined'	]]
#df_original = df_original.apply(categorize_timeline_refined)
df_original['new_index'] = df_original.index
df_original = df_original.reset_index(drop=True)

In [128]:
df.to_csv("/content/drive/MyDrive/LGaimers/expected_timeline.csv", index=False)

In [129]:
df['category_refined'].value_counts()

Uncategorized          723
less than 3 months      81
3 months ~ 6 months      9
9 months ~ 1 year        7
more than a year         6
6 months ~ 9 months      4
Name: category_refined, dtype: int64

In [130]:
df_original['expected_timeline'].value_counts().head(20)

less than 3 months                                                                                                    17250
3 months ~ 6 months                                                                                                    5026
more than a year                                                                                                       3023
9 months ~ 1 year                                                                                                      1101
6 months ~ 9 months                                                                                                    1098
less than 6 months                                                                                                      108
etc.                                                                                                                     95
less_than_3_months                                                                                                       76
being fo

In [131]:
# df와 df_original을 'new_index'를 기준으로 병합
# how='left'를 사용하여 df_original의 모든 행을 유지하고 해당하는 행에만 category_refined 값을 채웁니다.
df_merged = pd.merge(df_original, df[['new_index', 'category_refined']], on='new_index', how='left')
# category_refined의 NaN 값을 expected_timeline의 값으로 채움
df_merged['category_refined'] = df_merged['category_refined'].fillna(df_merged['expected_timeline'])

# 결과 확인
print(df_merged['category_refined'].value_counts())


less than 3 months     17331
3 months ~ 6 months     5035
more than a year        3029
9 months ~ 1 year       1108
6 months ~ 9 months     1102
Uncategorized            723
less than 6 months       108
Name: category_refined, dtype: int64


### 추가 전처맄

In [132]:
#noga = pd.read_csv('/content/drive/MyDrive/LGaimers/코드파일/expected_timeline_240216.csv', encoding = 'cp949')

In [133]:
#noganoga = pd.DataFrame(noga['nogada'].dropna(axis = 0))
#noganoga['new_index'] = noganoga.index
#noganoga = noganoga.reset_index(drop=True)

In [134]:
#noganoga.loc[~noganoga['nogada'].isin(['less than 3 months', '3 months ~ 6 months']), 'nogada'] = None

In [135]:
#noganoga['nogada'].value_counts()

In [136]:
#df_merged = pd.merge(df_merged, noganoga[['new_index', 'nogada']], on='new_index', how='left')

In [137]:
#df_merged['category_refined'].value_counts()

In [138]:
# nogada 열에서 NaN이 아닌 값들을 expected_timeline에 덮어쓰기
#df_merged.loc[df_merged['nogada'].notnull(), 'expected_timeline'] = df_merged['nogada']

# 결과 확인

In [139]:
nogada_dict = {26: 'less than 3 months',
 55: 'less than 3 months',
 56: 'less than 3 months',
 64: '3 months ~ 6 months',
 76: 'less than 3 months',
 82: 'less than 3 months',
 83: 'less than 3 months',
 108: 'less than 3 months',
 109: 'less than 3 months',
 111: 'less than 3 months',
 123: 'less than 3 months',
 135: 'less than 3 months',
 144: 'less than 3 months',
 158: 'less than 3 months',
 185: 'less than 3 months',
 191: 'less than 3 months',
 205: 'less than 3 months',
 208: '3 months ~ 6 months',
 407: 'less than 3 months',
 423: 'less than 3 months',
 506: '3 months ~ 6 months',
 540: '3 months ~ 6 months'}

In [140]:
# noganoga에서 Null이 아닌 nogada 값을 가지는 행에 대해 new_index를 키로, nogada 값을 value로 하는 딕셔너리 생성

# df_merged의 expected_timeline 열에서 딕셔너리의 키(new_index)에 해당하는 값들을 딕셔너리의 value(nogada 값)로 교체
df_merged['category_refined'] = df_merged['new_index'].map(nogada_dict).fillna(df_merged['category_refined'])

df_merged['category_refined'].value_counts()

less than 3 months     17336
3 months ~ 6 months     5036
more than a year        3028
9 months ~ 1 year       1107
6 months ~ 9 months     1101
Uncategorized            723
less than 6 months       108
Name: category_refined, dtype: int64