In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
import pandas as pd
df_train = pd.read_csv("/content/drive/MyDrive/LGaimers/train.csv") # 학습용 데이터
df_test = pd.read_csv("/content/drive/MyDrive/LGaimers/submission.csv") # 테스트 데이터(제출파일의 데이터)

In [8]:
df = pd.DataFrame(df_train['expected_timeline'])
conditions = ['less than 3 months', '3 months ~ 6 months', 'more than a year',
              '9 months ~ 1 year', '6 months ~ 9 months', 'less than 6 months', 'etc.', 'less_than_3_months','being followed up','being followed up.']
df = df[~df.isin(conditions)]
df.dropna(inplace = True)
df['new_index'] = df.index
df = df.set_index('new_index')
df.reset_index(level=0, inplace=True)
df.to_csv("/content/drive/MyDrive/LGaimers/expected_timeline.csv", index=False)
df['new_index'] = df.index
df = df.set_index('new_index')
df.reset_index(level=0, inplace=True)


In [9]:
def categorize_timeline_refined(description):
    description = description.lower()

    # Explicit matches and interpretations based on train.csv insights
    if 'less than 3 months' in description or 'less_than_3_months' in description:
        return 'less than 3 months'
    elif any(x in description for x in ['3 months ~ 6 months', '3_months_~_6_months']):
        return '3 months ~ 6 months'
    elif 'more than a year' in description or 'more then 3 months' in description:
        return 'more than a year'
    elif any(x in description for x in ['9 months - 1 year', '9_months_~_1_year']):
        return '9 months ~ 1 year'
    elif '6 months ~ 9 months' in description or '4/8 months' in description:
        return '6 months ~ 9 months'
    elif 'etc.' in description or 'being followed up' in description:
        return 'Uncategorized'

    # Direct matches based on refined insights
    elif any(keyword in description for keyword in ['less than 3 months', 'less_than_3_months', '< 3 months', 'short term', 'immediate']):
        return 'less than 3 months'
    elif any(keyword in description for keyword in ['3 months ~ 6 months', '3 to 6 months', '3-6 months', 'quarterly']):
        return '3 months ~ 6 months'
    elif any(keyword in description for keyword in ['more than a year', '> 1 year', 'long term']):
        return 'more than a year'
    elif any(keyword in description for keyword in ['9 months ~ 1 year', '9-12 months', '9 to 12 months']):
        return '9 months ~ 1 year'
    elif any(keyword in description for keyword in ['6 months ~ 9 months', '6-9 months', '6 to 9 months']):
        return '6 months ~ 9 months'

    # Keywords suggesting immediacy or short term without explicit time frame
    elif any(keyword in description for keyword in ['urgent', 'asap', 'soon', 'immediately']):
        return 'less than 3 months'

    # Keywords suggesting longer term without specifying 'year'
    elif any(keyword in description for keyword in ['future', 'planning', 'next year']):
        return 'more than a year'

    # Default to 'Uncategorized' if no keywords match


    # Handling additional patterns and keywords
    num_months = [int(s) for s in description.split() if s.isdigit()]
    if 'month' in description or 'months' in description:
        if num_months:
            num_months = num_months[0]  # Use the first found number
            if num_months < 3:
                return 'less than 3 months'
            elif 3 <= num_months < 6:
                return '3 months ~ 6 months'
            elif 6 <= num_months < 9:
                return '6 months ~ 9 months'
            elif 9 <= num_months <= 12:
                return '9 months ~ 1 year'
            else:
                return 'more than a year'
        else:
            # Default to 'Uncategorized' for descriptions with "month(s)" but no clear numeric value
            return 'Uncategorized'
    elif 'week' in description or 'day' in description or 'asked to call' in description or 'following' in description:
        return 'less than 3 months'
    elif 'year' in description:
        return 'more than a year'

    # Default category for entries that don't fit any specific category
    return 'Uncategorized'

# Re-apply the refined categorization function to the dfset
df['category_refined'] = df['expected_timeline'].apply(categorize_timeline_refined)

# Re-check the distribution of the refined categories and display a sample
category_distribution_refined = df['category_refined'].value_counts()
df_sample_categorized_refined = df.sample(10, random_state=4)

category_distribution_refined, df_sample_categorized_refined[['expected_timeline', 'category_refined']]


(Uncategorized          484
 less than 3 months      48
 more than a year        13
 3 months ~ 6 months     13
 9 months ~ 1 year        7
 6 months ~ 9 months      4
 Name: category_refined, dtype: int64,
                                      expected_timeline    category_refined
 340                                       budget issue       Uncategorized
 382                                         low budget       Uncategorized
 300  known partner and we have installed 3 video wa...       Uncategorized
 262                 customer need shortthrow projector       Uncategorized
 363                                6_months_~_9_months       Uncategorized
 157                                                rnr       Uncategorized
 120  less than 3 months ,meeting with the customer ...  less than 3 months
 139  discussed with client.they are looking for aio...       Uncategorized
 99                             demo aligned for client       Uncategorized
 68   discussed with client they 