# Import and Read the Data

In [9]:
from typing import Any
import numpy as np
import pandas as pd

In [2]:
# Locations of the CSV inputs read below.
TRAIN_FILE = "./train.csv"
TEST_FILE = "./submission.csv"

# Training frame, plus two independent reads of TEST_FILE: one used to build
# the test features and one kept as a separate frame.
df_train = pd.read_csv(TRAIN_FILE)
df_test = pd.read_csv(TEST_FILE)
df_submission = pd.read_csv(TEST_FILE)

# Split the training frame into features (X) and the "is_converted" target (y).
X = df_train.drop(columns="is_converted")
y = df_train["is_converted"]

# Test features: the same frame without the target column and the "id" column.
X_test = df_test.drop(columns=["is_converted", "id"])

# Basic Text Cleaning

## Checking data type

In [10]:
def check_data_type(data: Any) -> Any:
    """Return the type object of ``data`` as reported by the built-in ``type``."""
    data_type = type(data)
    return data_type

## Normalizing Case

In [51]:
def lower_case(text: str) -> str:
    """Return a lower-cased copy of ``text`` (via ``str.lower``)."""
    lowered = text.lower()
    return lowered

## White-Space split

In [107]:
def white_space_split(sentence: Any) -> set:
    """Return the set of distinct whitespace-separated tokens in ``sentence``.

    ``sentence`` is converted with ``str()`` first, so non-string inputs are
    tokenized by their string representation.
    """
    tokens = str(sentence).split()
    return {*tokens}

## Removing Punctuation

In [115]:
import re
import string

# Deletion table built once at import time: maps every character of
# string.punctuation to None so str.translate drops it.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_punctuation(text: str) -> str:
    """Return ``text`` with every character of ``string.punctuation`` removed.

    A translation table is used instead of an unescaped regex character class.
    When ``string.punctuation`` is interpolated raw into ``[...]``, the
    ``\\]`` pair is parsed as an escaped ``]``, so the backslash itself is
    never part of the class and survives the substitution.

    Args:
        text: The string to clean.

    Returns:
        A copy of ``text`` without ASCII punctuation; all other characters,
        including whitespace, are left unchanged.
    """
    return text.translate(_PUNCTUATION_TABLE)

## Counting the Most Frequent Values

In [111]:
# Fill missing timeline entries from neighbouring rows (forward, then
# backward), then pass the text through lower_case and remove_punctuation.
et = (
    X['expected_timeline']
    .ffill()
    .bfill()
    .apply(lower_case)
    .apply(remove_punctuation)
)

# Selects the rows whose value is among the 20 most frequent ones.
# The result is not assigned, so this line does not change `et`.
et.loc[et.isin(et.value_counts().head(20).index)]

# Count once and reuse: the 30 most frequent values and their frequencies.
_top_values = et.value_counts().head(30)

# NOTE(review): "kidns" looks like a typo for "kinds"; it is kept because the
# column name is runtime text that later cells or outputs may rely on.
new_df = pd.DataFrame({
    "value_counts": _top_values.tolist(),
    "kidns": _top_values.index.tolist(),
})
new_df

Unnamed: 0,value_counts,kidns
0,34568,less than 3 months
1,10710,3 months 6 months
2,6241,more than a year
3,4128,6 months 9 months
4,1609,9 months 1 year
5,234,45 days
6,156,being followed up
7,152,already touch with customers
8,135,etc
9,129,lessthan3months


## Categorizing the Timeline

In [132]:
# Ordered matching rules: (category, phrases that must ALL occur, compact alias).
# The compact alias is the same wording with the spaces removed; such values
# occur in the data (e.g. "lessthan3months"). Earlier rules win.
_TIMELINE_RULES = (
    ("6 months", ("less than 3 months",), "lessthan3months"),
    ("6 months", ("3 months", "6 months"), "3months6months"),
    ("1 year", ("6 months", "9 months"), "6months9months"),
    ("1 year", ("9 months", "1 year"), "9months1year"),
    ("over year", ("more than a year",), "morethanayear"),
)


def categorize_timeline(text: str) -> str:
    """Map a cleaned ``expected_timeline`` string to a coarse bucket.

    The input is expected to be lower-cased with punctuation removed.
    Buckets are checked in the order of ``_TIMELINE_RULES``:

    * ``'6 months'``  - "less than 3 months" or "3 months ... 6 months"
    * ``'1 year'``    - "6 months ... 9 months" or "9 months ... 1 year"
    * ``'over year'`` - "more than a year"
    * ``'Other'``     - everything else, including the empty string

    Every rule also accepts its space-less spelling, so ``lessthan3months``
    is bucketed like ``less than 3 months`` instead of falling to ``'Other'``.
    There is no separate check for "no requirement"/"not"/"no" because such
    text ends up in ``'Other'`` through the fallback anyway.

    Args:
        text: Cleaned timeline description.

    Returns:
        One of ``'6 months'``, ``'1 year'``, ``'over year'`` or ``'Other'``.
    """
    for category, phrases, compact in _TIMELINE_RULES:
        if compact in text or all(phrase in text for phrase in phrases):
            return category
    return 'Other'

## Preprocessing expected_timeline

In [137]:
def preprocessing_expected_timeline(df):
    """Return the ``expected_timeline`` column of ``df`` as timeline categories.

    Steps, in order:
      1. Missing entries are forward-filled, then backward-filled; anything
         still missing afterwards is replaced by the string "0".
      2. Each value goes through ``lower_case`` and ``remove_punctuation``.
      3. Each cleaned value is mapped with ``categorize_timeline``.

    ``df`` itself is not modified; the categorized column is returned.
    """
    raw_timeline = df['expected_timeline']
    filled = raw_timeline.ffill().bfill().fillna("0")
    cleaned = filled.apply(lower_case).apply(remove_punctuation)
    categories = cleaned.apply(categorize_timeline)
    return categories

In [140]:
# Overwrite the raw expected_timeline column of X with its categorized version.
X['expected_timeline'] = preprocessing_expected_timeline(X)
# Bare expression: evaluates to the transformed column (shown as the cell output).
X['expected_timeline']

0         6 months
1         6 months
2         6 months
3         6 months
4         6 months
           ...    
59294     6 months
59295       1 year
59296     6 months
59297    over year
59298     6 months
Name: expected_timeline, Length: 59299, dtype: object