In [2]:
#check 2
import pandas as pd

In [3]:
# Load the dataset stored as a pickle under Tables/.
# NOTE: unpickling can execute arbitrary code, so only read pickle files
# that come from a trusted source.
df = pd.read_pickle('Tables/loaded_data.pkl')

In [4]:
# Keep only projects whose outcome is known (successful or failed); these are
# the rows usable for training.
df = df[df['state'].isin(['successful', 'failed'])]

In [5]:
# The profiling report showed these columns have no distinct values, so remove them.
constant_columns = [
    'disable_communication',
    'friends',
    'is_backing',
    'is_starred',
    'permissions',
]
df = df.drop(columns=constant_columns)

In [6]:
# Remove columns that are highly correlated with others (e.g. currency vs. currency symbol).
redundant_columns = [
    'country_displayable_name',
    'currency_symbol',
    'currency_trailing_code',
    'static_usd_rate',
    'usd_exchange_rate',
    'usd_type',
]
df = df.drop(columns=redundant_columns)

In [7]:
# Remove fields whose values are not known when a project begins.
outcome_columns = [
    'backers_count',
    'converted_pledged_amount',
    'pledged',
    'spotlight',
    'state_changed_at',
    'usd_pledged',
]
df = df.drop(columns=outcome_columns)

In [8]:
# Remove columns judged irrelevant based on business knowledge.
irrelevant_columns = [
    'currency',
    'current_currency',
    'fx_rate',
    'slug',
    'source_url',
    'location_city',
]
df = df.drop(columns=irrelevant_columns)

In [9]:
# Remove columns that are hard to model as-is (candidates for future NLP features).
hard_to_model_columns = [
    'blurb',
    'creator',
    'name',
]
df = df.drop(columns=hard_to_model_columns)

### Feature Engineering

Synthetic Features to create
- Campaign Length (days from launch to deadline)
- Prep Time (days from creation to launch)
- Month of Launch
- Weekday of Launch
- Hour of Launch


In [10]:
# Derive synthetic features from the timestamp columns. The `.dt` accessor
# requires 'deadline', 'launched_at' and 'created_at' to be datetime-like.
launch_time = df['launched_at']

# Durations in whole days.
df['campaign_length'] = (df['deadline'] - launch_time).dt.days
df['prep_time'] = (launch_time - df['created_at']).dt.days

# Calendar components of the launch timestamp.
df['month_of_launch'] = launch_time.dt.month
df['weekday_of_launch'] = launch_time.dt.weekday
df['hour_of_launch'] = launch_time.dt.hour

In [11]:
# The raw timestamps are hard for an ML model to interpret; remove them.
date_columns = [
    'deadline',
    'created_at',
    'launched_at',
]
df = df.drop(columns=date_columns)

### Encoding

In [16]:
# Encode Data
from evalml.pipelines.components.transformers import OneHotEncoder

# Number of categories to keep per column. The same constant is reused further
# down when less frequent values are bucketed into 'Other'.
number_of_categories = 10
# Shared encoder instance used by encode().
# NOTE(review): the output recorded for this cell is an ImportError raised from
# inside the evalml/featuretools import, in which case `ohe` is never defined.
# Confirm the environment has compatible versions installed.
# NOTE(review): top_n presumably limits how many categories per column are
# encoded; verify against the installed evalml version.
ohe = OneHotEncoder(top_n=number_of_categories)

ImportError: cannot import name 'DiversityScore' from 'featuretools.primitives' (/anaconda/envs/azureml_py310_sdkv2/lib/python3.10/site-packages/featuretools/primitives/__init__.py)

In [13]:
def encode(df, column_to_encode, encoder=None):
    """Fit an encoder on the selected columns and return their encoded form.

    Parameters
    ----------
    df : pandas.DataFrame
        Table that contains the columns to encode.
    column_to_encode : str or list of str
        Column name, or list of column names, to select from ``df``. A single
        name is wrapped in a list so the encoder always receives a DataFrame.
    encoder : object, optional
        Any object exposing ``fit(X)`` and ``transform(X)``. When omitted, the
        module-level ``ohe`` instance is used, which keeps existing calls
        working unchanged.

    Returns
    -------
    Whatever ``encoder.transform`` returns for the selected columns.

    Raises
    ------
    NameError
        If ``encoder`` is omitted and the module-level ``ohe`` has not been
        created (for example because the cell that builds it failed).
    """
    if encoder is None:
        # Fall back to the shared encoder; looked up at call time.
        encoder = ohe
    if isinstance(column_to_encode, str):
        column_to_encode = [column_to_encode]

    selected = df[column_to_encode]
    # Note: fitting mutates the encoder, so the shared instance is refit on every call.
    encoder.fit(selected)
    return encoder.transform(selected)

In [14]:
# Values that are not among the most frequent `number_of_categories` values of a
# column are replaced with 'Other' before encoding.
# NOTE(review): after bucketing, a column can hold number_of_categories + 1
# distinct values (the frequent ones plus 'Other'); confirm how the encoder's
# top_n setting treats the extra level.
columns_to_encode = [
    'location_state',
    'country',
    'category',
    'staff_pick',
    'is_starrable',
]

for column in columns_to_encode:
    frequency = df[column].value_counts()
    top_cats = frequency.index[:number_of_categories].tolist()
    is_rare = ~df[column].isin(top_cats)
    df.loc[is_rare, column] = 'Other'

In [15]:
# One-hot encode the categorical columns, then attach the indicator columns to
# the remaining features.
encoded_df = encode(df, columns_to_encode)
df = df.drop(columns=columns_to_encode)

# The encoded rows are combined with df by row position via the shared index.
# The previous approach copied `id` across (which already relied on matching
# indexes) and then merged on it; a merge on `id` multiplies rows whenever `id`
# is not unique. Fail loudly if the encoder did not preserve the index rather
# than silently misaligning rows.
if not encoded_df.index.equals(df.index):
    raise ValueError('Encoded columns are not row-aligned with df; cannot combine them.')

# reset_index keeps the fresh RangeIndex that pd.merge used to produce.
feature_df = pd.concat([df, encoded_df], axis=1).reset_index(drop=True)

# Kept for backward compatibility: encoded_df still carries the `id` key afterwards.
encoded_df['id'] = df['id']

In [15]:
# Separate the label column ('state') from the feature matrix.
target_df = feature_df['state']
feature_df = feature_df.drop(columns='state')

In [16]:
# Persist the feature matrix and the target as pickle files.
for table, pickle_path in (
    (feature_df, 'Tables/feature_df.pkl'),
    (target_df, 'Tables/target_df.pkl'),
):
    table.to_pickle(pickle_path)

In [17]:
# Also write the feature matrix and the target as CSV files.
for table, csv_path in (
    (feature_df, 'Tables/feature_df.csv'),
    (target_df, 'Tables/target_df.csv'),
):
    table.to_csv(csv_path)

In [None]:
from pandas_profiling import ProfileReport

# Build a profiling report over the engineered feature table.
profile = ProfileReport(feature_df)
# Write the report as an HTML file; the path is relative to the working directory.
profile.to_file(output_file='Feature Profile.html')