### Importando as Bibliotecas que serão utilizadas

In [23]:
import pandas as pd
import numpy as np
import sklearn as sk

### Importando os Dados

In [24]:
# Read the three competition files from the working directory, in the same
# order as before: test set, training set, submission template.
test, train, sample = [
    pd.read_csv(csv_name)
    for csv_name in ('test.csv', 'train.csv', 'sample_submission.csv')
]

# Preview the first rows of the training frame.
train.head()

Unnamed: 0,id,age_first_funding_year,age_last_funding_year,age_first_milestone_year,age_last_milestone_year,relationships,funding_rounds,funding_total_usd,milestones,is_CA,...,is_consulting,is_othercategory,has_VC,has_angel,has_roundA,has_roundB,has_roundC,has_roundD,avg_participants,labels
0,719,10.42,13.09,8.98,12.72,4,3,4087500,3,1,...,0,0,1,1,0,0,0,0,1.0,0
1,429,3.79,3.79,,,21,1,45000000,0,0,...,0,0,0,0,0,1,0,0,1.0,1
2,178,0.71,2.28,1.95,2.28,5,2,5200000,2,1,...,0,1,1,0,1,0,0,0,1.0,0
3,197,3.0,5.0,9.62,10.39,16,2,14500000,2,0,...,0,0,0,1,0,1,0,0,2.0,1
4,444,0.66,5.88,6.21,8.61,29,5,70000000,4,1,...,0,0,0,0,1,1,1,1,2.8,1


### Limpeza e Tratamento de Valores Nulos

In [25]:
# Per-column count of missing values in the training frame
# (`isna` is the pandas alias of `isnull`).
train.isna().sum()


id                            0
age_first_funding_year       35
age_last_funding_year         9
age_first_milestone_year    138
age_last_milestone_year     111
relationships                 0
funding_rounds                0
funding_total_usd             0
milestones                    0
is_CA                         0
is_NY                         0
is_MA                         0
is_TX                         0
is_otherstate                 0
category_code                 0
is_software                   0
is_web                        0
is_mobile                     0
is_enterprise                 0
is_advertising                0
is_gamesvideo                 0
is_ecommerce                  0
is_biotech                    0
is_consulting                 0
is_othercategory              0
has_VC                        0
has_angel                     0
has_roundA                    0
has_roundB                    0
has_roundC                    0
has_roundD                    0
avg_part

In [26]:
# The four columns that reported missing values in the training null count.
age_cols = [
    'age_first_funding_year',
    'age_last_funding_year',
    'age_first_milestone_year',
    'age_last_milestone_year'
]

# Replace missing values with 0 in both frames, all four columns at once.
# NOTE(review): after this step a missing value is indistinguishable from a
# genuine 0 in these columns -- confirm this is the intended imputation.
train[age_cols] = train[age_cols].fillna(0)
test[age_cols] = test[age_cols].fillna(0)

# Only the last expression of a cell is rendered automatically, so the train
# check must be displayed explicitly; otherwise its result is silently dropped.
# `display` is the builtin provided by the IPython/Jupyter kernel.
display(train.isnull().sum())
test.isnull().sum()

id                          0
age_first_funding_year      0
age_last_funding_year       0
age_first_milestone_year    0
age_last_milestone_year     0
relationships               0
funding_rounds              0
funding_total_usd           0
milestones                  0
is_CA                       0
is_NY                       0
is_MA                       0
is_TX                       0
is_otherstate               0
category_code               0
is_software                 0
is_web                      0
is_mobile                   0
is_enterprise               0
is_advertising              0
is_gamesvideo               0
is_ecommerce                0
is_biotech                  0
is_consulting               0
is_othercategory            0
has_VC                      0
has_angel                   0
has_roundA                  0
has_roundB                  0
has_roundC                  0
has_roundD                  0
avg_participants            0
dtype: int64

### Codificação de Variáveis Categóricas

In [27]:
# Distinct values of the categorical column, in order of first appearance.
pd.unique(train['category_code'])

array(['enterprise', 'advertising', 'photo_video', 'web', 'software',
       'games_video', 'mobile', 'sports', 'fashion', 'semiconductor',
       'security', 'biotech', 'hardware', 'public_relations', 'ecommerce',
       'other', 'social', 'cleantech', 'manufacturing', 'automotive',
       'network_hosting', 'messaging', 'music', 'travel', 'consulting',
       'search', 'finance', 'medical', 'analytics', 'real_estate',
       'education', 'health', 'news', 'transportation'], dtype=object)

In [28]:
from sklearn.preprocessing import OneHotEncoder

# Name of the categorical column that gets replaced by one-hot columns.
CATEGORY_COL = 'category_code'

# Guard so the cell can be executed more than once: after the first run the
# column has already been dropped from `train`/`test`, and selecting it again
# would raise a KeyError. On a re-run the frames are left as they are.
if CATEGORY_COL in train.columns:
    # Fit on the training data only. With handle_unknown='ignore', a category
    # not seen during fit is encoded as an all-zero row instead of raising.
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    ohe.fit(train[[CATEGORY_COL]])

    train_cat_encoded = ohe.transform(train[[CATEGORY_COL]])
    test_cat_encoded = ohe.transform(test[[CATEGORY_COL]])

    # Wrap the dense arrays in DataFrames that reuse the source indexes, so the
    # column-wise concat below aligns row by row.
    cols_encoded = ohe.get_feature_names_out([CATEGORY_COL])
    train_cat_df = pd.DataFrame(train_cat_encoded, columns=cols_encoded, index=train.index)
    test_cat_df = pd.DataFrame(test_cat_encoded, columns=cols_encoded, index=test.index)

    # Swap the raw categorical column for its encoded columns.
    train = pd.concat([train.drop(columns=[CATEGORY_COL]), train_cat_df], axis=1)
    test = pd.concat([test.drop(columns=[CATEGORY_COL]), test_cat_df], axis=1)
