In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Ver4 data: read the training and test tables from the competition input folder.
train = pd.read_csv(
    "../input/2021ai-project-startup-success-prediction/startup_train.csv"
)
test = pd.read_csv(
    "../input/2021ai-project-startup-success-prediction/startup_test.csv"
)

# Preview the first rows of the training table.
train.head()

In [None]:
# Print each training column together with its number of distinct values
# (a missing value counts as one distinct value).
for column_name, values in train.items():
    print(column_name, values.nunique(dropna=False))

In [None]:
# Summarize column dtypes and non-null counts of the training table as loaded.
train.info()

In [None]:
# Remove the 'id' and 'state_code' columns from both tables.
unused_columns = ['id', 'state_code']
train = train.drop(columns=unused_columns)
test = test.drop(columns=unused_columns)
print(train.shape)

In [None]:
# Re-check the training table's columns after 'id' and 'state_code' were dropped.
train.info()

In [None]:
# Same check for the test table: dtypes and non-null counts per column.
test.info()

In [None]:
# Keep only training rows whose first funding date is on or after 1980-01-01.
# NOTE(review): this is a plain string comparison, so it is chronological only if the
# column holds year-first 'YYYY-MM-DD' text -- confirm against the raw data.
# NOTE(review): rows with a missing 'first_funding_at' presumably compare as False and
# are dropped here as well -- verify that this is intended.
# .copy() makes the result an independent frame, so the column assignments done in
# later cells do not operate on a slice of the original table (SettingWithCopyWarning).
train = train[train['first_funding_at'] >= '1980-01-01'].copy()

In [None]:
# Fill missing categorical values with the most frequent value (mode) of each column.
# Every frame is filled from its own mode. An earlier variant of this cell also
# mode-filled 'founded_at' and 'first_funding_at' in the test data.
need_fillna_test = ['market', 'city', 'country_code']

for frame in (test, train):
    for column in need_fillna_test:
        most_frequent = frame[column].mode()[0]
        frame[column] = frame[column].fillna(most_frequent)

# idea1 => mark missing 'first_funding_at' values with the sentinel -1
for frame in (train, test):
    frame['first_funding_at'] = frame['first_funding_at'].fillna(-1)

In [None]:
# Check whether any missing values remain in the test table.
test.info()

In [None]:
# Add a 'funding_duration' column: days elapsed between the first and last funding.
def _funding_duration_days(frame):
    """Return last_funding_at minus first_funding_at, in whole days, per row of `frame`.

    Rows whose 'first_funding_at' holds the -1 missing-value sentinel yield NaN
    instead of a duration measured from a bogus timestamp.
    """
    first_raw = frame['first_funding_at']
    # Blank out the sentinel before parsing: handing -1 to pd.to_datetime together with
    # date strings either turns it into a timestamp next to the Unix epoch or fails to parse.
    first_funding = pd.to_datetime(first_raw.mask(first_raw == -1))
    last_funding = pd.to_datetime(frame['last_funding_at'])
    return (last_funding - first_funding).dt.days


train['funding_duration'] = _funding_duration_days(train)
test['funding_duration'] = _funding_duration_days(test)

# Durations that could not be computed are imputed with the training median so that
# no NaN feature is passed on to scaling and the network.
typical_duration = train['funding_duration'].median()
train['funding_duration'] = train['funding_duration'].fillna(typical_duration)
test['funding_duration'] = test['funding_duration'].fillna(typical_duration)

In [None]:
# Date columns that are converted to a year value by rep_N_only_year below.
need_replace_year = ['founded_at','first_funding_at', 'last_funding_at']
# need_replace_year = ['founded_at']

def rep_N_only_year (tmp):
    """Extract the year from a date value.

    Parameters
    ----------
    tmp : str or number or None
        A date string whose parts are separated by '-' or '/' (e.g. '2010-05-01'
        or '05/01/2010'), the missing-value sentinel -1, NaN/None, or a year that
        has already been converted to a number.

    Returns
    -------
    int or float
        The first date component that is a whole number >= 1000, or ``np.nan``
        when the value is missing or contains no such component.
    """
    if not isinstance(tmp, str):
        # Non-string input: sentinel (-1), NaN, None, or an already-extracted year
        # (which makes re-running the conversion on a converted column harmless).
        try:
            value = float(tmp)
        except (TypeError, ValueError):
            return np.nan
        if not np.isfinite(value) or value < 1000:
            return np.nan
        return int(value)

    # Treat '/', '-' and blanks alike so a trailing time part cannot break parsing.
    for part in tmp.replace('/', '-').replace(' ', '-').split('-'):
        # Skip anything that is not a plain number instead of letting int() raise.
        if part.isdecimal() and int(part) >= 1000:
            return int(part)

    # No year-like component found: report it as missing rather than returning None.
    return np.nan

# Replace every listed date column with its year, in both frames.
for frame in (train, test):
    for col in need_replace_year:
        frame[col] = frame[col].apply(rep_N_only_year)

# Preview the converted test table.
test.head()

In [None]:
# Features: every column except the company name (and, for train, the target).
x_train = train.drop(columns=['name', 'status'])
x_test = test.drop(columns=['name'])
# Target: the 'status' column of the training table.
y_train = train['status']

In [None]:
# Inspect dtypes and non-null counts of the training features.
x_train.info()

In [None]:
# Inspect dtypes and non-null counts of the test features.
x_test.info()

In [None]:
# Preview the first rows of the training features.
x_train.head()

In [None]:
# Preview the first rows of the test features.
x_test.head()

In [None]:
# List the column names currently present in the training table.
print(train.columns)

In [None]:
# Fill the remaining gaps in the test 'first_funding_at' column with that column's mode.
first_funding_mode = x_test['first_funding_at'].mode()[0]
x_test['first_funding_at'] = x_test['first_funding_at'].fillna(first_funding_mode)

In [None]:
# Label-encode only the 'country_code', 'city' and 'market' columns.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
need_le = { 'country_code', 'city', 'market'}

for col in need_le:
    # Fit on the train and test values together so both frames share one mapping.
    le.fit(pd.concat([x_train[col], x_test[col]], axis = 0))
    for frame in (x_train, x_test):
        frame[col] = le.transform(frame[col])

# Re-fit the same encoder on the target; from here on `le` maps status labels to
# integers, and the prediction cell uses it to decode the model output.
y_train = le.fit_transform(y_train)

In [None]:
# Preview the training features after label encoding.
x_train.head()

In [None]:
import torch
import torch.cuda
import torch.optim as optim

# Seed PyTorch's random number generator.
torch.manual_seed(1)

In [None]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
# Standardize the features. The scaler learns its mean/variance from the training
# data only; the test data is transformed with those same statistics so that both
# sets live on the same scale (re-fitting on the test set would standardize it
# differently from what the model was trained on).
sc = StandardScaler()
x_train = pd.DataFrame(sc.fit_transform(x_train))
x_test = pd.DataFrame(sc.transform(x_test))

In [None]:
# Convert features and labels to tensors and place them on the GPU when one is
# available, otherwise on the CPU, so the cell also runs on CPU-only runtimes.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
x_train = torch.FloatTensor(x_train.values).to(device)
y_train = torch.LongTensor(y_train).to(device)
x_test = torch.FloatTensor(x_test.values).to(device)

In [None]:
from torch.utils.data import DataLoader, TensorDataset
# Pair features with labels and serve them in shuffled mini-batches of 100;
# an incomplete final batch is discarded.
train_data = TensorDataset(x_train, y_train)
dataloader = DataLoader(
    dataset=train_data,
    batch_size=100,
    shuffle=True,
    drop_last=True,
)

In [None]:
# Show the shape of the training feature tensor; its second dimension sizes the first layer.
print(x_train.shape)

In [None]:
# Layer sizes: input width comes from the data; four hidden layers of 512 units;
# four output units.
n_features = x_train.shape[1]
hidden_units = 512
n_outputs = 4

linear1 = torch.nn.Linear(n_features, hidden_units)
linear2 = torch.nn.Linear(hidden_units, hidden_units)
linear3 = torch.nn.Linear(hidden_units, hidden_units)
linear4 = torch.nn.Linear(hidden_units, hidden_units)
linear5 = torch.nn.Linear(hidden_units, n_outputs)

# Shared activation and a dropout module with drop probability 0.3.
relu = torch.nn.ReLU()
dropout = torch.nn.Dropout(p=0.3)


In [None]:
# Re-initialize the weight matrix of every linear layer with Xavier-uniform values,
# in layer order.
for layer in (linear1, linear2, linear3, linear4, linear5):
    torch.nn.init.xavier_uniform_(layer.weight)

In [None]:
# Stack the five linear layers with a ReLU between consecutive layers; the last
# layer's raw outputs are returned unchanged.
# The model is moved to the device that holds the training tensor instead of a
# hard-coded CUDA device, so model and data always end up on the same device.
model = torch.nn.Sequential(linear1, relu,
                            linear2, relu,
                            linear3, relu,
                            linear4, relu,
                            linear5).to(x_train.device)

In [None]:
# Cross-entropy objective computed from the model's raw outputs.
loss = torch.nn.CrossEntropyLoss()

# Adam over all model parameters with learning rate 0.001.
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

In [None]:
# Train for a fixed number of epochs and print the mean batch loss of each epoch.
total_batch = len(dataloader)
training_epoch = 10
# Feed batches to whichever device the model's parameters live on.
model_device = next(model.parameters()).device

for epoch in range(training_epoch):
    # Plain Python float: printing works even if the loader yields no batch.
    avg_cost = 0.0

    for X, Y in dataloader:
        X = X.to(model_device)
        Y = Y.to(model_device)

        optimizer.zero_grad()

        hy = model(X)
        cost = loss(hy, Y)
        cost.backward()
        optimizer.step()

        # .item() extracts the number, so the running average does not keep
        # autograd history from every batch attached to it.
        avg_cost += cost.item() / total_batch

    print(epoch, avg_cost)
        

In [None]:
# Predict the test set without tracking gradients: take the highest-scoring class
# per row, then decode the integer ids back to status labels with `le`.
with torch.no_grad():
    logits = model(x_test)
    class_ids = logits.argmax(dim=1).cpu().numpy().astype(int)
    print(class_ids)
    predict = le.inverse_transform(class_ids)
    print(predict)

In [None]:
# Load the submission template and put the decoded predictions into 'status'.
submit = pd.read_csv("../input/2021ai-project-startup-success-prediction/submit.csv")
submit = submit.assign(status=predict)

def is_closed(tmp):
    """Map a status label to the binary target: 0 for 'closed', 1 for anything else."""
    return 0 if tmp == 'closed' else 1

# Convert the status labels to 0/1 and write the submission file without the index.
submit['status'] = submit['status'].map(is_closed)
submit.to_csv("make_baseline_ver4.csv", index=False)

In [None]:
# Marker printed when the notebook has run through to the end.
print('finish')

In [None]:
# Save format
#  Baseline () [DATA_VER4]