In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor


In [2]:
# Read the training and test tables from the Colab working directory.
train, test = (
    pd.read_csv(path)
    for path in ('/content/train.csv', '/content/test.csv')
)


In [3]:
# Report (rows, columns) for both tables, one per line.
print(
    f'Train shape: {train.shape}',
    f'Test shape: {test.shape}',
    sep='\n',
)


Train shape: (1624, 212)
Test shape: (406, 203)


In [4]:
# Tag every row with its origin so the combined frame can be split again later.
for frame, flag in ((train, True), (test, False)):
    frame['is_train'] = flag

# Stack train on top of test under a fresh 0..n-1 index.
df = pd.concat((train, test), axis=0, ignore_index=True)


In [6]:
# Binarize column
def binarize(df, col):
    """Replace ``df[col]`` with a 0/1 indicator of its first observed value.

    The first non-missing value of the column (in order of appearance) is
    encoded as 1; every other value, including missing entries, is encoded
    as 0. The column is overwritten in place on ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column; modified in place.
    col : str
        Name of the column to binarize.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    observed = df[col].dropna().unique()

    # A column with no observed values has no reference category. Missing
    # entries already map to 0, so encode the whole column as 0 rather than
    # failing on an empty lookup.
    if len(observed) == 0:
        df[col] = 0
        return df

    reference = observed[0]
    # Comparisons against missing values are False, so they become 0 as well.
    is_reference = df[col].eq(reference).fillna(False)
    # to_numpy() keeps the assignment positional, independent of the index.
    df[col] = is_reference.astype('int64').to_numpy()
    return df

# One-hot encode column using sklearn
from sklearn.preprocessing import OneHotEncoder

def one_hot_encode_sklearn(df, col):
    """One-hot encode ``df[col]`` with scikit-learn's ``OneHotEncoder``.

    The source column is removed and one indicator column per category is
    appended, named ``"<col>_<category>"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    col : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        New frame with ``col`` replaced by its indicator columns, carrying
        the same index as ``df``.
    """
    encoder = OneHotEncoder(sparse_output=False)
    # df[[col]] is already two-dimensional (n_rows, 1), as the encoder expects.
    encoded = encoder.fit_transform(df[[col]].values)

    # Reuse df's index: concat(axis=1) aligns on index labels, so a default
    # RangeIndex here would misalign rows whenever df is not indexed 0..n-1.
    encoded_df = pd.DataFrame(
        encoded,
        columns=[f"{col}_{val}" for val in encoder.categories_[0]],
        index=df.index,
    )
    return pd.concat([df.drop(columns=[col]), encoded_df], axis=1)

# Factorize column
def factorize(df, col):
    """Replace the values of ``df[col]`` with integer codes.

    Codes are assigned in order of first appearance: the first distinct
    value becomes 0, the next 1, and so on. The column is overwritten in
    place and the same ``df`` object is returned.
    """
    codes = {}
    for position, label in enumerate(df[col].unique()):
        codes[label] = position
    df[col] = df[col].map(codes)
    return df


In [7]:
# Encode the categorical columns, one transformation per column.
df = (
    df.pipe(binarize, 'financialCurrency')
      .pipe(one_hot_encode_sklearn, 'sector')
      .pipe(one_hot_encode_sklearn, 'recommendationKey')
      .pipe(factorize, 'industry')
)


In [9]:
# Impute missing values by median
def impute_missing_by_median(df):
    """Fill missing values in ``df`` with each column's median.

    Positive and negative infinities are first converted to NaN so they are
    imputed too. Every column that then contains missing values is filled
    with its own median. Columns for which a median cannot be computed
    (for example free-text columns) are left untouched and reported, instead
    of aborting the whole pass.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to impute; modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    df.replace([np.inf, -np.inf], np.nan, inplace=True)

    skipped = []
    for col in df:
        if not df[col].isnull().any():
            continue
        try:
            fill_value = df[col].median()
        except (TypeError, ValueError):
            # No meaningful median for this column (e.g. strings); leave it.
            skipped.append(col)
            continue
        df[col] = df[col].fillna(fill_value)

    if skipped:
        print(f'Columns without a numeric median (left as-is): {skipped}')
    print(f'Missing values after imputation: {int(df.isnull().sum().sum())}')
    return df


In [10]:
# Apply median imputation to the combined train/test frame.
df = df.pipe(impute_missing_by_median)


Missing values after imputation: 0


In [12]:
# Split the combined frame back into its two parts using the origin flag;
# .copy() makes each part independent of df.
train = df.loc[df['is_train'].eq(True)].copy()
test = df.loc[df['is_train'].eq(False)].copy()


In [13]:
# The origin flag has served its purpose; remove it from both parts.
train = train.drop(columns='is_train')
test = test.drop(columns='is_train')


In [18]:
# Identify the target columns.
# `train` and `test` were both sliced from the same combined frame, so by now
# they have identical columns and comparing them always gives an empty list.
# Compare the raw CSV headers instead (nrows=0 reads only the header row),
# keeping only names that still exist after preprocessing.
raw_train_columns = pd.read_csv('/content/train.csv', nrows=0).columns
raw_test_columns = pd.read_csv('/content/test.csv', nrows=0).columns
target_names = [
    col for col in raw_train_columns
    if col not in raw_test_columns and col in train.columns
]

# The test rows only hold imputed placeholders in the target columns; drop them.
test = test.drop(columns=target_names)

# Set the targets aside before removing them from the training features.
targets = train[target_names].copy()

train = train.drop(columns=target_names)


In [19]:
# Score each target independently with 3-fold cross-validated R^2.
model = XGBRegressor()
cross_val_score_results = {}

for target in targets:
    # cross_val_score returns one R^2 value per fold; report their mean.
    fold_scores = cross_val_score(model, train, targets[target], cv=3, scoring='r2')
    score = np.round(np.mean(fold_scores), 2)
    cross_val_score_results[target] = score
    print(f'{target} -> {score}')


In [20]:
# Refit the shared model once per target column and keep its test predictions.
preds = {}
for target, target_values in targets.items():
    preds[target] = model.fit(train, target_values).predict(test)
    print(f'Finished train/predict for: {target}')


In [21]:
# Load the sample submission as the template frame.
sub = pd.read_csv('/content/sample_submission.csv')

# Write one column per predicted target, keyed by target name.
for target, values in preds.items():
    sub[target] = values

# Save the submission without the row index.
sub.to_csv(path_or_buf='submission.csv', index=False)


In [22]:
# Colab-only helper module; this import fails outside Google Colab.
from google.colab import files

# Hand the saved 'submission.csv' to the Colab download helper.
files.download('submission.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [23]:
# Write the submission frame to /content, omitting the row index.
sub.to_csv(path_or_buf='/content/submission.csv', index=False)


In [24]:
# Write the submission frame to /content once more (same path, no row index).
sub.to_csv(path_or_buf='/content/submission.csv', index=False)
