In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s3e16/sample_submission.csv
/kaggle/input/playground-series-s3e16/train.csv
/kaggle/input/playground-series-s3e16/test.csv


In [2]:
# Load the competition's training and test tables.
train = pd.read_csv("/kaggle/input/playground-series-s3e16/train.csv")
test = pd.read_csv("/kaggle/input/playground-series-s3e16/test.csv")

# X is the same DataFrame object as `train`; popping "Age" removes the target
# column from that frame in place and returns it as y.
X = train
y = X.pop("Age")

# Frequency of each category in the Sex column.
X["Sex"].value_counts()

M    27084
I    23957
F    23010
Name: Sex, dtype: int64

In [3]:
# Import label encoder
from sklearn import preprocessing

# LabelEncoder assigns an integer code to each distinct label it is fitted on.
label_encoder = preprocessing.LabelEncoder()

# Fit the encoder on the training 'Sex' column, then overwrite that column
# with its integer codes. The fitted encoder is applied to the test set in a
# later cell.
label_encoder.fit(X['Sex'])
X['Sex'] = label_encoder.transform(X['Sex'])

# Show the integer codes present after encoding.
X['Sex'].unique()


array([1, 2, 0])

In [8]:
import lightgbm as lgb
from sklearn.metrics import mean_absolute_error as mae
from sklearn.model_selection import KFold, cross_val_score, train_test_split

# Hold out 30% of the rows for a quick baseline check.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Baseline: a LightGBM regressor constructed with no explicit hyperparameters.
reg = lgb.LGBMRegressor()
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

# 5-fold cross-validation on the training portion. The scorer is
# "neg_mean_absolute_error", so the printed mean is a negated MAE.
mae_A = cross_val_score(
    reg,
    X_train,
    y_train,
    scoring="neg_mean_absolute_error",
    cv=5,
)
print("Cross Val Score:", mae_A.mean())

# MAE on the 30% hold-out split.
MAE = mae(y_test, y_pred)
print("Test MAE", MAE)

Cross Val Score: -1.4154125284596912
Test MAE 1.3947136695266296


In [18]:
# The test features have their 'id' column removed before prediction, so the
# training features must not carry it either; otherwise the model is fitted
# on one more column than it is later asked to predict on.
# errors="ignore" keeps this cell safe to re-run once 'id' is already gone.
X = X.drop(columns="id", errors="ignore")
X.head()

Unnamed: 0,Sex,Length,Diameter,Height,Weight,Shucked Weight,Viscera Weight,Shell Weight
0,1,1.525,1.175,0.375,28.973189,12.728926,6.647958,8.348928
1,1,1.1,0.825,0.275,10.418441,4.521745,2.324659,3.40194
2,2,1.3875,1.1125,0.375,24.777463,11.3398,5.556502,6.662133
3,0,1.7,1.4125,0.5,50.660556,20.354941,10.991839,14.996885
4,1,1.25,1.0125,0.3375,23.289114,11.977664,4.50757,5.953395


In [19]:
# Sanity check: the feature frame and the target should have the same number of rows.
X.shape, y.shape

((74051, 8), (74051,))

In [22]:
# Test feature frame: every column of `test` except 'id'.
test_X = test.drop(columns='id')

# Encode 'Sex' with the encoder that was fitted on the training column, so
# both frames share the same label-to-integer mapping.
encoded_sex = label_encoder.transform(test_X['Sex'])
test_X['Sex'] = encoded_sex

# Show the integer codes present in the test set.
test_X['Sex'].unique()


array([1, 0, 2])

In [24]:
from sklearn.metrics import mean_absolute_error

seed = 6789

# The same estimator object is passed to fit() on every fold.
model = lgb.LGBMRegressor(random_state=seed)

# 10 shuffled folds; the seed fixes the shuffle.
cv = KFold(n_splits=10, shuffle=True, random_state=seed)
splits = cv.split(X, y)

# Per-fold accumulators: MAE scores, in-fold / out-of-fold predictions, and
# predictions on the competition test set.
cv_scores_train, cv_scores_test = [], []
train_predictions, test_predictions, predictions = [], [], []

banner = "=" * 30
for fold_no, (train_idx, test_idx) in enumerate(splits, start=1):
    print(banner, f"FOLD: {fold_no:2d}", banner)

    # Positional row selection for this fold.
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    lgbm = model.fit(X_train, y_train)

    # Predict on the fold's training rows, its held-out rows, and test_X.
    y_pred_train = lgbm.predict(X_train)
    y_pred_test = lgbm.predict(X_test)
    y_pred = lgbm.predict(test_X)

    train_predictions.append(y_pred_train)
    test_predictions.append(y_pred_test)
    predictions.append(y_pred)

    # Mean absolute error on the in-fold and held-out rows.
    score_train = mean_absolute_error(y_train, y_pred_train)
    score_test = mean_absolute_error(y_test, y_pred_test)
    cv_scores_train.append(score_train)
    cv_scores_test.append(score_test)

    print(f"MODELO : {lgbm.__class__.__name__} : TRAIN: {score_train:.5f} TEST: {score_test:.5f}")

print("END")

MODELO : LGBMRegressor : TRAIN: 1.35379 TEST: 1.40467
MODELO : LGBMRegressor : TRAIN: 1.35508 TEST: 1.40273
MODELO : LGBMRegressor : TRAIN: 1.35668 TEST: 1.39925
MODELO : LGBMRegressor : TRAIN: 1.35779 TEST: 1.40086
MODELO : LGBMRegressor : TRAIN: 1.35727 TEST: 1.40308
MODELO : LGBMRegressor : TRAIN: 1.35612 TEST: 1.39801
MODELO : LGBMRegressor : TRAIN: 1.35600 TEST: 1.39670
MODELO : LGBMRegressor : TRAIN: 1.35359 TEST: 1.42429
MODELO : LGBMRegressor : TRAIN: 1.35743 TEST: 1.39568
MODELO : LGBMRegressor : TRAIN: 1.35736 TEST: 1.39648
END


In [25]:
# Show the first five in-fold and held-out predictions of every fold.
paired_predictions = zip(train_predictions, test_predictions)
for fold_no, (train_preds, test_preds) in enumerate(paired_predictions, start=1):
    print(f"Fold {fold_no} - TRAIN: {train_preds[:5]}")
    print(f"Fold {fold_no} - TEST : {test_preds[:5]}")
    print()

Fold 1 - TRAIN: [10.62204708  9.32150611 13.66043274  8.95957488 11.91947377]
Fold 1 - TEST : [ 7.95186253 10.06652584  8.3495851   9.36414855 11.26478611]

Fold 2 - TRAIN: [10.58220807  7.86791208  9.26456162 13.87450987  8.88758621]
Fold 2 - TEST : [ 6.82693744 11.08276168 11.84082031 10.10788883 11.37235712]

Fold 3 - TRAIN: [10.55743387  7.83656176  9.30679273 13.98174467  8.87132335]
Fold 3 - TEST : [11.98973601 11.18487212 10.29619876 14.54476823  5.02629957]

Fold 4 - TRAIN: [10.61293293  7.86058119  9.34465542  8.87159091 10.0133705 ]
Fold 4 - TEST : [13.99583321  7.95926332  5.13154611 10.18899358 16.05421159]

Fold 5 - TRAIN: [ 7.86520197  9.31762964 13.3182348   8.90437063 10.00532194]
Fold 5 - TEST : [10.65387646 11.41040703  7.79135567 10.4781177   9.24827135]

Fold 6 - TRAIN: [10.53466791  7.8986331   9.27011848 13.83637137  8.83825968]
Fold 6 - TEST : [ 9.06388118 10.93320579  4.3658151   8.16456207 10.80193959]

Fold 7 - TRAIN: [10.54313212  7.89469523  9.30487366 13.89

In [27]:
# Element-wise mean of the per-fold test-set predictions (axis 0 runs over folds).
predictions_average = np.asarray(predictions).mean(axis=0)

print(predictions_average[:5])

[ 7.53580299  7.77296664 10.79250904  9.5503505   7.52336249]


In [28]:
# Load the sample submission file; its 'Age' column is overwritten with model predictions in a later cell.
submission = pd.read_csv('/kaggle/input/playground-series-s3e16/sample_submission.csv')

In [29]:
# Round the fold-averaged predictions to whole numbers and store them as integers.
submission['Age'] = np.round(predictions_average, 0).astype(int)
submission

Unnamed: 0,id,Age
0,74051,8
1,74052,8
2,74053,11
3,74054,10
4,74055,8
...,...,...
49363,123414,9
49364,123415,8
49365,123416,13
49366,123417,10


In [30]:
# Write the submission to the current working directory, omitting the DataFrame index.
submission.to_csv('crab_submission.csv', index=False)