In [4]:
import pandas as pd
import csv
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.preprocessing import OrdinalEncoder, RobustScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import RandomForestRegressor

In [5]:
# Read training data from the csv file provided
train_original = pd.read_csv('atlantis_citizens_final.csv')
print(train_original.shape)
print(f"Original data read from the file:\n{train_original}\n")

# No need of Citizen_ID and Bio_Hash for training the model
train = train_original.drop(['Citizen_ID','Bio_Hash'],axis=1)

(15751, 10)
Original data read from the file:
      Citizen_ID       Diet_Type    District_Name Occupation  Wealth_Index  \
0      CIT_15935  Exotic Imports      Coral Slums     Scribe        1491.0   
1      CIT_11623         Seafood      Coral Slums     Fisher        1596.0   
2       CIT_8026         Seafood    Mariana Plaza    Warrior        3921.0   
3       CIT_0492  Exotic Imports      Deep Trench     Fisher           NaN   
4       CIT_0275         Seaweed      Deep Trench    Warrior       25985.0   
...          ...             ...              ...        ...           ...   
15746  CIT_10602         Seaweed    Mariana Plaza   Merchant        2896.0   
15747   CIT_0865         Seaweed      Coral Slums      Miner        1671.0   
15748   CIT_0809  Exotic Imports    Mariana Plaza     Scribe        4656.0   
15749  CIT_10750         Seafood  The Golden Reef   Merchant       17529.0   
15750   CIT_6381         Seaweed      Coral Slums    Warrior        1439.0   

       House_Size

In [6]:
# Encode strings to numbers, imputer works on only numeric data
encoder = OrdinalEncoder()
train_encoded = encoder.fit_transform(train)

# Impute missing values

# Initialize the IterativeImputer with a base estimator - RandomForestRegressor
imputer = IterativeImputer(estimator=RandomForestRegressor(), max_iter=5, random_state=0)

imputed_data = imputer.fit_transform(train_encoded)

# Decode back to original strings
train = pd.DataFrame(encoder.inverse_transform(imputed_data), columns=train.columns)

# Save the imputed values into csv file for manual comparison
train.to_csv("imputed.csv", index=False, quoting=csv.QUOTE_NONE)

print(f"After imputing:\n{train}\n")

After imputing:
            Diet_Type    District_Name Occupation Wealth_Index  \
0      Exotic Imports      Coral Slums     Scribe       1491.0   
1             Seafood      Coral Slums     Fisher       1596.0   
2             Seafood    Mariana Plaza    Warrior       3921.0   
3      Exotic Imports      Deep Trench     Fisher       1481.0   
4             Seaweed      Deep Trench    Warrior      25985.0   
...               ...              ...        ...          ...   
15746         Seaweed    Mariana Plaza   Merchant       2896.0   
15747         Seaweed      Coral Slums      Miner       1671.0   
15748  Exotic Imports    Mariana Plaza     Scribe       4656.0   
15749         Seafood  The Golden Reef   Merchant      17529.0   
15750         Seaweed      Coral Slums    Warrior       1439.0   

      House_Size_sq_ft Life_Expectancy Vehicle_Owned    Work_District  
0                100.0            42.0   Fin Bicycle    Mariana Plaza  
1                100.0            49.0   Sea Sc



In [7]:
# Fix outliers and scale the numeric values
scaler = RobustScaler()

# The scaler expects a 2D array, so we reshape the column
# Save the scaled data in new columns
train['Wealth_Index_Scaled'] = scaler.fit_transform(train[['Wealth_Index']])
train['House_Size_sq_ft_Scaled'] = scaler.fit_transform(train[['House_Size_sq_ft']])
train['Life_Expectancy_Scaled'] = scaler.fit_transform(train[['Life_Expectancy']])

print(train.shape)
print(f"After scaling outliers:\n{train}\n")

(15751, 11)
After scaling outliers:
            Diet_Type    District_Name Occupation Wealth_Index  \
0      Exotic Imports      Coral Slums     Scribe       1491.0   
1             Seafood      Coral Slums     Fisher       1596.0   
2             Seafood    Mariana Plaza    Warrior       3921.0   
3      Exotic Imports      Deep Trench     Fisher       1481.0   
4             Seaweed      Deep Trench    Warrior      25985.0   
...               ...              ...        ...          ...   
15746         Seaweed    Mariana Plaza   Merchant       2896.0   
15747         Seaweed      Coral Slums      Miner       1671.0   
15748  Exotic Imports    Mariana Plaza     Scribe       4656.0   
15749         Seafood  The Golden Reef   Merchant      17529.0   
15750         Seaweed      Coral Slums    Warrior       1439.0   

      House_Size_sq_ft Life_Expectancy Vehicle_Owned    Work_District  \
0                100.0            42.0   Fin Bicycle    Mariana Plaza   
1                100.0   

In [8]:
# Set types on the columns. Inverse transform after imputation is resetting dtype
train['Diet_Type'] = train['Diet_Type'].astype('category')
train['District_Name'] = train['District_Name'].astype('category')
train['Occupation'] = train['Occupation'].astype('category')
train['Vehicle_Owned'] = train['Vehicle_Owned'].astype('category')
train['Work_District'] = train['Work_District'].astype('category')

train['Wealth_Index'] = train['Wealth_Index'].astype('float')
train['House_Size_sq_ft'] = train['House_Size_sq_ft'].astype('float')
train['Life_Expectancy'] = train['Life_Expectancy'].astype('float')

# Do we need this Commute_Out? TO REMOVE LATER
train['Commute_Out'] = (train['District_Name'] != train['Work_District']).astype('category')

print(train.shape)
print(f"After setting types:\n{train}\n")

(15751, 12)
After setting types:
            Diet_Type    District_Name Occupation  Wealth_Index  \
0      Exotic Imports      Coral Slums     Scribe        1491.0   
1             Seafood      Coral Slums     Fisher        1596.0   
2             Seafood    Mariana Plaza    Warrior        3921.0   
3      Exotic Imports      Deep Trench     Fisher        1481.0   
4             Seaweed      Deep Trench    Warrior       25985.0   
...               ...              ...        ...           ...   
15746         Seaweed    Mariana Plaza   Merchant        2896.0   
15747         Seaweed      Coral Slums      Miner        1671.0   
15748  Exotic Imports    Mariana Plaza     Scribe        4656.0   
15749         Seafood  The Golden Reef   Merchant       17529.0   
15750         Seaweed      Coral Slums    Warrior        1439.0   

       House_Size_sq_ft  Life_Expectancy Vehicle_Owned    Work_District  \
0                 100.0             42.0   Fin Bicycle    Mariana Plaza   
1           

In [9]:
# Split features and target columns in the training data

X = train[['Diet_Type','District_Name','Wealth_Index_Scaled','House_Size_sq_ft_Scaled','Life_Expectancy_Scaled','Vehicle_Owned']]

occupation_mapping = {
    "Warrior": 0,
    "Merchant": 1,
    "Fisher": 2,
    "Miner": 3,
    "Scribe": 4
}

y = train["Occupation"].map(occupation_mapping)

# Split the original training data into train and test data to compare different pre-processing and models

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42 # Set a random state for reproducibility
)

In [10]:
# XGBoost is well known for it's performance, particularly for multi-class classification

model = XGBClassifier(
    objective='multi:softmax',
    n_estimators=100,
    learning_rate=0.1,
    eval_metric='logloss',
    use_label_encoder=False, # Set to False to avoid a warning/future deprecation
    enable_categorical=True,
    tree_method="hist"
)

# First train the model with train & test split data
model.fit(X_train, y_train)

# Make Predictions and Evaluate
y_pred = model.predict(X_test)
y_prob = model.predict_proba(X_test) # Get predicted probabilities

print(f"Accuracy: {accuracy_score(y_test, y_pred):.2f}\n")
print(f"F1 Score: {f1_score(y_test, y_pred, average='macro'):.2f}\n")
print("Classification Report:\n", classification_report(y_test, y_pred))

# Display the first few predictions
print("\nPredicted class labels (first 5):", y_pred[:5])
print("Actual class labels (first 5):  ", y_test[:5])
print("\nPredicted probabilities for first sample:\n", y_prob[0])

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.61

F1 Score: 0.60

Classification Report:
               precision    recall  f1-score   support

           0       0.65      0.72      0.68       710
           1       0.73      0.81      0.77       740
           2       0.48      0.39      0.43       606
           3       0.52      0.52      0.52       609
           4       0.63      0.56      0.59       486

    accuracy                           0.61      3151
   macro avg       0.60      0.60      0.60      3151
weighted avg       0.61      0.61      0.61      3151


Predicted class labels (first 5): [0 1 4 4 3]
Actual class labels (first 5):   11819    3
1210     1
9466     4
4337     4
12467    2
Name: Occupation, dtype: category
Categories (5, int64): [2, 1, 3, 4, 0]

Predicted probabilities for first sample:
 [0.3514245  0.03024967 0.32913592 0.21162443 0.07756551]


In [11]:
# Now train the model with complete data from original train data

X_train = X
y_train = y

model.fit(X_train, y_train)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0,1,2
,"objective  objective: str | xgboost.sklearn._SklObjWProto | typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]] | None Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'multi:softmax'
,"base_score  base_score: float | typing.List[float] | None The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.List[xgboost.callback.TrainingCallback] | None List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: float | None Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: float | None Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: float | None Subsample ratio of columns when constructing each tree.,
,"device  device: str | None .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",
,"early_stopping_rounds  early_stopping_rounds: int | None .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,True


In [12]:
# Read test file

test_original = pd.read_csv('test_atlantis_hidden.csv')
print(test_original.shape)
print(f"Original test data:\n{test_original}\n")

# No need of Citizen_ID and Bio_Hash for prediction
test= test_original.drop(['Citizen_ID','Bio_Hash'],axis=1)

(3938, 9)
Original test data:
     Citizen_ID       Diet_Type    District_Name  Wealth_Index  \
0     CIT_15383         Seaweed    Mariana Plaza          2851   
1     CIT_14830         Seaweed    Mariana Plaza          5176   
2     CIT_17388  Exotic Imports      Deep Trench          3772   
3     CIT_17438         Seaweed      Deep Trench          1288   
4     CIT_16735         Seafood      Deep Trench          1736   
...         ...             ...              ...           ...   
3933  CIT_15659  Exotic Imports    Mariana Plaza          3537   
3934  CIT_16061         Seafood      Coral Slums          1769   
3935  CIT_17913         Seaweed  The Golden Reef          7972   
3936  CIT_17666  Exotic Imports      Coral Slums          1904   
3937   CIT_1623         Seaweed  The Golden Reef          9441   

      House_Size_sq_ft  Life_Expectancy Vehicle_Owned    Work_District  \
0                  453               53   Sea Scooter      Coral Slums   
1                  815       

In [13]:
# Test data pre-processing

# Encode strings to numbers, imputer works on only numeric data
encoder = OrdinalEncoder()
test_encoded = encoder.fit_transform(test)

# Impute missing values

# Initialize the IterativeImputer with a base estimator - RandomForestRegressor
imputer = IterativeImputer(estimator=RandomForestRegressor(), max_iter=5, random_state=0)

imputed_data = imputer.fit_transform(test_encoded)

# Decode back to original strings
test = pd.DataFrame(encoder.inverse_transform(imputed_data), columns=test.columns)

print(f"After imputing:\n{test}\n")

After imputing:
           Diet_Type    District_Name Wealth_Index House_Size_sq_ft  \
0            Seaweed    Mariana Plaza         2851              453   
1            Seaweed    Mariana Plaza         5176              815   
2     Exotic Imports      Deep Trench         3772              390   
3            Seaweed      Deep Trench         1288              131   
4            Seafood      Deep Trench         1736              182   
...              ...              ...          ...              ...   
3933  Exotic Imports    Mariana Plaza         3537              572   
3934         Seafood      Coral Slums         1769              116   
3935         Seaweed  The Golden Reef         7972             1689   
3936  Exotic Imports      Coral Slums         1904              138   
3937         Seaweed  The Golden Reef         9441             1974   

     Life_Expectancy Vehicle_Owned    Work_District  
0                 53   Sea Scooter      Coral Slums  
1                 58   

In [14]:
# Fix outliers and scale the numeric values
scaler = RobustScaler()

# The scaler expects a 2D array, so we reshape the column
# Save the scaled data in new columns
test['Wealth_Index_Scaled'] = scaler.fit_transform(test[['Wealth_Index']])
test['House_Size_sq_ft_Scaled'] = scaler.fit_transform(test[['House_Size_sq_ft']])
test['Life_Expectancy_Scaled'] = scaler.fit_transform(test[['Life_Expectancy']])

print(test.shape)
print(f"After scaling outliers:\n{test}\n")

(3938, 10)
After scaling outliers:
           Diet_Type    District_Name Wealth_Index House_Size_sq_ft  \
0            Seaweed    Mariana Plaza         2851              453   
1            Seaweed    Mariana Plaza         5176              815   
2     Exotic Imports      Deep Trench         3772              390   
3            Seaweed      Deep Trench         1288              131   
4            Seafood      Deep Trench         1736              182   
...              ...              ...          ...              ...   
3933  Exotic Imports    Mariana Plaza         3537              572   
3934         Seafood      Coral Slums         1769              116   
3935         Seaweed  The Golden Reef         7972             1689   
3936  Exotic Imports      Coral Slums         1904              138   
3937         Seaweed  The Golden Reef         9441             1974   

     Life_Expectancy Vehicle_Owned    Work_District  Wealth_Index_Scaled  \
0                 53   Sea Scooter  

In [15]:
# Set types on the columns. Inverse transform after imputation is resetting dtype
test['Diet_Type'] = test['Diet_Type'].astype('category')
test['District_Name'] = test['District_Name'].astype('category')
test['Vehicle_Owned'] = test['Vehicle_Owned'].astype('category')
test['Work_District'] = test['Work_District'].astype('category')

test['Wealth_Index'] = test['Wealth_Index'].astype('float')
test['House_Size_sq_ft'] = test['House_Size_sq_ft'].astype('float')
test['Life_Expectancy'] = test['Life_Expectancy'].astype('float')

# Do we need this Commute_Out? TO REMOVE LATER
test['Commute_Out'] = (test['District_Name'] != test['Work_District']).astype('category')

print(test.shape)
print(f"After setting types:\n{test}\n")

# During prediction, test data should use same columns used for model training

X_test = test[['Diet_Type','District_Name','Wealth_Index_Scaled','House_Size_sq_ft_Scaled','Life_Expectancy_Scaled','Vehicle_Owned']]

(3938, 11)
After setting types:
           Diet_Type    District_Name  Wealth_Index  House_Size_sq_ft  \
0            Seaweed    Mariana Plaza        2851.0             453.0   
1            Seaweed    Mariana Plaza        5176.0             815.0   
2     Exotic Imports      Deep Trench        3772.0             390.0   
3            Seaweed      Deep Trench        1288.0             131.0   
4            Seafood      Deep Trench        1736.0             182.0   
...              ...              ...           ...               ...   
3933  Exotic Imports    Mariana Plaza        3537.0             572.0   
3934         Seafood      Coral Slums        1769.0             116.0   
3935         Seaweed  The Golden Reef        7972.0            1689.0   
3936  Exotic Imports      Coral Slums        1904.0             138.0   
3937         Seaweed  The Golden Reef        9441.0            1974.0   

      Life_Expectancy Vehicle_Owned    Work_District  Wealth_Index_Scaled  \
0             

x_test['Diet_Type'] = le.fit_transform(x_test ['Diet_Type'])
x_test['District_Name'] = le.fit_transform(x_test['District_Name'])
x_test['Vehicle_Owned'] = le.fit_transform(x_test['Vehicle_Owned'])

In [16]:
y_pred = model.predict(X_test)

print("\nPredicted values:\n", y_pred)


Predicted values:
 [4 4 0 ... 1 2 1]


In [17]:
y_prob = model.predict_proba(X_test)

print("\nPredicted probabilities:\n", y_prob)


Predicted probabilities:
 [[0.01743744 0.07232051 0.12697394 0.1291025  0.6541656 ]
 [0.00449384 0.09714023 0.03228898 0.05469956 0.8113774 ]
 [0.81045216 0.00121782 0.14604558 0.04117319 0.00111125]
 ...
 [0.00153642 0.81654966 0.00269532 0.00457268 0.17464595]
 [0.1047172  0.00146507 0.50516254 0.38623407 0.00242116]
 [0.00272716 0.80190146 0.00614542 0.02588637 0.16333948]]


In [18]:
submission = pd.DataFrame({"Citizen_ID":test_original['Citizen_ID'], "Occupation":y_pred})

print(submission)

     Citizen_ID  Occupation
0     CIT_15383           4
1     CIT_14830           4
2     CIT_17388           0
3     CIT_17438           2
4     CIT_16735           2
...         ...         ...
3933  CIT_15659           4
3934  CIT_16061           3
3935  CIT_17913           1
3936  CIT_17666           2
3937   CIT_1623           1

[3938 rows x 2 columns]


In [20]:
submission.to_csv("submission6.csv", index=False, quoting=csv.QUOTE_NONE)