In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Constants: maps each known entity name to the unit string that is attached
# to its formatted prediction. Entries are grouped by the unit they share.
entity_unit_map = {
    # Linear dimensions
    **dict.fromkeys(('width', 'depth', 'height'), 'centimetre'),
    # Mass
    **dict.fromkeys(('item_weight', 'maximum_weight_recommendation'), 'kilogram'),
    # Electrical
    'voltage': 'volt',
    'wattage': 'watt',
    # Capacity
    'item_volume': 'litre',
}

# Step 1: Load train and test data
# The CSVs live under the Google Drive mount point, so the drive-mount cell
# has to be executed before this one on a fresh Colab runtime.
DATA_DIR = '/content/drive/MyDrive/Colab Notebooks'
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

# Step 2: Extract numeric value from 'entity_value' column
# The pattern keeps the first integer or decimal number found in the text.
# expand=False makes str.extract return a Series (not a one-column DataFrame),
# which is the right shape for assigning to a single column.
# NOTE(review): the unit text in entity_value is discarded here, so the model
# is trained on raw numbers regardless of their unit -- confirm the training
# values are already expressed in the units listed in entity_unit_map.
train_df['numeric_value'] = (
    train_df['entity_value']
    .str.extract(r'(\d+\.\d+|\d+)', expand=False)
    .astype(float)
)

# Rows whose entity_value contains no number produce NaN, and the regressor
# refuses NaN targets, so only rows with a usable target are used for training.
train_valid = train_df.dropna(subset=['numeric_value']).reset_index(drop=True)

# Step 3: Prepare OneHotEncoder for 'group_id' and 'entity_name'
CATEGORICAL_COLS = ['group_id', 'entity_name']
try:
    # scikit-learn >= 1.2 spells the dense-output switch `sparse_output`;
    # the old `sparse` keyword was removed in 1.4.
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    # Older scikit-learn releases only accept the legacy `sparse` keyword.
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit on the training categories, then reuse the same encoding for the test
# set; categories unseen during fit are encoded as all zeros.
X_train_cat = ohe.fit_transform(train_valid[CATEGORICAL_COLS])
X_test_cat = ohe.transform(test_df[CATEGORICAL_COLS])

# Step 4: Create DataFrame for categorical features
feature_names = ohe.get_feature_names_out()
X_train = pd.DataFrame(X_train_cat, columns=feature_names)
X_test = pd.DataFrame(X_test_cat, columns=feature_names)

# Step 5: Define the target variable
y_train = train_valid['numeric_value']

# Step 6: Use a simple DecisionTreeRegressor for faster training
# random_state is fixed so repeated runs give the same tree.
model = DecisionTreeRegressor(random_state=42)
model.fit(X_train, y_train)

# Step 7: Predict numeric values for the test set
pred_numeric_values = model.predict(X_test)

# Step 8: Standardize the unit for each entity type
def standardize_unit(entity_name):
    """
    Look up the unit string registered for ``entity_name``.

    Returns the value stored in ``entity_unit_map`` for that name, or the
    placeholder ``'unit'`` when the name has no entry in the map.
    """
    if entity_name in entity_unit_map:
        return entity_unit_map[entity_name]
    # Unknown entity: fall back to the generic placeholder.
    return 'unit'

# Step 9: Format predictions
def format_prediction(value, entity_name):
    """
    Build the submission string for one prediction.

    The number is rendered with exactly two decimal places and followed by a
    single space and the unit returned by ``standardize_unit(entity_name)``.
    """
    # Resolve the unit first, then render the number, then join the two parts.
    unit_label = standardize_unit(entity_name)
    amount = format(value, ".2f")
    return f"{amount} {unit_label}"

# Pair every predicted number with the entity name of its test row and turn
# each pair into the "<number> <unit>" submission string.
test_df['prediction'] = list(
    map(format_prediction, pred_numeric_values, test_df['entity_name'])
)

# Step 10: Create a CSV file for submission
# Only the row identifier and the formatted prediction are written out.
submission_df = test_df.loc[:, ['index', 'prediction']]
submission_df.to_csv('submission.csv', index=False)

print("Submission file saved as submission.csv")




Submission file saved as submission.csv


In [1]:
# Mount Google Drive at /content/drive.
# The data-loading cell reads train.csv and test.csv from under this mount
# point, so this cell has to be run first on a fresh Colab runtime even though
# it appears later in the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive
