In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

# ---------------------------------------------------------------------------
# Pipeline: load CSVs -> impute -> label-encode -> fit random forest -> submit
# ---------------------------------------------------------------------------

# Read the training and test tables from the Kaggle input directory.
train_df = pd.read_csv('/kaggle/input/neural-net-nexus-2-0/train.csv')
test_df = pd.read_csv('/kaggle/input/neural-net-nexus-2-0/test.csv')

# 'Revenue' is the target; every other training column is a feature.
# The test features are every test column except 'ID', which is kept in
# test_df for the submission file.
X = train_df.drop(columns=['Revenue'])
y = train_df['Revenue']
X_test = test_df.drop(columns=['ID'])

# Partition the feature columns by dtype, based on the training frame.
num_cols = X.select_dtypes(include=['float64', 'int64']).columns
cat_cols = X.select_dtypes(include=['object']).columns

# Fill missing values: median for numeric columns, most frequent value for
# text columns. Each imputer learns its statistics from the training frame
# only and is then applied unchanged to the test frame.
num_imputer = SimpleImputer(strategy='median')
cat_imputer = SimpleImputer(strategy='most_frequent')
for imputer, cols in ((num_imputer, num_cols), (cat_imputer, cat_cols)):
    X[cols] = imputer.fit_transform(X[cols])
    X_test[cols] = imputer.transform(X_test[cols])

# Turn each text column into integer codes. The single encoder is refit per
# column on the union of train and test values, so every value present in
# either frame has a code when that frame is transformed.
le = LabelEncoder()
for col in cat_cols:
    combined = pd.concat([X[col], X_test[col]]).astype(str)
    le.fit(combined)
    for frame in (X, X_test):
        frame[col] = le.transform(frame[col].astype(str))

# Fit the classifier on the whole training set (no hold-out split here);
# random_state is fixed so repeated runs build the same forest.
model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X, y)

# Score the test rows.
predictions = model.predict(X_test)

# Pair each test 'ID' with its predicted 'Revenue'.
submission = test_df[['ID']].assign(Revenue=predictions)

# Write the submission without the index column and report completion.
submission.to_csv('submission.csv', index=False)
print("submission.csv has been created locally.")

submission.csv has been created locally.
