In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error

In [None]:
# Directory that holds the competition CSV files. Every read below goes through
# this one constant, so pointing the notebook at another copy of the data is a
# single-line change.
DATA_DIR = '/kaggle/input/godaddy-microbusiness-density-forecasting'

# Load each competition file into its own DataFrame.
df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test.csv')
df_sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
df_revealed_test = pd.read_csv(f'{DATA_DIR}/revealed_test.csv')
df_census_starter = pd.read_csv(f'{DATA_DIR}/census_starter.csv')

In [None]:
# Show the training frame exactly as loaded from train.csv (no preprocessing yet).
df_train

In [None]:
# Show the test frame exactly as loaded from test.csv.
df_test

In [None]:
# Show the sample submission frame as loaded from sample_submission.csv.
df_sample_submission

In [None]:
# Show the revealed test frame as loaded from revealed_test.csv.
df_revealed_test

In [None]:
# Show the census starter frame as loaded from census_starter.csv.
df_census_starter

In [None]:
# Print a structural summary of df_train (columns, dtypes, non-null counts).
df_train.info()

In [None]:
# Remove the columns that are not fed to the models.
# The name is rebound instead of mutating in place, and columns that are already
# gone are ignored, so re-running this cell no longer raises KeyError.
df_train = df_train.drop(columns=['row_id', 'county', 'state', 'active'], errors='ignore')

In [None]:
# Show df_train again to check the result of the column drop.
df_train

In [None]:
# Parse the date strings once and reuse the parsed series for every derived column.
train_month_start = pd.to_datetime(df_train['first_day_of_month'])

# Replace the date column with its numeric year / month / day components.
df_train = (
    df_train
    .assign(
        year=train_month_start.dt.year,
        month=train_month_start.dt.month,
        day=train_month_start.dt.day,
    )
    .drop(columns='first_day_of_month')
)

In [None]:
# Show df_train with the date column replaced by year / month / day.
df_train

In [None]:
# Annotated heatmap of the correlation matrix of df_train's columns,
# drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(df_train.corr(), annot=True, cmap="magma_r", ax=ax)
ax.set_title("Correlation Heatmap", fontdict={"fontweight": "bold"})
plt.show()

In [None]:
# Training features: every column except the target, as a NumPy array.
X_train = df_train.drop(columns='microbusiness_density').to_numpy()

In [None]:
# Training target as a NumPy array.
y_train = df_train['microbusiness_density'].to_numpy()

In [None]:
# Check the dimensions of the training feature matrix (rows, feature columns).
X_train.shape

In [None]:
# Check the length of the training target; it should match the row count of X_train.
y_train.shape

In [None]:
# Peek at the first rows of the revealed test frame before preprocessing it.
df_revealed_test.head()

In [None]:
# Remove the same non-feature columns that were removed from the training frame.
# Rebinding the name and ignoring columns that are already gone lets this cell
# be re-run without raising KeyError.
df_revealed_test = df_revealed_test.drop(
    columns=['row_id', 'county', 'state', 'active'], errors='ignore'
)

In [None]:
# Parse the date strings once and reuse the parsed series for every derived column.
revealed_month_start = pd.to_datetime(df_revealed_test['first_day_of_month'])

# Replace the date column with its numeric year / month / day components.
df_revealed_test = (
    df_revealed_test
    .assign(
        year=revealed_month_start.dt.year,
        month=revealed_month_start.dt.month,
        day=revealed_month_start.dt.day,
    )
    .drop(columns='first_day_of_month')
)

In [None]:
# Show df_revealed_test with the date column replaced by year / month / day.
df_revealed_test

In [None]:
# Hold-out features from the revealed test frame: every column except the target.
X_test = df_revealed_test.drop(columns='microbusiness_density').to_numpy()

In [None]:
# Hold-out target from the revealed test frame.
y_test = df_revealed_test['microbusiness_density'].to_numpy()

In [None]:
# Candidate regressors, each paired with the hyper-parameter grid to search.
# An empty grid means the estimator is evaluated once with its default settings.
# The randomized ensembles get a fixed random_state so that repeated runs of the
# notebook produce the same scores.
models = {
    'Linear Regression': (LinearRegression(), {}),
    'Ridge': (Ridge(), {'alpha': [0.001, 0.01, 0.1, 1, 10]}),
    'Lasso': (Lasso(), {'alpha': [0.001, 0.01, 0.1, 1, 10]}),
    'Random Forest': (
        RandomForestRegressor(random_state=42),
        {'n_estimators': [5, 10, 50, 100, 200]},
    ),
    # SVR is currently switched off; the reason is not recorded in this notebook.
#     'Support Vector Machine': (SVR(), {'C': [0.1, 1, 10]}),
    'K-Nearest Neighbors': (KNeighborsRegressor(), {'n_neighbors': [3, 5, 7]}),
    'XGBoost': (
        XGBRegressor(random_state=42),
        {'max_depth': [3, 6, 9], 'learning_rate': [0.1, 0.01, 0.001]},
    ),
}

In [None]:
# Tune every candidate with a 3-fold cross-validated grid search on the training
# arrays, then score the best estimator on the revealed test arrays.
# The metrics are also collected so the candidates can be compared in one table
# instead of being read back from the printed log.
model_results = []

for model_name, (model, param_grid) in models.items():
    print(f"Training {model_name}...")
    grid_search = GridSearchCV(model, param_grid, scoring='neg_mean_absolute_error', cv=3)
    grid_search.fit(X_train, y_train)

    # Get the best model and its performance on the test set.
    # With GridSearchCV's default refit=True, best_estimator_ has already been
    # refitted on all of X_train using the best parameters.
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)
    print(f"Best Parameters for {model_name}: {grid_search.best_params_}")
    print(f"Mean Absolute Error for {model_name}: {mae}\n")

    model_results.append({
        'model': model_name,
        'best_params': grid_search.best_params_,
        # The scorer is the negated MAE, so flip the sign to report a plain error.
        'cv_mae': -grid_search.best_score_,
        'test_mae': mae,
    })

# Comparison table, lowest hold-out MAE first.
pd.DataFrame(model_results).sort_values('test_mae').reset_index(drop=True)

In [None]:
# Model used to produce the submission.
# NOTE(review): n_estimators=5 is hard-coded; presumably it was copied from the
# grid-search output — confirm it still matches the best parameters.
# random_state is fixed so the fitted forest, and therefore the submission file,
# is the same on every run.
model = RandomForestRegressor(n_estimators=5, random_state=42)

In [None]:
# Fit the final model on the training features and target.
model.fit(X_train,y_train)

In [None]:
# Show the test frame again before preparing it for prediction.
df_test

In [None]:
# row_id is not a model input, so remove it from the test frame.
# Rebinding the name and ignoring an already-removed column lets this cell be
# re-run without raising KeyError.
df_test = df_test.drop(columns=['row_id'], errors='ignore')

In [None]:
# Parse the date strings once and reuse the parsed series for every derived column.
test_month_start = pd.to_datetime(df_test['first_day_of_month'])

# Replace the date column with its numeric year / month / day components.
df_test = (
    df_test
    .assign(
        year=test_month_start.dt.year,
        month=test_month_start.dt.month,
        day=test_month_start.dt.day,
    )
    .drop(columns='first_day_of_month')
)

In [None]:
# Feature matrix for the rows that will be submitted.
# Note: this rebinds X_test, which until now held the revealed-test features
# that pair with y_test.
X_test = df_test.to_numpy()

In [None]:
# Predict with the fitted final model for every row of the current X_test.
y_pred = model.predict(X_test)

In [None]:
# Display the predicted values.
y_pred

In [None]:
# Build the submission file from the sample submission and the predictions.
solution = pd.DataFrame(y_pred, columns=["y"])

# Predictions are matched to the sample rows purely by position, so a length
# mismatch would attach predictions to the wrong rows or leave gaps.
if len(solution) != len(df_sample_submission):
    raise ValueError(
        f"Got {len(solution)} predictions for "
        f"{len(df_sample_submission)} submission rows"
    )

# Keep every sample-submission column except the placeholder target, then add
# the predicted target as the last column.
df = (
    df_sample_submission
    .drop(columns='microbusiness_density')
    .assign(microbusiness_density=solution['y'].to_numpy())
)

# index=False: otherwise the DataFrame index is written as an extra unnamed
# first column that is not one of the sample submission's columns.
df.to_csv('solution.csv', header=True, index=False)