In [3]:
#PERFORM RANDOMFOREST MODEL

import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score



# USING FUNCTION TO HANDLE OUTLIERS
def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return df



# DATASET
train_data = pd.read_csv(r"C:\Users\nh013\Desktop\goodaddy microbusiness dataset\train.csv")
test_data = pd.read_csv(r"C:\Users\nh013\Desktop\goodaddy microbusiness dataset\test.csv")
revealed_test_data = pd.read_csv(r"C:\Users\nh013\Desktop\goodaddy microbusiness dataset\revealed_test.csv")
census_data = pd.read_csv(r"C:\Users\nh013\Desktop\goodaddy microbusiness dataset\census_starter.csv")




# TRAIN DATASET FEATURE
train_df = train_data[['row_id', 'cfips', 'first_day_of_month', 'microbusiness_density']]

# EXTRACT TIME BASED FEATURE 
train_df['year'] = pd.to_datetime(train_df['first_day_of_month']).dt.year
train_df['month'] = pd.to_datetime(train_df['first_day_of_month']).dt.month
train_df['day'] = pd.to_datetime(train_df['first_day_of_month']).dt.day

#NUMERIC COLUMN
train_numeric_columns = train_df.select_dtypes(include='number')

# CONVERT NUMERIC COLUMN TO FLOAT
train_numeric_columns = train_numeric_columns.astype(float)

# CATEGORICAL COLUMN
train_categorical_columns = train_df.select_dtypes(include='object')

# LABEL ENCODING TO CATEGORICAL COLUMN
label_encoder = LabelEncoder()
train_categorical_columns = train_categorical_columns.apply(label_encoder.fit_transform)

# CONCATENATE NUERIC AND LABEL ENCODED CATEGORICAL COLUMN
train_df = pd.concat([train_numeric_columns, train_categorical_columns], axis=1)





# TEST DATASET FEATURE
test_df = test_data[['row_id', 'cfips', 'first_day_of_month']]

# EXTRACT TIME BASED FEATURE
test_df['year'] = pd.to_datetime(test_df['first_day_of_month']).dt.year
test_df['month'] = pd.to_datetime(test_df['first_day_of_month']).dt.month
test_df['day'] = pd.to_datetime(test_df['first_day_of_month']).dt.day

# NUMERIC COLUMN
test_numeric_columns = test_df.select_dtypes(include='number')

# NUMERIC COLUMN TO FLOAT
test_numeric_columns = test_numeric_columns.astype(float)

# CATEGORICAL COLUMN
test_categorical_columns = test_df.select_dtypes(include='object')

# LABEL ENCODING TO CATEGORICAL COLUMN
test_categorical_columns = test_categorical_columns.apply(label_encoder.fit_transform)

#CONCATENETE NUMERIC AND LABEL ENCODED CATEGORCAL COLUMN
test_df = pd.concat([test_numeric_columns, test_categorical_columns], axis=1)





# REVEALED DATASET FEATURE
revealed_df = revealed_test_data[['row_id', 'cfips', 'first_day_of_month', 'microbusiness_density', 'active']]

# TIME BASED FETURE
revealed_df['year'] = pd.to_datetime(revealed_df['first_day_of_month']).dt.year
revealed_df['month'] = pd.to_datetime(revealed_df['first_day_of_month']).dt.month
revealed_df['day'] = pd.to_datetime(revealed_df['first_day_of_month']).dt.day

# NUMERIC COLUMN
revealed_numeric_columns = revealed_df.select_dtypes(include='number')

# NUMERIC COLUMN TO FLOAT
revealed_numeric_columns = revealed_numeric_columns.astype(float)

# CATEGORICAL COLUMN
revealed_categorical_columns = revealed_df.select_dtypes(include='object')

#LABEL ENCODING TO CATEGORICL COLUMN
revealed_categorical_columns = revealed_categorical_columns.apply(label_encoder.fit_transform)

# CONCATENATE NUMERIC AND LABEL ENCODED CATEGORICAL COLUMN
revealed_df = pd.concat([revealed_numeric_columns, revealed_categorical_columns], axis=1)





# CENSUS_STARTER DATASET FEATURE
census_df = census_data[['pct_bb_2017', 'pct_bb_2018', 'pct_bb_2019', 'pct_bb_2020', 'pct_bb_2021', 'cfips',
                         'pct_college_2017', 'pct_college_2018', 'pct_college_2019', 'pct_college_2020',
                         'pct_college_2021', 'pct_foreign_born_2017', 'pct_foreign_born_2018',
                         'pct_foreign_born_2019', 'pct_foreign_born_2020', 'pct_foreign_born_2021',
                         'pct_it_workers_2017', 'pct_it_workers_2018', 'pct_it_workers_2019', 'pct_it_workers_2020',
                         'pct_it_workers_2021', 'median_hh_inc_2017', 'median_hh_inc_2018', 'median_hh_inc_2019',
                         'median_hh_inc_2020', 'median_hh_inc_2021']]

# NUMERIC COLUMN
census_numeric_columns = census_df.select_dtypes(include='number')

# CONVERT NUMERIC COLUMN TO FLOAT
census_numeric_columns = census_numeric_columns.astype(float)

# CATEGORICAL COLUMN
census_categorical_columns = census_df.select_dtypes(include='object')

# LABEL ENCODING TO CATEGORICAL COLUMN
census_categorical_columns = census_categorical_columns.apply(label_encoder.fit_transform)

# CONCATENATE NUMERIC AND LABEL ENCODED CATEGORICAL COLUMN
census_df = pd.concat([census_numeric_columns, census_categorical_columns], axis=1)




# FILL MISSING IN TRAIN DATASET 
train_df.fillna(train_df.mean(), inplace=True)  # Mean for numeric columns
train_df.fillna(train_df.mode().iloc[0], inplace=True)  # Mode for categorical columns



#FILL MISSING IN TEST DATASET
test_df.fillna(test_df.mean(), inplace=True)   # Mean for numeric columns
test_df.fillna(test_df.mode().iloc[0], inplace=True)  # Mode for categorical columns


# FILL MISSING IN REVEALD DATASET
revealed_df.fillna(revealed_df.mean(), inplace=True)  # Mean for numeric columns
revealed_df.fillna(revealed_df.mode().iloc[0], inplace=True)  # Mode for categorical columns



# FILL MISSING IN CENSUS DATASET

census_df.fillna(0, inplace=True)  # Fill missing values with 0 for all columns



# HANDLE OUTLIERS
train_df = handle_outliers(train_df, 'microbusiness_density')

# NORMALIZE AND SCALING 
scaler = MinMaxScaler()
train_df[train_numeric_columns.columns] = scaler.fit_transform(train_df[train_numeric_columns.columns])

 
    
    
    
# SEPARATE  TARGET AND FEATURE  VARIABLE 
X = train_df.drop(['row_id', 'microbusiness_density'], axis=1)
y = train_df['microbusiness_density']



# SPLIT DATA INTO TRAINING AND VALIDATION TEST 
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)



# TRAIN RANDOM FOREST MODEL
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# MAKE PREDICTION ON THE VALIDATION SET
y_pred = model.predict(X_val)

# EVALUATE
mse = mean_squared_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)
print("Mean Squared Error:", mse)
print("R-squared Score:", r2)


#FORECAST NEXT MONTHS MICROBUSINESS DENSITY USING THE TRAINED MODEL l
X_test = test_df.drop('row_id', axis=1)
y_pred_test = model.predict(X_test)



# ADD THE PREDICTED MICRO BUSINESS DENSITY TO THE TEST DATAFRAME 
test_df['predicted_microbusiness_density'] = y_pred_test

print(test_df)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['year'] = pd.to_datetime(train_df['first_day_of_month']).dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['month'] = pd.to_datetime(train_df['first_day_of_month']).dt.month
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  revealed_df['year'] = pd.to_datetime(revealed_df['first

Mean Squared Error: 0.000239048396915313
R-squared Score: 0.9946163235886513
         cfips    year  month  day  row_id  first_day_of_month  \
0       1001.0  2022.0   11.0  1.0      24                   0   
1       1003.0  2022.0   11.0  1.0      32                   0   
2       1005.0  2022.0   11.0  1.0      40                   0   
3       1007.0  2022.0   11.0  1.0      48                   0   
4       1009.0  2022.0   11.0  1.0      56                   0   
...        ...     ...    ...  ...     ...                 ...   
25075  56037.0  2023.0    6.0  1.0   24007                   7   
25076  56039.0  2023.0    6.0  1.0   24015                   7   
25077  56041.0  2023.0    6.0  1.0   24023                   7   
25078  56043.0  2023.0    6.0  1.0   24031                   7   
25079  56045.0  2023.0    6.0  1.0   24039                   7   

       predicted_microbusiness_density  
0                             0.178973  
1                             0.178973  
2      