Importing the libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix



              Common.Name       Date   Time  n.observers     County  Sub.cell  \
0              Asian Koel  7/16/2015  16:30          2.0  Alappuzha  [51,2,2]   
1  Black-rumped Flameback  7/16/2015  16:30          2.0  Alappuzha  [51,2,2]   
2            Black Drongo  7/16/2015  16:30          2.0  Alappuzha  [51,2,2]   
3           Brahminy Kite  7/16/2015  16:30          2.0  Alappuzha  [51,2,2]   
4             Common Myna  7/16/2015  16:30          2.0  Alappuzha  [51,2,2]   

  Season  DEM       Cell.ID List.ID  
0    Wet  5.0  [76.28,9.84]  List.1  
1    Wet  5.0  [76.28,9.84]  List.1  
2    Wet  5.0  [76.28,9.84]  List.1  
3    Wet  5.0  [76.28,9.84]  List.1  
4    Wet  5.0  [76.28,9.84]  List.1  


Loading dataset

In [None]:
# Location of the raw observations file.
# NOTE(review): this is an absolute path at the filesystem root — confirm it is valid
# in the environment where the notebook is executed.
csv_path = '/kba_data.csv'
data = pd.read_csv(csv_path)

Inspecting the data types

In [None]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
data.info()

Checking the first 5 rows

In [None]:
# Show the first five rows (the default for head()) as plain text.
first_rows = data.head(5)
print(first_rows)

Perform one-hot encoding for the categorical variable "Season"


In [None]:
# Replace "Season" with indicator columns; drop_first=True omits the first level
# so the indicators are not perfectly collinear.
season_columns = ["Season"]
df_encoded = pd.get_dummies(
    data,
    columns=season_columns,
    drop_first=True,
)


Separate independent and dependent variables

In [None]:
# "DEM" is assumed to be the target variable; every other column becomes a feature.
target_name = "DEM"
y = df_encoded[target_name]
X = df_encoded.drop(columns=[target_name])

Splitting train and test sets

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

Check for missing values

In [None]:
# Count the missing entries in each column (isna is the same check as isnull).
missing_values = data.isna().sum()
print("Missing Values:\n", missing_values)

Drop rows with missing values

In [None]:
# Drop incomplete rows and renumber the index from 0.
# Rebinding instead of mutating in place keeps the cell safe to re-run, and the
# reset index avoids label gaps that would misalign this frame when it is later
# concatenated (axis=1) with a frame that carries a fresh 0..n-1 index.
data = data.dropna().reset_index(drop=True)

Normalize numerical features if necessary, for example using StandardScaler.

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the observer count to zero mean and unit variance, overwriting the column.
numeric_columns = ['n.observers']
scaler = StandardScaler()
data[numeric_columns] = scaler.fit_transform(data[numeric_columns])


In [None]:
# List the column labels currently present in the frame.
column_index = data.columns
print(column_index)

Index(['Common.Name', 'Date', 'Time', 'n.observers', 'County', 'Sub.cell',
       'Season', 'DEM', 'Cell.ID', 'List.ID', 'Season_new'],
      dtype='object')


One Hot Encoder

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Columns to treat as categorical.
# NOTE(review): this list contains 'DEM', which an earlier cell treats as the
# target variable — confirm that one-hot encoding it here is intended.
columns_to_convert = ['Common.Name', 'Date', 'Time', 'County', 'Sub.cell', 'Season', 'DEM', 'Cell.ID', 'List.ID']
code_columns = [col + '_new' for col in columns_to_convert]

# Cast each column to the category dtype and store its integer codes in "<name>_new".
for col, code_col in zip(columns_to_convert, code_columns):
    data[col] = data[col].astype('category')
    data[code_col] = data[col].cat.codes

# One-hot encode the integer code columns.
encoder = OneHotEncoder()
encoded = encoder.fit_transform(data[code_columns])

# The encoder was fitted on the "<name>_new" columns, so it must generate the output
# names from its own fitted feature names; passing `columns_to_convert` here raised
# "ValueError: input_features is not equal to feature_names_in_".
# index=data.index keeps the rows aligned with `data`, whose index can have gaps
# after rows with missing values were dropped.
# NOTE(review): .toarray() densifies the matrix; with many distinct values per
# column this can be very large — confirm it fits in memory.
enc_data = pd.DataFrame(
    encoded.toarray(),
    columns=encoder.get_feature_names_out(),
    index=data.index,
)

# Attach the indicator columns, then remove both the original categorical columns
# and their integer-code helpers.
new_df = pd.concat([data, enc_data], axis=1).drop(columns=columns_to_convert + code_columns)

print(new_df)


ValueError: input_features is not equal to feature_names_in_

In [None]:
# Build the modelling frame from `data`: no frame named `df` exists in this notebook,
# and the categorical columns are spelled "Season" and "County" (capitalised).
# "DEM" is left untouched because it is the target variable, not a feature to encode.
# Integer-code helper columns ("<name>_new") are removed first so that a copy of the
# target (e.g. "DEM_new") cannot leak into the features.
helper_columns = [col for col in data.columns if str(col).endswith("_new")]
df_encoded = pd.get_dummies(
    data.drop(columns=helper_columns),
    columns=["Season", "County"],
    drop_first=True,
)

In [None]:
# Target column. The original code referenced "charges", which is not a column of
# this dataset; "DEM" is the target used earlier in the notebook.
TARGET_COLUMN = "DEM"

# Cast to float so the target is numeric even if it was converted to a category dtype.
y = df_encoded[TARGET_COLUMN].astype(float)

# Keep only numeric/boolean features: StandardScaler cannot process the remaining
# text or categorical columns (names, dates, cell ids, ...). Also drop an integer-coded
# copy of the target if one is present, to avoid leaking it into the features.
X = (
    df_encoded
    .drop(columns=[TARGET_COLUMN, TARGET_COLUMN + "_new"], errors="ignore")
    .select_dtypes(include=["number", "bool"])
)

# Discard rows that still contain missing values in the features or the target.
complete_rows = X.notna().all(axis=1) & y.notna()
X = X.loc[complete_rows]
y = y.loc[complete_rows]

# Split X and y into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the StandardScaler
scaler = StandardScaler()

# Fit the scaler on the training data only, then apply the same transform to the test data
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


Regression Models

In [None]:
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Instantiate each candidate regressor with its default hyper-parameters.
linear_regression_model = LinearRegression()
ridge_regression_model = Ridge()
decision_tree_regression_model = DecisionTreeRegressor()
random_forest_regression_model = RandomForestRegressor()
gradient_boosting_regression_model = GradientBoostingRegressor()

# Fit every regressor on the scaled training features, in the order listed.
for regressor in (
    linear_regression_model,
    ridge_regression_model,
    decision_tree_regression_model,
    random_forest_regression_model,
    gradient_boosting_regression_model,
):
    regressor.fit(X_train_scaled, y_train)

In [None]:
from sklearn.metrics import mean_squared_error

# Predictions on the training data itself. These measure how well each model fits
# the data it was trained on, not how it performs on held-out data.
(
    y_train_pred_linear,
    y_train_pred_ridge,
    y_train_pred_decision_tree,
    y_train_pred_random_forest,
    y_train_pred_gradient_boosting,
) = (
    fitted.predict(X_train_scaled)
    for fitted in (
        linear_regression_model,
        ridge_regression_model,
        decision_tree_regression_model,
        random_forest_regression_model,
        gradient_boosting_regression_model,
    )
)

# Mean squared error of each set of training predictions.
(
    mse_linear,
    mse_ridge,
    mse_decision_tree,
    mse_random_forest,
    mse_gradient_boosting,
) = (
    mean_squared_error(y_train, predicted)
    for predicted in (
        y_train_pred_linear,
        y_train_pred_ridge,
        y_train_pred_decision_tree,
        y_train_pred_random_forest,
        y_train_pred_gradient_boosting,
    )
)

# Report one line per model.
for label, value in (
    ("MSE - Linear Regression:", mse_linear),
    ("MSE - Ridge Regression:", mse_ridge),
    ("MSE - Decision Tree Regression:", mse_decision_tree),
    ("MSE - Random Forest Regression:", mse_random_forest),
    ("MSE - Gradient Boosting Regression:", mse_gradient_boosting),
):
    print(label, value)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Candidate regressors, all with default settings.
models = [
    LinearRegression(),
    Ridge(),
    DecisionTreeRegressor(),
    RandomForestRegressor(),
    GradientBoostingRegressor(),
]

# Scorer that reports the negated MSE (greater_is_better=False flips the sign).
scorer = make_scorer(mean_squared_error, greater_is_better=False)

# 5-fold cross-validation per model; the sign is flipped back so each stored value
# is a positive mean MSE, keyed by the estimator's string representation.
results = {
    str(estimator): -cross_val_score(
        estimator, X_train_scaled, y_train, cv=5, scoring=scorer
    ).mean()
    for estimator in models
}

# Print the cross-validation results
for model_name, mean_mse in results.items():
    print(f"{model_name}: Mean MSE = {mean_mse:.2f}")

# The best model is the one with the smallest mean MSE.
best_model = min(results.items(), key=lambda entry: entry[1])[0]
print("\nBest performing model:", best_model)


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid to search (3 x 3 x 3 = 27 combinations)
param_grid = {
    'n_estimators': [50, 100, 150],   # Number of boosting stages
    'learning_rate': [0.01, 0.1, 0.2], # Step size at each iteration
    'max_depth': [3, 4, 5]             # Maximum depth of individual regression estimators
}

# 5-fold grid search; the scoring is negated MSE, so a higher score means a lower error
grid_search = GridSearchCV(GradientBoostingRegressor(), param_grid, cv=5, scoring='neg_mean_squared_error')

# Perform the grid search
grid_search.fit(X_train_scaled, y_train)

# best_score_ holds the negated MSE; flipping the sign gives the (positive) MSE,
# so the label says "MSE" rather than "negative MSE".
print("Best parameters:", grid_search.best_params_)
print("Best MSE:", -grid_search.best_score_)
