In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV
from sklearn.metrics import mean_squared_error, r2_score
from scipy import stats
import statsmodels.api as sm

master = pd.read_csv('master1.csv')

master = master[master['Primary Type'].isin(['HOMICIDE', 'BATTERY', 'ASSAULT', 'ROBBERY', 'CRIMINAL SEXUAL ASSAULT'])]

columns_drop = ['Case Number', 'Time', 'Block', 'IUCR', 'Primary Type', 'Description', 'Location Description', 'Arrest', 'Domestic','Beat','District','Ward','Community Area','FBI Code','X Coordinate',
                'Y Coordinate','Updated On','Latitude','Longitude','Location', 'Holiday Day of Week','precipprob','snowdepth','preciptype', 'windgust','winddir',
                'solarenergy','sunrise', 'sunset','moonphase', 'description', 'icon','stations']
data = master.drop(columns=columns_drop)

# Replace NaN values with 0 indicating no holiday and severe risk
data['Holiday'] = data['Holiday'].fillna(0)
data['severerisk'] = data['severerisk'].fillna(0)

# Replace non-NaN values with 1 indicating a holiday
data.loc[data['Holiday'] != 0, 'Holiday'] = 1

# Optionally, convert the 'Holiday' column to integer type
data['Holiday'] = data['Holiday'].astype(int)

# drop dates before 2010 as weather does not have that data
data['Date'] = pd.to_datetime(data['Date'], format='%m/%d/%y')
data= data[data['Date'].dt.year >= 2010]

daily_counts = data.groupby('Date').size().reset_index(name='Crime_Count')

# Merge daily_counts with the original DataFrame
merged_data = pd.merge(data, daily_counts, on='Date')

# Drop duplicate rows to keep only one entry per day
final_data = merged_data.drop_duplicates(subset='Date')

# Display the final DataFrame
final_data.head()

final_data['conditions'] = final_data['conditions'].str.split(',').apply(lambda x: [c.strip() for c in x])

# Get the set of all unique conditions
unique_conditions = set(condition for sublist in final_data['conditions'] for condition in sublist)

# Create dummy variables for each unique condition
for condition in unique_conditions:
    final_data.loc[:, condition] = final_data['conditions'].apply(lambda x: 1 if condition in x else 0)

# Drop the original 'Conditions' column
final_data.drop(columns=['conditions'], inplace=True)

final_data.to_csv('test.csv', index=False)

X = final_data.drop(columns=['Crime_Count', 'ID', 'Date'])

# Extract the target variable
y = final_data['Crime_Count']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  master = pd.read_csv('master1.csv')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data['conditions'] = final_data['conditions'].str.split(',').apply(lambda x: [c.strip() for c in x])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_data.loc[:, condition] = final_data['conditions'].apply(lambda x: 1 if condition in x else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexin

In [30]:
# Fit linear regression model
X_train = sm.add_constant(X_train)  # Add constant to X_train
model = sm.OLS(y_train, X_train)    # Create model
results = model.fit()               # Fit model
print(results.summary())

# Make predictions on the test set
X_test = sm.add_constant(X_test)  # Add constant to X_test
y_pred = results.predict(X_test)  # Predict using the fitted model

# Calculate MSE
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error (MSE):", mse)

# Calculate R^2 score
r2 = r2_score(y_test, y_pred)
print("R^2 Score:", r2)

                            OLS Regression Results                            
Dep. Variable:            Crime_Count   R-squared:                       0.240
Model:                            OLS   Adj. R-squared:                  0.235
Method:                 Least Squares   F-statistic:                     47.92
Date:                Tue, 26 Mar 2024   Prob (F-statistic):          1.64e-220
Time:                        16:17:59   Log-Likelihood:                -12091.
No. Observations:                4134   AIC:                         2.424e+04
Df Residuals:                    4106   BIC:                         2.441e+04
Df Model:                          27                                         
Covariance Type:            nonrobust                                         
                                     coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------------
const       

In [21]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'learning_rate': [0.015, 0.02, 0.025],
    'max_depth': [2, 3, 4],
    'n_estimators': [200, 250, 300],
    'subsample': [0.6, 0.7, 0.8],
}



# Initialize XGBoost regressor
xgb_model = xgb.XGBRegressor(random_state=42)

# Perform grid search with cross-validation
grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='neg_mean_squared_error', verbose=2)
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = -grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Mean Squared Error:", best_score)


Fitting 3 folds for each of 81 candidates, totalling 243 fits
[CV] END learning_rate=0.015, max_depth=2, n_estimators=200, subsample=0.6; total time=   0.2s
[CV] END learning_rate=0.015, max_depth=2, n_estimators=200, subsample=0.6; total time=   0.2s
[CV] END learning_rate=0.015, max_depth=2, n_estimators=200, subsample=0.6; total time=   0.2s
[CV] END learning_rate=0.015, max_depth=2, n_estimators=200, subsample=0.7; total time=   0.2s
[CV] END learning_rate=0.015, max_depth=2, n_estimators=200, subsample=0.7; total time=   0.2s
[CV] END learning_rate=0.015, max_depth=2, n_estimators=200, subsample=0.7; total time=   0.2s
[CV] END learning_rate=0.015, max_depth=2, n_estimators=200, subsample=0.8; total time=   0.3s
[CV] END learning_rate=0.015, max_depth=2, n_estimators=200, subsample=0.8; total time=   0.2s
[CV] END learning_rate=0.015, max_depth=2, n_estimators=200, subsample=0.8; total time=   0.1s
[CV] END learning_rate=0.015, max_depth=2, n_estimators=250, subsample=0.6; total t

In [27]:
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# Define the parameter grid for hyperparameter tuning
param_dist = {
    'learning_rate': uniform(0.01, 0.1),  # Continuous uniform distribution
    'max_depth': randint(3, 10),           # Discrete uniform distribution
    'n_estimators': randint(100, 500),     # Discrete uniform distribution
    'subsample': uniform(0.5, 0.5),        # Continuous uniform distribution
    'colsample_bytree': uniform(0.5, 0.5), # Continuous uniform distribution
    'gamma': uniform(0, 0.5),              # Continuous uniform distribution
}

# Initialize XGBoost regressor
xgb_model = xgb.XGBRegressor(random_state=42)

# Initialize k-fold cross-validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Perform k-fold cross-validation
mse_scores = []
for train_index, val_index in kf.split(X_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[val_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[val_index]
    
    # Perform random search cross-validation
    xgb_random = RandomizedSearchCV(estimator=xgb_model, param_distributions=param_dist, n_iter=100,
                                    scoring='neg_mean_squared_error', cv=3, verbose=2, random_state=42, n_jobs=-1)
    xgb_random.fit(X_train_fold, y_train_fold)
    
    # Get the best model from the random search
    xgb_best_model = xgb_random.best_estimator_
    
    # Make predictions on the validation set
    xgb_predictions = xgb_best_model.predict(X_val_fold)
    
    # Calculate MSE and store it
    mse = mean_squared_error(y_val_fold, xgb_predictions)
    mse_scores.append(mse)

# Calculate the mean MSE across all folds
mean_mse = np.mean(mse_scores)
print("Mean Cross-Validated Mean Squared Error:", mean_mse)



Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Mean Cross-Validated Mean Squared Error: 20.508155951070343


In [28]:
from sklearn.metrics import r2_score

# Make predictions on the test set using the best model
xgb_best_model = xgb_random.best_estimator_
xgb_predictions = xgb_best_model.predict(X_test)

# Calculate R-squared
r_squared = r2_score(y_test, xgb_predictions)
print("R-squared:", r_squared)

R-squared: 0.2658636299437305
