# San Francisco Crime Classification

In [3]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [4]:
# Load the raw training and test tables from the working directory.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [5]:
# Converting the Dates to Year, Month, Day, Hour columns
def date_extract(df, date_col='Dates'):
    """Return a copy of ``df`` with calendar features derived from a timestamp column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a column of timestamps (strings or datetimes).
    date_col : str, default 'Dates'
        Name of the timestamp column to parse.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified. ``date_col`` is converted
        to datetime and the columns ``Year``, ``Month``, ``Day``, ``Hour`` and
        ``FractionalHour`` (hour + minutes / 60) are appended in that order.

    Raises
    ------
    KeyError
        If ``date_col`` is not a column of ``df``.
    """
    # Parse once and reuse the parsed series for every derived feature.
    timestamps = pd.to_datetime(df[date_col])
    clock = timestamps.dt

    # ``assign`` works on a copy, so the caller's frame is left untouched.
    return df.assign(**{
        date_col: timestamps,
        'Year': clock.year,
        'Month': clock.month,
        'Day': clock.day,
        'Hour': clock.hour,
        # Hour of day as a continuous value, e.g. 23:30 -> 23.5
        'FractionalHour': clock.hour + clock.minute / 60,
    })

In [6]:
# Expand the date-time column into separate calendar columns for both the
# training and the test frame, using the same date_extract helper for each.
df_train, df_test = (date_extract(frame) for frame in (df_train, df_test))

In [7]:
# Dropping the 'Dates' column after we've extracted what we want.  Also dropping the address column
# Both columns are removed in a single non-mutating call per frame: rebinding
# the name (instead of inplace=True) avoids modifying an object in place that
# an earlier cell may already have displayed.
unused_columns = ['Dates', 'Address']
df_train = df_train.drop(columns=unused_columns)
df_test = df_test.drop(columns=unused_columns)

In [8]:
# Preview the first rows of the training frame after the date columns were
# derived and 'Dates' / 'Address' were dropped.
df_train.head()

Unnamed: 0,Category,Descript,DayOfWeek,PdDistrict,Resolution,X,Y,Year,Month,Day,Hour,FractionalHour
0,WARRANTS,WARRANT ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",-122.425892,37.774599,2015,5,13,23,23.883333
1,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",-122.425892,37.774599,2015,5,13,23,23.883333
2,OTHER OFFENSES,TRAFFIC VIOLATION ARREST,Wednesday,NORTHERN,"ARREST, BOOKED",-122.424363,37.800414,2015,5,13,23,23.55
3,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,NORTHERN,NONE,-122.426995,37.800873,2015,5,13,23,23.5
4,LARCENY/THEFT,GRAND THEFT FROM LOCKED AUTO,Wednesday,PARK,NONE,-122.438738,37.771541,2015,5,13,23,23.5


In [9]:
from sklearn.preprocessing import LabelEncoder

# Nominal features that are expanded into one indicator column per level.
# The same column list is applied to the training and the test frame.
categorical_cols = ['DayOfWeek', 'PdDistrict']
df_train = pd.get_dummies(df_train, columns=categorical_cols)
df_test = pd.get_dummies(df_test, columns=categorical_cols)

# Replace the textual target (Category) with integer class labels; the fitted
# encoder is kept so predictions can be mapped back to category names later.
label_encoder = LabelEncoder()
df_train['Category'] = label_encoder.fit_transform(df_train['Category'])

In [10]:
# The target plus the two columns excluded from the feature matrix.
non_feature_cols = ['Category', 'Descript', 'Resolution']
y_train = df_train['Category']
X_train = df_train.drop(columns=non_feature_cols)

In [11]:
# Keep the submission ids aside; 'Id' is not a model feature.
df_test_id = df_test['Id']

# Align the test features with the training feature columns (same set, same
# order). The two frames were one-hot encoded separately, so a level missing
# from one of them would otherwise yield mismatched columns at prediction
# time. Indicator columns absent from the test data are filled with 0.
X_test = (
    df_test
    .drop(columns='Id')
    .reindex(columns=X_train.columns, fill_value=0)
)

## Model Training

In [12]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible. The names X_train / y_train are rebound to the training part.
split_parts = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)
X_train, X_val, y_train, y_val = split_parts

In [13]:
# Inspect the training target after the split (integer-encoded categories).
y_train

81381     20
238545    20
823641    36
497355     1
484193     7
          ..
259178    16
365838    27
131932    16
671155    21
121958    16
Name: Category, Length: 702439, dtype: int64

In [14]:
# Wrap the training and validation splits in XGBoost's DMatrix container.
# Note that `dtest` holds the *validation* split here.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_val, label=y_val)

# Booster settings: multi-class objective with one class per encoded
# category, shallow trees and a learning rate of 0.3.
params = dict(
    objective='multi:softmax',
    num_class=len(label_encoder.classes_),
    max_depth=3,
    eta=0.3,
)

# Fit the booster for a fixed number of boosting rounds.
num_rounds = 100
xgb_model = xgb.train(params, dtrain, num_boost_round=num_rounds)

In [15]:
# Make predictions on the validation set
# (`dtest` is the DMatrix built from X_val / y_val in the training cell).
y_pred = xgb_model.predict(dtest)

In [16]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc, cohen_kappa_score, log_loss
import matplotlib.pyplot as plt

# Integer codes of every class known to the encoder. Passing them explicitly
# keeps the metrics aligned with the class names even when a rare class does
# not occur in the validation split.
class_ids = np.arange(len(label_encoder.classes_))

# Evaluate the Accuracy of the model
accuracy = accuracy_score(y_val, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Calculate the confusion matrix
cm = confusion_matrix(y_val, y_pred)
print(f'Confusion Matrix:\n{cm}')

# Generate a classification report
# zero_division=0 reports 0 (without a warning per class) for classes that
# are never predicted.
cr = classification_report(
    y_val,
    y_pred,
    labels=class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,
)
print(f'Classification Report:\n{cr}')

# Calculate Cohen's Kappa score
kappa = cohen_kappa_score(y_val, y_pred)
print(f'Cohen\'s Kappa Score: {kappa:.2f}')

# Calculate the log loss
# predict(output_margin=True) yields raw, untransformed per-class scores, not
# probabilities; log_loss needs probabilities, so apply a softmax per row.
margins = np.asarray(
    xgb_model.predict(dtest, output_margin=True), dtype=np.float64
).reshape(len(y_val), -1)
margins -= margins.max(axis=1, keepdims=True)  # subtract row max for numerical stability
y_pred_prob = np.exp(margins)
y_pred_prob /= y_pred_prob.sum(axis=1, keepdims=True)
logloss = log_loss(y_val, y_pred_prob, labels=class_ids)
print(f'Log Loss: {logloss:.2f}')

Accuracy: 0.28
Confusion Matrix:
[[   2   28    0 ...   19    0    0]
 [   1 1076    0 ...  643    0    0]
 [   0    0    0 ...    4    0    0]
 ...
 [   1  218    0 ... 2261    0    0]
 [   0  249    0 ...  152    1    0]
 [   0  137    0 ...   55    0    0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Classification Report:
                             precision    recall  f1-score   support

                      ARSON       0.22      0.01      0.01       321
                    ASSAULT       0.21      0.07      0.10     15364
                 BAD CHECKS       0.00      0.00      0.00        72
                    BRIBERY       0.00      0.00      0.00        58
                   BURGLARY       0.21      0.01      0.01      7389
         DISORDERLY CONDUCT       0.29      0.05      0.09       828
DRIVING UNDER THE INFLUENCE       0.00      0.00      0.00       459
              DRUG/NARCOTIC       0.32      0.38      0.35     10723
                DRUNKENNESS       0.00      0.00      0.00       857
               EMBEZZLEMENT       0.33      0.00      0.01       222
                  EXTORTION       0.00      0.00      0.00        51
            FAMILY OFFENSES       0.00      0.00      0.00        92
     FORGERY/COUNTERFEITING       0.20      0.02      0.04      2092
          



In [17]:
# Build the submission matrix under its own name. Reusing `dtest` would
# overwrite the validation DMatrix, so re-running the evaluation cells
# afterwards would score the unlabeled test data against y_val.
dsubmission = xgb.DMatrix(X_test)
# Make predictions on the test data
test_predictions = xgb_model.predict(dsubmission)

In [18]:
# Cast the predicted class codes to integers before they are handed to
# label_encoder.inverse_transform.
# NOTE(review): presumably predict() returns the codes as floats — confirm.
test_predictions = test_predictions.astype(int)

In [19]:
# Map the integer class codes back to the original category names using the
# encoder that was fitted on the training target.
predicted_labels_original = label_encoder.inverse_transform(test_predictions)

# Print the decoded labels as a quick sanity check (numpy abbreviates long arrays).
print(predicted_labels_original)

['VEHICLE THEFT' 'OTHER OFFENSES' 'LARCENY/THEFT' ... 'OTHER OFFENSES'
 'OTHER OFFENSES' 'LARCENY/THEFT']


In [20]:
# Category names known to the fitted encoder; these become the submission's
# per-category columns.
categories = label_encoder.classes_

In [31]:
# Number of distinct categories (the same value is used as num_class for the model).
len(categories)

39

In [34]:
# Submission header: the 'Id' column first, then one column per category.
id_header = np.array(['Id'], dtype=categories.dtype)
columns = np.concatenate((id_header, categories))

In [35]:
# Display the submission header: 'Id' followed by the category names.
columns

array(['Id', 'ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY',
       'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE',
       'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION',
       'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING',
       'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING',
       'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES',
       'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE',
       'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE',
       'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE',
       'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT',
       'WARRANTS', 'WEAPON LAWS'], dtype=object)

In [36]:
# One-hot encode the predicted category of every test row in a single
# vectorized comparison (rows: predictions, columns: categories) instead of
# building each row in a Python loop.
predicted = np.asarray(predicted_labels_original, dtype=object)
known_categories = np.asarray(categories, dtype=object)
indicator = (predicted[:, None] == known_categories[None, :]).astype(np.int64)

# Assemble the submission frame: the 'Id' column first, then one 0/1 column
# per category, in the order given by `columns`.
results_df = pd.DataFrame(indicator, columns=columns[1:])
# np.asarray drops the Series index so the ids are placed positionally.
results_df.insert(0, columns[0], np.asarray(df_test_id))

In [37]:
# Display the submission frame (pandas abbreviates the rendering of large frames).
results_df

Unnamed: 0,Id,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,...,SEX OFFENSES NON FORCIBLE,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
884257,884257,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
884258,884258,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
884259,884259,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
884260,884260,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
# Write the submission file; index=False keeps the row index out of the CSV.
# NOTE(review): every row is a hard 0/1 indicator. If the submission is scored
# with log loss, per-class probabilities would presumably score better — confirm.
results_df.to_csv('mysubmission.csv', index=False)