In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.metrics import make_scorer, r2_score

In [2]:
# Load the training and test tables from CSV files in the working directory.
data, testdata = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

## Create Dummy Variables for Category

In [3]:
# One indicator column per distinct value of the Category column.
category_OHE = pd.get_dummies(data.loc[:, "Category"])

In [4]:
# Run every indicator column through its own LabelEncoder and collect the
# resulting integer arrays in a dict keyed by the indicator column name.
labelOHE = {
    col: LabelEncoder().fit_transform(category_OHE[col])
    for col in category_OHE.columns
}

In [5]:
# Assemble the encoded indicator arrays into a frame (dict keys become columns).
OHE_data = pd.DataFrame(labelOHE)

## Format Dates and Extract Day, Month, Year, and Hour

In [6]:
# Remove Category (already captured as indicator columns) together with
# Descript and Resolution from the training frame.
data = data.drop(columns=["Category", "Descript", "Resolution"])

In [7]:
# Parse the Dates column once, then derive the calendar fields with the
# vectorized .dt accessor rather than a Python-level loop that touches every
# Timestamp object individually (same values, far less per-row overhead).
data["Dates"] = pd.to_datetime(data["Dates"])
data["Month"] = data["Dates"].dt.month
data["Year"] = data["Dates"].dt.year
data["Day"] = data["Dates"].dt.day
data["Hour"] = data["Dates"].dt.hour

In [8]:
# Same date handling as for the training frame: parse Dates once, then derive
# the calendar fields with the vectorized .dt accessor rather than a
# Python-level loop over every Timestamp object.
testdata["Dates"] = pd.to_datetime(testdata["Dates"])
testdata["Month"] = testdata["Dates"].dt.month
testdata["Year"] = testdata["Dates"].dt.year
testdata["Day"] = testdata["Dates"].dt.day
testdata["Hour"] = testdata["Dates"].dt.hour

In [9]:
# The raw timestamp is no longer needed once Month/Year/Day/Hour exist.
data = data.drop(columns="Dates")
testdata = testdata.drop(columns="Dates")

## Label Encode Categorical Variables

In [10]:
# Names of the object/category-typed columns in the training and test frames.
cat_cols, testcat_cols = (
    frame.select_dtypes(include=["object", "category"]).columns
    for frame in (data, testdata)
)

In [11]:
# Integer-encode each categorical training column with its own LabelEncoder;
# the encoded arrays are stored in a dict keyed by column name.
labeld = {
    col: LabelEncoder().fit_transform(data[col])
    for col in cat_cols
}

In [12]:
# Integer-encode the categorical test columns using the SAME value -> code
# mapping as the training data. Fitting a fresh LabelEncoder on the test column
# alone derives the codes from the test set's own sorted unique values, so the
# same string can receive different integers in train and test whenever the two
# sets do not contain exactly the same values.
testlabeld = {}
for i in testcat_cols:
    Labeler = LabelEncoder()
    if i in data.columns:
        # The training column is still present at this point, so fitting on it
        # reproduces the classes used when the training frame was encoded.
        Labeler.fit(data[i])
        # get_indexer returns each value's position in classes_, and -1 for
        # values that never occurred in training (transform() would raise).
        testlabeld[i] = pd.Index(Labeler.classes_).get_indexer(testdata[i])
    else:
        # No training counterpart to align with: keep the original behaviour.
        testlabeld[i] = Labeler.fit_transform(testdata[i])

In [13]:
# Turn the dicts of encoded arrays into frames (dict keys become columns).
catcoldataset = pd.DataFrame(labeld)
testcatcoldataset = pd.DataFrame(testlabeld)

In [14]:
# Drop the original categorical columns; their encoded versions live in
# catcoldataset / testcatcoldataset.
data = data.drop(columns=cat_cols)
testdata = testdata.drop(columns=testcat_cols)

In [15]:
# Place the encoded categorical columns next to the remaining columns.
Xdataset = pd.concat([data, catcoldataset], axis="columns")
testXdataset = pd.concat([testdata, testcatcoldataset], axis="columns")

In [16]:
# Training features and indicator targets; the test features leave out the Id
# column, which is read from testXdataset again when writing submissions.
X, y = Xdataset, OHE_data
X_test = testXdataset.drop(columns="Id")

## Standardize Data

In [17]:
# Scaler with default settings, shared by the training and test features.
scaler = StandardScaler()

In [18]:
# Estimate the scaling statistics from the training features only, then apply
# that same fitted transformation to the test features. Calling fit_transform
# on the test set would re-fit the scaler on test data, so train and test would
# be scaled with different means/variances and the models would receive
# inconsistently scaled inputs at prediction time.
scaledX = pd.DataFrame(scaler.fit_transform(X))
scaledtestX = pd.DataFrame(scaler.transform(X_test))

In [19]:
# The scaler returns plain arrays, so put the original feature names back.
scaledX.columns, scaledtestX.columns = X.columns, X_test.columns

## Random Forest

In [20]:
# Random forest with default hyperparameters and a fixed seed.
rf = RandomForestClassifier(random_state=42)

In [21]:
# y holds one 0/1 column per category, so a single forest is fitted against
# all indicator columns at once. The fitted estimator is the cell's output.
rf.fit(X=scaledX, y=y)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [22]:
# Predict the indicator columns for the scaled test features.
rf_preds = rf.predict(X=scaledtestX)

In [23]:
# Wrap the raw prediction array in a DataFrame.
randomforest_predictions = pd.DataFrame(data=rf_preds)

In [24]:
# Label the prediction columns with the target column names.
randomforest_predictions.columns = y.columns

In [25]:
# Put the test Ids in front of the random-forest predictions and write the
# result to CSV without the row index.
pd.concat(
    [testXdataset["Id"].rename("Id"), randomforest_predictions],
    axis="columns",
).to_csv("sf_crime_submission1.csv", index=False)

## Gradient Boosting Classifier

In [26]:
# Fit one binary gradient-boosting model per target indicator column and store
# its test-set predictions under that column's name.
# random_state is fixed (same seed as the random forest) so that re-running the
# notebook reproduces the same models and the same submission file.
gb_prediction = {}
for i in y.columns:
    gb = GradientBoostingClassifier(random_state=42)
    gb.fit(scaledX, y[i])
    gb_prediction[i] = gb.predict(scaledtestX)

In [27]:
# One column of gradient-boosting predictions per target indicator.
gb_predictions = pd.DataFrame(gb_prediction)

In [28]:
# Put the test Ids in front of the gradient-boosting predictions and write the
# result to CSV without the row index.
pd.concat(
    [testXdataset["Id"].rename("Id"), gb_predictions],
    axis="columns",
).to_csv("sf_crime_submission2.csv", index=False)

## Support Vector Machine

In [29]:
# Fit one linear SVM per target indicator column and store its test-set
# predictions under that column's name.
# random_state is fixed (same seed as the random forest) because LinearSVC's
# solver uses a pseudo-random number generator; without a seed, repeated runs
# of the notebook are not guaranteed to give the same model.
prediction = {}
for i in y.columns:
    svm = LinearSVC(random_state=42)
    svm.fit(scaledX, y[i])
    prediction[i] = svm.predict(scaledtestX)

In [30]:
# One column of SVM predictions per target indicator.
svm_predictions = pd.DataFrame(prediction)

In [31]:
# Put the test Ids in front of the SVM predictions and write the result to CSV
# without the row index.
pd.concat(
    [testXdataset["Id"].rename("Id"), svm_predictions],
    axis="columns",
).to_csv("sf_crime_submission3.csv", index=False)