In [87]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
import pandas as pd

# Handling categorical attributes
from sklearn.preprocessing import LabelEncoder

data = pd.read_csv('~/Documents/datasets/crime.csv',nrows=100000)


encoder = LabelEncoder()
#df = data.apply(encoder.fit_transform)
#data["Category"] = encoder.fit_transform(data["Category"].astype('str'))
attributes = [ 'Category',  'DayOfWeek', 'PdDistrict', 'Resolution', 'X', 'Y']
df = data[attributes]
day = pd.get_dummies(df['DayOfWeek'])
district = pd.get_dummies(df['PdDistrict'])
resolution = pd.get_dummies(df['Resolution'])
crime_cat = df["Category"]
crimes = df.drop(['Category','DayOfWeek','PdDistrict','Resolution'], axis=1)
dat = pd.concat([crimes,day,district,resolution],axis=1)

X_train, X_test, y_train ,y_test = train_test_split(dat, crime_cat, random_state= 0)


#df.head()
forest = RandomForestClassifier(n_jobs=2,max_depth=1000,n_estimators=100, random_state=0)
forest.fit(X_train, y_train)

print('Accuracy on the training subset: {:3f}'.format(forest.score(X_train, y_train)))
print('Accuracy on the testing subset: {:3f}'.format(forest.score(X_test, y_test)))


Accuracy on the training subset: 0.738293
Accuracy on the testing subset: 0.373440


In [86]:
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelBinarizer
import numpy as np

'''
Load some data and do some cleaning
'''
crimes = pd.read_csv("~/Documents/datasets/crime.csv",nrows = 100000)

#crimes['Mapping'] = np.abs(crimes["X"] + crimes["Y"]) 
crimes.drop(['IncidntNum','Descript','Location'],1,inplace=True)

# custom multicolumn transformer

class MultiColumnLabelEncoder:
    def __init__(self, columns = None):
        self.columns = columns # array of column names to encode
        
    def fit(self, X,y=None):
        return self # not relevant
    def transform(self, X):
        ''' 
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all columns in X
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
             for colname,col in enumerate(output):
                output[colname] = LabelEncoder().fit_transform(col)
        
        return output
    
    def fit_transform(self, X ,y=None):
        return self.fit(X,y).transform(X)
    
### you can also use the above class as shown in the line below ###
#MultiColumnLabelEncoder(columns = cat_attributes).fit_transform(olympics)
                    
def column_types(df):
    cat_cols = []
    num_cols = []
    
    for y in df.columns:
        if (df[y].dtype == object):
            cat_cols.append(y)
        else:
            num_cols.append(y)
    
    return cat_cols, num_cols

# Handling categorical attributes
class DataFrameSelector(BaseEstimator,TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return X[self.attribute_names].values

class MyLabelBinarizer():
    def __init__(self, *args, **kwargs):
        self.encoder = LabelBinarizer( *args, **kwargs)
    def fit(self, x, y = 0):
        self.encoder.fit(x)
        return self
    def transform(self,x, y=0):
        return self.encoder.transform(x)

cat_attributes ,num_attributes = column_types(crimes)

num_pipeline = Pipeline([
    ('selector',DataFrameSelector(num_attributes)),
    ('std_scaler', StandardScaler())
])

cat_pipeline = Pipeline([
    ('selector',DataFrameSelector(cat_attributes)),
    ('label_encoder', MultiColumnLabelEncoder())
]) 

full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

prepared = full_pipeline.fit_transform(crimes)

labels = crimes["Category"]

X_train, X_test, y_train ,y_test = train_test_split(prepared, labels, random_state= 0,test_size = 0.25)
forest = RandomForestClassifier(n_jobs=-1,max_depth=100,n_estimators=100, random_state=0)
forest.fit(X_train, y_train)
label_rf = forest.predict(X_test)
print('Accuracy on the training subset: {:3f}'.format(forest.score(X_train, y_train)))
print('Accuracy on the testing subset: {:3f}'.format(forest.score(X_test, y_test)))


Accuracy on the training subset: 0.960467
Accuracy on the testing subset: 0.573840


In [5]:
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA


'''
Load some data and do some cleaning
#NOTE: This dataset contains one million rows so it takes a considerable time 
        to load depending on your computing power and memory resources
'''
data = pd.read_csv('~/Documents/datasets/crime.csv',nrows=100000)

data = data.sample(frac=0.05, random_state=1)
#data['PdId'] = data['PdId'] / 10e3
to_drop = ['IncidntNum','Category','PdId']
X_all = data.drop(to_drop,1)
y_all = data["Category"]


#Standardising the data
num_features = ['X','Y']
cat_features = ['Descript', 'DayOfWeek', 'Date', 'Time', 'PdDistrict', 'Resolution',
       'Address', 'Location']

scaled_data = StandardScaler().fit_transform(X_all[num_features])
scaled = pd.DataFrame(scaled_data,columns = num_features)
dummies = pd.get_dummies(X_all[cat_features],prefix = [col for col, col_data in X_all[cat_features].iteritems()])



pca=PCA(n_components=600)

x_pca = pca.fit_transform(dummies)

#df = dummies.join(scaled)

X_train, X_test, y_train ,y_test = train_test_split(x_pca, y_all, random_state= 0,test_size = 0.25)
pca.explained_variance_ratio_.sum()

0.7196007339903274

In [3]:
%matplotlib inline
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import datetime as dt
from matplotlib import style
from sklearn.metrics import classification_report
from time import time

clf_A = LogisticRegression(solver='lbfgs',random_state = 42,multi_class='auto')
clf_B = SVC(random_state = 912,kernel = 'rbf',gamma = 'scale')
clf_C = RandomForestClassifier(n_estimators = 100,max_depth = 10,random_state=82)
clf_D = xgb.XGBClassifier(seed=82)

print('Logistic Regresssion...')
start  = time()
clf_A.fit(X_train, y_train)
end = time()
print('Done in {} seconds...'.format(end - start))
y_pred_A = clf_A.predict(X_train)
#print(classification_report(y_all,y_pred_A))
print('Accuracy on the training subset: {:3f}'.format(clf_A.score(X_train, y_train)))
print('Accuracy on the testing subset: {:3f}'.format(clf_A.score(X_test, y_test)))
print('')

print('Support Vector Machine...')
start  = time()
clf_B.fit(X_train, y_train)
end = time()
print('Done in {} seconds...'.format(end - start))
y_pred_B = clf_B.predict(X_train)
print('Accuracy on the training subset: {:3f}'.format(clf_B.score(X_train, y_train)))
print('Accuracy on the testing subset: {:3f}'.format(clf_B.score(X_test, y_test)))
print('')

print('Random Forest...')
start = time()
clf_C.fit(X_train, y_train)
end = time()
print('Done in {} seconds...'.format(end - start))
y_pred_C = clf_C.predict(X_train)
#print(classification_report(y_all,y_pred_C))
print('Accuracy on the training subset: {:3f}'.format(clf_C.score(X_train, y_train)))
print('Accuracy on the testing subset: {:3f}'.format(clf_C.score(X_test, y_test)))
print('')

print('XGBoost Classifier...')
start = time()
clf_D.fit(X_train, y_train)
end = time()
print('Done in {} seconds...'.format(end - start))
y_pred_D = clf_D.predict(X_train)
print('Accuracy on the training subset: {:3f}'.format(clf_D.score(X_train, y_train)))
print('Accuracy on the testing subset: {:3f}'.format(clf_D.score(X_test, y_test)))
print('')


Logistic Regresssion...
Done in 1.747060775756836 seconds...
Accuracy on the training subset: 0.980533
Accuracy on the testing subset: 0.922400

Support Vector Machine...
Done in 15.893916845321655 seconds...
Accuracy on the training subset: 0.774133
Accuracy on the testing subset: 0.744000

Random Forest...
Done in 7.785552501678467 seconds...
Accuracy on the training subset: 0.815733
Accuracy on the testing subset: 0.751200

XGBoost Classifier...
Done in 298.2690894603729 seconds...
Accuracy on the training subset: 0.998933
Accuracy on the testing subset: 0.867200



In [23]:
'''
def clean_dataset(df):
    assert isinstance(df, pd.DataFrame), "df needs to be a pd.DataFrame"
    df.dropna(inplace=True)
    indices_to_keep = ~df.isin([np.nan, np.inf, -np.inf]).any(1)
    return df[indices_to_keep].astype(np.float64)

'''