# Classifications of South African Crimes based on Severity
##### The severity scale ranks crimes from minor to serious, based on the given data

## Import Libraries

In [3]:
# Default Libraries
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline

# Classification Libraries
import nltk
from nltk.util import ngrams
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from nltk.classify import MaxentClassifier
from nltk.stem.snowball import SnowballStemmer
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.feature_extraction.text import TfidfTransformer

### Load Data

In [4]:
# Read the CSV files and add a totals column.
# Province populations are sorted by Population, largest first.
population_stats = pd.read_csv('../Data Storage/Crime Stats-za/ProvincePopulation.csv').sort_values('Population',ascending=False)
crime_stats = pd.read_csv('../Data Storage/Crime Stats-za/SouthAfricaCrimeStats_v2.csv')
# Sum only the numeric (yearly count) columns of each row. Without
# numeric_only=True pandas warns about silently dropping the text columns
# (Province, Station, Category) and newer releases raise instead.
crime_stats['Total 2005-2016'] = crime_stats.sum(axis=1, numeric_only=True)
crime_stats.head()

  crime_stats['Total 2005-2016'] = crime_stats.sum(axis=1)


Unnamed: 0,Province,Station,Category,2005-2006,2006-2007,2007-2008,2008-2009,2009-2010,2010-2011,2011-2012,2012-2013,2013-2014,2014-2015,2015-2016,Total 2005-2016
0,Western Cape,Cape Town Central,All theft not mentioned elsewhere,6692,6341,5966,5187,4985,5127,5285,5937,5600,5335,5176,61631
1,Gauteng,Jhb Central,All theft not mentioned elsewhere,6093,4602,3761,3610,3267,3037,2886,2638,2809,3050,2434,38187
2,Western Cape,Mitchells Plain,All theft not mentioned elsewhere,5341,6093,6316,6803,6035,5761,6108,5514,4975,4043,3635,60624
3,Free State,Park Road,All theft not mentioned elsewhere,5108,4282,3834,3316,3101,3013,2679,3116,2927,2297,2103,35776
4,Gauteng,Pretoria Central,All theft not mentioned elsewhere,5099,4536,3309,2694,2616,2606,2635,3226,3246,2892,3030,35889


### Drop unwanted Category

In [9]:
# Keep every row except those in the catch-all theft category.
crime_stats = crime_stats.loc[crime_stats['Category'].ne('All theft not mentioned elsewhere')]

In [10]:
# Preview the first rows to confirm the unwanted category is gone.
crime_stats.head()

Unnamed: 0,Province,Station,Category,2005-2006,2006-2007,2007-2008,2008-2009,2009-2010,2010-2011,2011-2012,2012-2013,2013-2014,2014-2015,2015-2016,Total 2005-2016
11,Western Cape,Cape Town Central,Theft out of or from motor vehicle,3468,2924,2329,1856,2905,3051,3474,3294,3612,3441,3509,33863
12,Western Cape,Mitchells Plain,Drug-related crime,3064,3683,4792,5699,6571,6260,5850,6310,6044,4768,4609,57650
17,Kwazulu/Natal,Durban Central,Robbery with aggravating circumstances,2721,3214,2134,1966,1371,899,924,885,951,982,1024,17071
18,Western Cape,Mitchells Plain,Common assault,2657,2339,2131,2735,2749,2444,2810,2757,2185,1847,2079,26733
20,Gauteng,Jhb Central,Commercial crime,2585,2740,2844,2870,2867,2386,2475,1680,1296,1188,1137,24068


### Split data into Train and Test Sets

In [None]:
# Draw a reproducible 80% sample for training (fixed random_state); the rows
# whose index labels were not sampled become the test set.
training_data = crime_stats.sample(frac=0.8, random_state=25)
testing_data = crime_stats.drop(index=training_data.index)

print(f"No. of training examples: {len(training_data)}")
print(f"No. of testing examples: {len(testing_data)}")

In [12]:
# Summarise column dtypes and non-null counts of the training set.
training_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 29718 entries, 11 to 30860
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Province         29718 non-null  object
 1   Station          29718 non-null  object
 2   Category         29718 non-null  object
 3   2005-2006        29718 non-null  int64 
 4   2006-2007        29718 non-null  int64 
 5   2007-2008        29718 non-null  int64 
 6   2008-2009        29718 non-null  int64 
 7   2009-2010        29718 non-null  int64 
 8   2010-2011        29718 non-null  int64 
 9   2011-2012        29718 non-null  int64 
 10  2012-2013        29718 non-null  int64 
 11  2013-2014        29718 non-null  int64 
 12  2014-2015        29718 non-null  int64 
 13  2015-2016        29718 non-null  int64 
 14  Total 2005-2016  29718 non-null  int64 
dtypes: int64(12), object(3)
memory usage: 3.6+ MB


### Add Classification Column

In [13]:
# List the distinct crime categories that need a classification label.
training_data.Category.unique()

array(['Theft out of or from motor vehicle', 'Drug-related crime',
       'Robbery with aggravating circumstances', 'Common assault',
       'Commercial crime', 'Burglary at residential premises',
       'Assault with the intent to inflict grievous bodily harm',
       'Theft of motor vehicle and motorcycle', 'Shoplifting',
       'Malicious damage to property', 'Common robbery',
       'Burglary at non-residential premises', 'Sexual Offences',
       'Driving under the influence of alcohol or drugs', 'Stock-theft',
       'Attempted murder', 'Carjacking',
       'Robbery at non-residential premises',
       'Robbery at residential premises', 'Murder',
       'Illegal possession of firearms and ammunition', 'Arson',
       'Truck hijacking', 'Robbery of cash in transit', 'Bank robbery',
       'Sexual offences as result of police action'], dtype=object)

In [None]:
# Severity tier for every crime category in the data set.
# NOTE(review): the tier assignment below is a provisional judgement call,
# not something derived from the data -- confirm it with a domain expert.
_SEVERITY_TIERS = {
    'SERIOUS': (
        'Murder',
        'Attempted murder',
        'Sexual Offences',
        'Sexual offences as result of police action',
        'Assault with the intent to inflict grievous bodily harm',
        'Robbery with aggravating circumstances',
        'Carjacking',
        'Truck hijacking',
        'Robbery at residential premises',
        'Robbery at non-residential premises',
        'Robbery of cash in transit',
        'Bank robbery',
        'Arson',
    ),
    'MODERATE': (
        'Common assault',
        'Common robbery',
        'Burglary at residential premises',
        'Burglary at non-residential premises',
        'Theft of motor vehicle and motorcycle',
        'Illegal possession of firearms and ammunition',
        'Drug-related crime',
        'Driving under the influence of alcohol or drugs',
        'Malicious damage to property',
    ),
    'MINOR': (
        'Theft out of or from motor vehicle',
        'Stock-theft',
        'Shoplifting',
        'Commercial crime',
        'All theft not mentioned elsewhere',
    ),
}

# Flattened lookup: normalised category text -> tier label.
_SEVERITY_BY_CATEGORY = {
    category.strip().casefold(): tier
    for tier, categories in _SEVERITY_TIERS.items()
    for category in categories
}


def my_func(row):
    """
    Return the severity tier ('MINOR', 'MODERATE' or 'SERIOUS') for one row.

    The previous version indexed the row with category *values* such as
    'Theft out of or from motor vehicle' as if they were column names, which
    raises KeyError on every row. The tier is now looked up from the row's
    'Category' value.

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide a 'Category' entry holding the crime category text.

    Returns
    -------
    str
        The severity tier of the row's category.

    Raises
    ------
    ValueError
        If the category has no tier assigned, so unknown categories are never
        labelled silently.
    """
    # Normalise case and surrounding whitespace so small spelling differences
    # in the source data still hit the lookup table.
    category = str(row['Category']).strip().casefold()
    try:
        return _SEVERITY_BY_CATEGORY[category]
    except KeyError:
        raise ValueError(
            f"No severity tier defined for category {row['Category']!r}"
        ) from None

# Label each row by running my_func across the columns of that row, then
# preview the frame with the new 'status' column.
training_data['status'] = training_data.apply(my_func, axis='columns')
training_data.head()


### Stemming and stop words

In [14]:
# Standard English snowball stemmer shared by the tokenizer.
STEMMER = SnowballStemmer('english')
# Words that carry no signal for the classification (and can occasionally
# confuse the classifier). Listed here in their raw form.
STOPWORDS = [
    'at', 'as', 'of', 'the', 'in', 'or', 'with', 'from', 'and',
    '&', '-s', 'after', 'for', 'police', 'result', 'avoid',
    'hold', 'holding', 'retrieved', 'battery', 'fist', 'to', 'a',
]
# Final list: the raw words plus their stemmed forms, de-duplicated.
STOPWORDS = list(set(STOPWORDS).union(STEMMER.stem(word) for word in STOPWORDS))

### Tokenize

In [15]:
def tokenize(Category):
    """
    Turn a crime category description into the feature dict used for training.

    The text is lower-cased, stripped of 'do-', '.' and ',', split on
    whitespace, stemmed with STEMMER and filtered against STOPWORDS. Every
    remaining stem, and every pair of adjacent stems joined as "a|b", becomes
    a key mapped to True.

    Parameters
    ----------
    Category : str
        The raw category / description text.

    Returns
    -------
    dict
        ``{feature: True}`` for each unigram and bigram feature.
    """
    # Lower-case, trim, then remove the 'do-' marker and stray punctuation.
    text = Category.lower().strip()
    for unwanted in ('do-', '.', ','):
        text = text.replace(unwanted, '')
    # split() without an argument breaks on any run of whitespace (spaces,
    # tabs, newlines) and never yields empty strings, so odd spacing cannot
    # leak into the tokens.
    stems = [STEMMER.stem(word) for word in text.split()]
    stems = [stem for stem in stems if stem not in STOPWORDS]
    # Adjacent-stem bigrams, added to see whether they improve accuracy.
    bigrams = ["%s|%s" % pair for pair in ngrams(stems, 2)]
    # Each feature name maps to True in the returned dict.
    return {feature: True for feature in stems + bigrams}

### Grab the features

In [None]:
# The training rows already live in the `training_data` DataFrame, so they are
# read straight from it. (csv.DictReader over a DataFrame would iterate its
# column labels, not its rows, and a DataFrame has no close() method.)
data = training_data.to_dict('records')

# The label column is created as 'status' by the classification step;
# 'classification' is honoured first if the frame already carries it.
label_column = next(
    (name for name in ('classification', 'status') if name in training_data.columns),
    None,
)
if label_column is None:
    raise KeyError(
        "training_data has no 'classification' or 'status' column; "
        "run the 'Add Classification Column' step first"
    )

# One (feature dict, label) pair per row, built from the Category text.
features = [
    (tokenize(record['Category']), record[label_column])
    for record in data
]