In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os
import csv

In [2]:
# Load the crime records from disk; every later cell works from this frame.
notes = pd.read_csv('crime_data_csv.csv')

# Preview the first five rows to confirm the columns were parsed as expected.
notes.head(n=5)

Unnamed: 0,case_no,date_of_occurrence,primary_description,arrests,domestic,ward
0,JE123460,1/26/2021,0,0,1,5
1,JE123503,1/26/2021,1,0,1,20
2,JE123996,1/26/2021,2,0,0,48
3,JE123507,1/26/2021,1,0,1,21
4,JE123544,1/26/2021,3,0,0,27


In [3]:
# Exploratory lookup: print the docstring of DataFrame.value_counts.
# NOTE(review): leftover interactive help call — consider removing it from the
# finished notebook, since its long text output interrupts the analysis.
help(pd.DataFrame.value_counts)

Help on function value_counts in module pandas.core.frame:

value_counts(self, subset: 'Optional[Sequence[Label]]' = None, normalize: 'bool' = False, sort: 'bool' = True, ascending: 'bool' = False)
    Return a Series containing counts of unique rows in the DataFrame.
    
    .. versionadded:: 1.1.0
    
    Parameters
    ----------
    subset : list-like, optional
        Columns to use when counting unique combinations.
    normalize : bool, default False
        Return proportions rather than frequencies.
    sort : bool, default True
        Sort by frequencies.
    ascending : bool, default False
        Sort in ascending order.
    
    Returns
    -------
    Series
    
    See Also
    --------
    Series.value_counts: Equivalent method on Series.
    
    Notes
    -----
    The returned Series will have a MultiIndex with one level per input
    column. By default, rows that contain any NA values are omitted from
    the result. By default, the resulting Series will be in d

In [4]:
# Split the frame into model inputs and target.
# X: the three integer-coded feature columns. Selecting with a list of column
#    names already yields the two-dimensional input scikit-learn expects, so
#    no reshape() call is needed.
# y: the `arrests` column, kept one-dimensional as the label to predict.
X = notes.loc[:, ['primary_description', 'domestic', 'ward']]
y = notes.loc[:, 'arrests']

# Sanity check: both must have the same number of rows.
print("Shape: ", X.shape, y.shape)

Shape:  (80006, 3) (80006,)


In [5]:
# Plain-text dump of the feature frame; as the output shows, pandas abbreviates
# it to the first and last five of the 80006 rows.
# NOTE(review): ending the cell with a bare `X.head()` would give the notebook's
# rich table rendering instead of printed text.
print(X)

       primary_description  domestic  ward
0                        0         1     5
1                        1         1    20
2                        2         0    48
3                        1         1    21
4                        3         0    27
...                    ...       ...   ...
80001                    1         1    42
80002                    5         0     3
80003                    1         1     5
80004                    1         1     7
80005                    6         0    14

[80006 rows x 3 columns]


In [6]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import
# cell at the top so all dependencies are visible in one place.
from sklearn.model_selection import train_test_split

# Hold out part of the rows for evaluation. No test_size is passed, so
# scikit-learn's default split proportion applies; random_state=1 makes the
# shuffle repeatable across runs.
# NOTE(review): no `stratify=` is given, so the arrest/non-arrest ratio is not
# forced to match between the two splits — TODO confirm that is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [7]:
# Create an untrained logistic-regression classifier with all-default settings.
# NOTE(review): this import sits mid-notebook; consider moving it to the import
# cell at the top.
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
# Bare name as the last expression so the notebook echoes the estimator.
classifier

LogisticRegression()

In [8]:
# Train on the training split only; the test split is kept aside so it can be
# used for scoring afterwards.
classifier.fit(X_train, y_train)

LogisticRegression()

In [9]:
# Score the fitted classifier on the rows it was trained on and on the held-out
# rows; similar values suggest the model is not overfitting.
# NOTE(review): accuracy alone can hide a model that mostly predicts the
# majority class — TODO compare with the share of non-arrest rows in y.
train_score = classifier.score(X_train, y_train)
test_score = classifier.score(X_test, y_test)

print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.876424905006333
Testing Data Score: 0.8757124287571243


Make predictions

In [10]:
# Spot-check: put the true labels of the first ten held-out rows next to the
# classifier's predictions for those same rows.
first_ten_actual = list(y_test.iloc[:10])
first_ten_predicted = list(classifier.predict(X_test.iloc[:10]))

print(f'Actual:\t\t{first_ten_actual}')
print(f'Predicted:\t{first_ten_predicted}')

Actual:		[0, 1, 0, 0, 0, 0, 0, 0, 0, 0]
Predicted:	[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [11]:
# Discard the case identifier and the occurrence date, leaving only the
# integer-coded columns. `columns=` on its own names what to drop, so no
# `axis` argument is passed.
crime_cluster = notes.drop(columns=['case_no', 'date_of_occurrence'])

# Preview the reduced frame.
crime_cluster.head(n=5)

Unnamed: 0,primary_description,arrests,domestic,ward
0,0,0,1,5
1,1,0,1,20
2,2,0,0,48
3,1,0,1,21
4,3,0,0,27


In [12]:
# Number of non-missing values in each column; equal totals across the columns
# mean no column has gaps.
count_of_columns = crime_cluster.count(axis='index')
count_of_columns

primary_description    80006
arrests                80006
domestic               80006
ward                   80006
dtype: int64

In [13]:
# Number of non-missing entries in the `ward` column alone.
# NOTE(review): this rebinds `count_of_columns`, which the preceding cell used
# for the per-column Series, to a single number — a distinct name would make
# re-running cells out of order safer.
count_of_columns = crime_cluster.loc[:, 'ward'].count()
count_of_columns

80006

Analysis of Data after Running Logistic Regression

In [14]:
# Count, for every ward, the rows where an arrest was made, and present the
# result as a two-column table (Ward_id, Arrests_counts) ordered from the
# highest count down.
new_csv = pd.read_csv('crime_data_csv.csv')

# A single .loc lookup selects the arrest rows and the ward column together,
# avoiding chained indexing.
arrests_per_ward = new_csv.loc[new_csv['arrests'] == 1, 'ward'].value_counts()

# Keep exporting the raw counts, in case something else reads this file.
arrests_per_ward.to_csv('Ward_count_arrests.csv')

# Name both columns explicitly instead of reading the CSV back and renaming
# whatever headers it happens to contain: how value_counts() labels its index
# and values differs between pandas versions (from 2.0 the values are named
# 'count' and the index takes the source column's name), so a rename keyed on
# 'Unnamed: 0' / 'ward' would silently mislabel the ward ids.
ward_count = (
    arrests_per_ward
    .rename_axis('Ward_id')
    .reset_index(name='Arrests_counts')
)
ward_count

Unnamed: 0,Ward_id,Arrests_counts
0,28,927
1,24,598
2,27,485
3,42,467
4,6,396
5,21,393
6,17,385
7,37,375
8,16,352
9,34,332


In [15]:
# Count, for every ward, the rows where no arrest was made, and present the
# result as a two-column table (Ward_id, Non_arrest_counts) ordered from the
# highest count down.
# NOTE(review): `ward_avg` holds the raw records, not averages; the name is
# kept only because other cells may refer to it.
ward_avg = pd.read_csv('crime_data_csv.csv')

# A single .loc lookup selects the non-arrest rows and the ward column
# together, avoiding chained indexing.
non_arrests_per_ward = ward_avg.loc[ward_avg['arrests'] == 0, 'ward'].value_counts()

# Keep exporting the raw counts, in case something else reads this file.
non_arrests_per_ward.to_csv('Ward_count_nonarrests.csv')

# Name both columns explicitly instead of reading the CSV back and renaming
# whatever headers it happens to contain: how value_counts() labels its index
# and values differs between pandas versions (from 2.0 the values are named
# 'count' and the index takes the source column's name), so a rename keyed on
# 'Unnamed: 0' / 'ward' would silently mislabel the ward ids.
ward_count_non = (
    non_arrests_per_ward
    .rename_axis('Ward_id')
    .reset_index(name='Non_arrest_counts')
)
ward_count_non



Unnamed: 0,Ward_id,Non_arrest_counts
0,27,2943
1,28,2942
2,6,2754
3,42,2690
4,8,2478
5,24,2426
6,7,2350
7,20,2309
8,17,2254
9,21,2211


In [16]:
# Put the per-ward arrest and non-arrest tallies side by side, matching rows on
# Ward_id. The inner join keeps only wards present in both tables.
# NOTE(review): the merged table is displayed but not stored — assign it to a
# name if later cells need it.
pd.merge(ward_count, ward_count_non, how='inner', on='Ward_id')

Unnamed: 0,Ward_id,Arrests_counts,Non_arrest_counts
0,28,927,2942
1,24,598,2426
2,27,485,2943
3,42,467,2690
4,6,396,2754
5,21,393,2211
6,17,385,2254
7,37,375,1880
8,16,352,2055
9,34,332,1957
