# Logistic Regression Model for 2016 data

### Logisitic Regression predicts binary outcomes. This model will analyze the available data, and when presented a new sample, mathematically determine its probability of belonging to a class. If the probability is above a certain cutoff point, the sample is assigned to that class. If the probability is less than the cutoff point, the sample is assigned to the other class.

#### For our project, we will show how machine learning can help predict the safety of cities throughout the state of North Carolina. 

In [1]:
# Import dependencies
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

In [2]:
# Read the database
df = pd.read_csv(Path('NC_Crime_data_2016.csv'))
df.head()

Unnamed: 0,City,Population,Murder_ and_nonnegligent_manslaughter,Rape\n,Robbery,Aggravated_assault,Burglary,Larceny_theft,Violent_crime_total,Motor_vehicle_theft,Property_crime_total,Arson,total_crime,Crime_index,is_safe
0,Aberdeen,7549,0,3,10,12,52,226,25,11,289,1,314,4.15,1
1,Ahoskie,4883,2,3,8,30,81,201,43,9,291,0,334,6.84,0
2,Albemarle,16024,1,6,24,76,201,538,107,32,771,4,878,5.48,1
3,Angier,5097,0,2,2,10,62,59,14,8,129,3,143,2.81,1
4,Apex,47324,1,2,13,24,83,501,40,17,601,1,641,1.35,1


In [3]:
# Identifying the data types of each column
dtypes_2016 = df.dtypes
print(dtypes_2016)

City                                      object
Population                                 int64
Murder_ and_nonnegligent_manslaughter      int64
Rape\n                                     int64
Robbery                                    int64
Aggravated_assault                         int64
Burglary                                   int64
Larceny_theft                              int64
Violent_crime_total                        int64
Motor_vehicle_theft                        int64
Property_crime_total                       int64
Arson                                      int64
total_crime                                int64
Crime_index                              float64
is_safe                                    int64
dtype: object


In [4]:
# Cleaing the DataFrame
# Dropping Violent_crime_total & Property_Crime_total columns
# These columns are subtotals, we don't want to double count the data so we are dropping them
cleaned_df = df.drop(['Violent_crime_total','Property_crime_total', 'City'], axis=1)
cleaned_df.head()

Unnamed: 0,Population,Murder_ and_nonnegligent_manslaughter,Rape\n,Robbery,Aggravated_assault,Burglary,Larceny_theft,Motor_vehicle_theft,Arson,total_crime,Crime_index,is_safe
0,7549,0,3,10,12,52,226,11,1,314,4.15,1
1,4883,2,3,8,30,81,201,9,0,334,6.84,0
2,16024,1,6,24,76,201,538,32,4,878,5.48,1
3,5097,0,2,2,10,62,59,8,3,143,2.81,1
4,47324,1,2,13,24,83,501,17,1,641,1.35,1


In [5]:
# Splitting the Dataset into Train and Test Sets
# Creating our features
X = cleaned_df.drop(['is_safe'], axis=1)
X.head()

Unnamed: 0,Population,Murder_ and_nonnegligent_manslaughter,Rape\n,Robbery,Aggravated_assault,Burglary,Larceny_theft,Motor_vehicle_theft,Arson,total_crime,Crime_index
0,7549,0,3,10,12,52,226,11,1,314,4.15
1,4883,2,3,8,30,81,201,9,0,334,6.84
2,16024,1,6,24,76,201,538,32,4,878,5.48
3,5097,0,2,2,10,62,59,8,3,143,2.81
4,47324,1,2,13,24,83,501,17,1,641,1.35


In [6]:
# Creating our target
y = cleaned_df['is_safe']
y.head()

0    1
1    0
2    1
3    1
4    1
Name: is_safe, dtype: int64

In [7]:
X.describe()

Unnamed: 0,Population,Murder_ and_nonnegligent_manslaughter,Rape\n,Robbery,Aggravated_assault,Burglary,Larceny_theft,Motor_vehicle_theft,Arson,total_crime,Crime_index
count,182.0,182.0,182.0,182.0,182.0,182.0,182.0,182.0,182.0,182.0,182.0
mean,18838.131868,1.467033,4.813187,27.005495,64.087912,144.373626,514.67033,36.483516,3.258242,792.967033,4.226538
std,70201.579213,5.874047,18.578138,160.5276,322.840164,531.463873,2119.694983,208.470747,16.986973,3355.770919,2.523421
min,320.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.38
25%,2495.75,0.0,0.0,1.0,3.0,14.0,52.25,2.0,0.0,90.0,2.4
50%,4992.5,0.0,1.0,3.0,12.0,44.0,130.5,6.5,0.0,194.0,3.89
75%,14834.25,1.0,3.0,13.0,34.0,117.0,373.0,19.0,2.75,566.75,5.4375
max,896379.0,67.0,219.0,2121.0,4153.0,6691.0,27280.0,2761.0,220.0,43292.0,15.64


In [8]:
y.value_counts()

1    146
0     36
Name: is_safe, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [10]:
# Instantiate the Logistic Regression Model
classifier = LogisticRegression(solver='lbfgs', random_state=1)
classifier

LogisticRegression(random_state=1)

In [11]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
   intercept_scaling=1, max_iter=100, multi_class='auto', penalty='12',
   random_state=1, solver='lbfgs', warm_start=False)

LogisticRegression(penalty='12', random_state=1)

In [12]:
# Train the model
classifier.fit(X_train, y_train)

LogisticRegression(random_state=1)

In [13]:
# Validate the Logistics Regression Model
predictions = classifier.predict(X_test)
pd.DataFrame({"Prediction": predictions, "Actual": y_test})

Unnamed: 0,Prediction,Actual
85,1,1
150,1,1
101,1,1
31,1,1
159,1,1
161,1,1
163,1,1
27,1,1
69,0,0
11,1,1


In [14]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, predictions)

1.0