In [1]:
import pandas as pd

import plotly.express as px

from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import mean_absolute_error,classification_report

from sklearn.linear_model import LinearRegression

from sklearn.tree import DecisionTreeClassifier

In [2]:
crimedf = pd.read_csv("C:\\Users\\user\\Desktop\\aiml\\Daily-Practice\\Day 9 - Practice Data Analysis\\crime_district.csv")
crimedf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19152 entries, 0 to 19151
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   state     19152 non-null  object
 1   district  19152 non-null  object
 2   category  19152 non-null  object
 3   type      19152 non-null  object
 4   date      19152 non-null  object
 5   crimes    19152 non-null  int64 
dtypes: int64(1), object(5)
memory usage: 897.9+ KB


In [3]:
#change data type of date from string
crimedf['date'] = pd.to_datetime(crimedf['date'])
#extract the year
crimedf['date'] = crimedf['date'].dt.year
crimedf['date'] = crimedf['date'].astype(int)

In [4]:
newcrimedf = crimedf.copy()

# to drop the first row from dataset which are the results of everything
drop_malaysia = newcrimedf[newcrimedf['state'] == 'Malaysia'].index
newcrimedf = newcrimedf.drop(drop_malaysia)

drop_state_totals = newcrimedf[newcrimedf['district'] == 'All'].index
newcrimedf = newcrimedf.drop(drop_state_totals)

drop_district_totals = newcrimedf[newcrimedf['type'] == 'all'].index
newcrimedf = newcrimedf.drop(drop_district_totals)

newcrimedf = newcrimedf.reset_index(drop=True)
# newcrimedf.to_csv('cleancrime.csv')

In [5]:
print(newcrimedf.shape)
newcrimedf.head()

(14976, 6)


Unnamed: 0,state,district,category,type,date,crimes
0,Johor,Batu Pahat,assault,causing_injury,2016,39
1,Johor,Batu Pahat,assault,causing_injury,2017,41
2,Johor,Batu Pahat,assault,causing_injury,2018,28
3,Johor,Batu Pahat,assault,causing_injury,2019,41
4,Johor,Batu Pahat,assault,causing_injury,2020,43


# Visual 2: Showing trends of crimes over the years

In [6]:
# totalling crimes by date and category                                 # reset index is jut so it starts with 0
crime_trend = (newcrimedf.groupby(['date', 'category'])['crimes'].sum().reset_index())


In [7]:
fig = px.line(
    crime_trend,
    x='date',
    y='crimes',
    color='category',
    markers=True, # for adding points
    hover_name='crimes', # whats bolded during hover 
    title='Crime Trend Over Year by Category'
    )

fig.update_layout(
    title_font_color= 'red',
    title_font_weight= 1000,
    xaxis_title='Year',
    yaxis_title='Number of Crimes',
    template='plotly_white')

fig.show()


# Visual 3: Bar chart of crimes by category

In [8]:
bargraph = px.bar(
    crime_trend, 
    x='date', 
    y='crimes', 
    color='category',
    barmode='group',
    title='Crime Counts in Malaysia (2016–2023): Assault vs Property')

bargraph.update_layout(
    title_font_color= 'red',
    title_font_weight= 1000,
    xaxis_title='Year',
    yaxis_title='Number of Crimes',
    template='plotly_white')
bargraph.show()

# Visual: top 5 district with highest crimes

In [9]:
top5_district_crime = (
    newcrimedf
    .groupby('district')['crimes']
    .sum()
    .reset_index()
    .sort_values('crimes',ascending=False)
    .head(5)
)

top5_district_crime

Unnamed: 0,district,crimes
108,Petaling Jaya,23078
24,Dang Wangi,19980
42,Kajang,19250
27,Gombak,19058
125,Sentul,16456


In [10]:
fig = px.bar(
    top5_district_crime,
    x='crimes',
    y='district',
    orientation='h',
    title='Top 5 Districts with Highest Crime Counts',
    text='crimes',
    pattern_shape= 'district',
    color='district'
    
)

fig.update_layout(
    xaxis_title='Total Crimes',
    yaxis_title='District',
    template='plotly_white'
)

fig.show()


# machine learning part!

# using linear regression

In [11]:
X = newcrimedf[['date','category', 'type']]
X_encoded = pd.get_dummies(X, drop_first=True)

y = newcrimedf['crimes']


# to separate train and test before certain years, cause its time sensitive and i want
# it to learn over the time!
X_train = X_encoded[X_encoded['date']<=2020] 
X_test = X_encoded[X_encoded['date']>2020] 
y_train= y[X_encoded['date']<=2020]
y_test = y[X_encoded['date']>2020]

model_lr = LinearRegression()
model_lr.fit(X_train,y_train)

y_pred = model_lr.predict(X_test)
y_pred

mae = mean_absolute_error(y_test,y_pred)
print('MAE:',mae)

MAE: 38.458645589828066


In [12]:
coefficients = pd.Series(
    model_lr.coef_,
    index=X_encoded.columns
).sort_values()

coefficients


type_theft_vehicle_lorry        -92.124359
type_theft_vehicle_motorcar     -55.543590
type_robbery_solo_armed         -27.442788
type_robbery_gang_armed         -27.303045
type_murder                     -25.294071
type_rape                       -16.360737
date                             -5.832158
type_robbery_solo_unarmed        -2.590224
type_causing_injury               3.326442
type_robbery_gang_unarmed        19.011058
type_theft_other                 22.376923
type_theft_vehicle_motorcycle    64.653846
category_property                76.653365
dtype: float64

In [13]:
coef_df = (
    pd.Series(model_lr.coef_, index=X_encoded.columns)
    .sort_values()
    .reset_index()
)

coef_df.columns = ['feature', 'coefficient']
coef_df


Unnamed: 0,feature,coefficient
0,type_theft_vehicle_lorry,-92.124359
1,type_theft_vehicle_motorcar,-55.54359
2,type_robbery_solo_armed,-27.442788
3,type_robbery_gang_armed,-27.303045
4,type_murder,-25.294071
5,type_rape,-16.360737
6,date,-5.832158
7,type_robbery_solo_unarmed,-2.590224
8,type_causing_injury,3.326442
9,type_robbery_gang_unarmed,19.011058


In [14]:
import plotly.express as px

top_coef = pd.concat([
    coef_df.head(10),      # most negative
    coef_df.tail(10)       # most positive
])

fig = px.bar(
    top_coef,
    x='coefficient',
    y='feature',
    orientation='h',
    title='Top Features Influencing Crime Counts (Linear Regression)'
)

fig.update_layout(
    template='plotly_white',
    xaxis_title='Coefficient Value',
    yaxis_title='Feature'
)

fig.show()


In [15]:
# encoding the state using map

state_codes = {
    'Johor': 0, 
    'Kedah': 1, 
    'Kelantan': 2, 
    'Melaka': 3, 
    'Negeri Sembilan':4,
    'Pahang': 5, 
    'Perak':6, 
    'Perlis':7, 
    'Pulau Pinang':8, 
    'Sabah':9, 
    'Sarawak':10,
    'Selangor':11, 
    'Terengganu':12, 
    'W.P. Kuala Lumpur':13   
}

newcrimedf['state_encoded'] = newcrimedf['state'].map(state_codes)

#encoding district using factorize

district_factorized  = pd.factorize(newcrimedf['district'])
newcrimedf['district_encoded'] = district_factorized[0]

# using sklearn labelencoder method for type and category
type_encoder = LabelEncoder()
category_encoder = LabelEncoder()

newcrimedf['type_sklearn_le'] = type_encoder.fit_transform(newcrimedf['type'])
newcrimedf['category_sklearn_le'] = category_encoder.fit_transform(newcrimedf['category'])
newcrimedf

Unnamed: 0,state,district,category,type,date,crimes,state_encoded,district_encoded,type_sklearn_le,category_sklearn_le
0,Johor,Batu Pahat,assault,causing_injury,2016,39,0,0,1,0
1,Johor,Batu Pahat,assault,causing_injury,2017,41,0,0,1,0
2,Johor,Batu Pahat,assault,causing_injury,2018,28,0,0,1,0
3,Johor,Batu Pahat,assault,causing_injury,2019,41,0,0,1,0
4,Johor,Batu Pahat,assault,causing_injury,2020,43,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
14971,W.P. Kuala Lumpur,Wangsa Maju,property,theft_vehicle_motorcycle,2019,755,13,158,11,1
14972,W.P. Kuala Lumpur,Wangsa Maju,property,theft_vehicle_motorcycle,2020,646,13,158,11,1
14973,W.P. Kuala Lumpur,Wangsa Maju,property,theft_vehicle_motorcycle,2021,403,13,158,11,1
14974,W.P. Kuala Lumpur,Wangsa Maju,property,theft_vehicle_motorcycle,2022,369,13,158,11,1


In [19]:
X2 = newcrimedf[
    ['date']        
]

y2 = newcrimedf['crimes']


X2_train, X2_test, y2_train, y2_test= X2[X2['date']<=2020], X2[X2['date']>2020], y2[X2['date']<=2020], y2[X2['date']>2020]

model_dt = DecisionTreeClassifier(max_depth=8, random_state=42)
model_dt.fit(X2_train,y2_train)

y2_pred_dt = model_dt.predict(X2_test)

cr_dt = classification_report(y2_test,y2_pred_dt)
print(cr_dt)

              precision    recall  f1-score   support

           0       0.28      1.00      0.44      1590
           1       0.00      0.00      0.00       471
           2       0.00      0.00      0.00       315
           3       0.00      0.00      0.00       236
           4       0.00      0.00      0.00       196
           5       0.00      0.00      0.00       130
           6       0.00      0.00      0.00       125
           7       0.00      0.00      0.00       130
           8       0.00      0.00      0.00       118
           9       0.00      0.00      0.00        99
          10       0.00      0.00      0.00        85
          11       0.00      0.00      0.00        73
          12       0.00      0.00      0.00        87
          13       0.00      0.00      0.00        57
          14       0.00      0.00      0.00        70
          15       0.00      0.00      0.00        71
          16       0.00      0.00      0.00        50
          17       0.00    


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.

