In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_validate

In [2]:
data = pd.read_excel('Suicide Prevention Data.xlsx')

In [3]:
data.shape

(15093, 26)

In [4]:
data_2 = data.dropna(subset=['SuicideReferralFlagName'])

In [5]:
data_2.shape

(11441, 26)

In [6]:
nan_summary = (data_2.isna().sum()*100/len(data_2))
print(nan_summary)

CompletionStatus                     0.000000
SuicideReferralFlagName              0.000000
LevelOfCareD8Calculated_PHN         15.260904
LevelOfCareAssessed_Prac            85.630627
GenderName                           2.106459
CountryOfBirthName                   3.338869
ProficiencySpokenEnglishName         2.176383
ATSIStatusName                      15.750371
LabourForceName                     11.117909
HomelessnessStatusName               2.989249
MaritalStatusName                   10.462372
PrincipalFocusName                   0.043702
PrincipalDiagnosisName               2.167643
AdditionalDiagnosisName              1.931649
LGBTIStatus                         52.093348
SexualityName                       99.790228
ContinuityOfCareClient               3.994406
PerinatalFlagName                   83.288174
OutOfHomeCareName                   90.708854
FamilyDomesticViolenceFlagName      84.546805
ComorbidAlcoholDrugConditionName    81.435189
Service Contact Count Range       

In [7]:
#Convert object datatype (string) to categorical in numerical format

def convertCategoricalData(dataframe):
    for column in dataframe.columns:
        if dataframe[column].dtype == "object":
            dataframe[column] = dataframe[column].astype("category").cat.codes
    return dataframe

In [8]:
data_3 = convertCategoricalData(data_2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[column] = dataframe[column].astype("category").cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[column] = dataframe[column].astype("category").cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[column] = dataframe[column].astype("category").cat.codes
A va

In [9]:
features = data_3.drop("SuicideReferralFlagName", axis = 1)
classLabels = data_3["SuicideReferralFlagName"]

In [10]:
#Getting random sample from the dataset? shuffle data in any order
#random_state = 0 to make sure the shuffled data is the same the next time the code is ran -> reproducable
features = features.sample(frac = 1, random_state=0)
classLabels = classLabels.sample(frac = 1, random_state=0)

In [11]:
#verify the indices
assert all(features.index == classLabels.index)
x=round(0.8*len(data_3)) #Make a variable that is 80% of dataset
trainFeatures, trainClassLabels = features.iloc[:x], classLabels.iloc[:x]
testFeatures, testClassLabels = features.iloc[x:len(data)], classLabels.iloc[x:len(data)]

In [12]:
classLabels.head()

3790     0
10266    0
5119     0
3588     0
5340     0
Name: SuicideReferralFlagName, dtype: int8

In [13]:
print(trainFeatures.shape)
print(testFeatures.shape)

(9153, 25)
(2288, 25)


In [14]:
treeLearner = DecisionTreeClassifier(random_state=0)

#Train the model
classifier = treeLearner.fit(trainFeatures, trainClassLabels)

#Use the model to predict on test set
predictions = classifier.predict(testFeatures)

In [15]:
predictions

array([0, 1, 1, ..., 0, 0, 0], dtype=int8)

In [16]:
from collections import Counter
# Use Counter to count occurrences of each category
predictions_counts = Counter(predictions)

# Print the summary
for category, count in predictions_counts.items():
    print(f"{category}: {count}")

0: 1974
1: 314


# Accuracy

In [17]:
def computeAccuracy (target, predicted):
    accuracy = (predictions == target).sum()/len(target)
    return accuracy

In [18]:
accuracy = computeAccuracy (testClassLabels, predictions)
print("Accuracy of the model= ", accuracy)

Accuracy of the model=  0.8557692307692307


In [19]:
from sklearn.tree import export_graphviz
import graphviz

#Convert class labels to a list of strings
class_labels = trainClassLabels.astype(str).unique().tolist()

#Export the decision tree in DOT format
dot_data = export_graphviz (treeLearner, out_file=None,
                           feature_names=trainFeatures.columns,
                            class_names=class_labels,
                            filled=True, rounded=True,
                            special_characters=True)

graph= graphviz.Source(dot_data)
graph.render("decision_tree") # Save the tree as a PDF file
graph.view() # Display the tree in a GUI window

'decision_tree.pdf'

# Evaluate with cross-validation

In [20]:
from sklearn.model_selection import cross_validate, cross_val_score, KFold

In [21]:
evalResults = cross_validate(treeLearner,X=features,y=classLabels,cv=10,scoring=["accuracy"])

In [22]:
print(evalResults)

{'fit_time': array([0.07654047, 0.08467817, 0.0827949 , 0.07182217, 0.07162046,
       0.07339191, 0.07299924, 0.06907439, 0.0665853 , 0.06742048]), 'score_time': array([0.00392175, 0.00411963, 0.00863743, 0.00352287, 0.00273323,
       0.00595188, 0.00311017, 0.00349307, 0.00373769, 0.0030148 ]), 'test_accuracy': array([0.88034934, 0.86188811, 0.84178322, 0.86800699, 0.85926573,
       0.86451049, 0.85664336, 0.86800699, 0.86625874, 0.84440559])}


In [23]:
cv = KFold(n_splits=10)
# cross_validate also allows to specify metrics which you want to see
for i, score in enumerate(cross_validate(treeLearner, X=features,y=classLabels, cv=cv)["test_score"]):
    print(f"Accuracy for the fold no. {i} on the test set: {score}")

Accuracy for the fold no. 0 on the test set: 0.8707423580786027
Accuracy for the fold no. 1 on the test set: 0.8784965034965035
Accuracy for the fold no. 2 on the test set: 0.847027972027972
Accuracy for the fold no. 3 on the test set: 0.8758741258741258
Accuracy for the fold no. 4 on the test set: 0.8548951048951049
Accuracy for the fold no. 5 on the test set: 0.8697552447552448
Accuracy for the fold no. 6 on the test set: 0.8636363636363636
Accuracy for the fold no. 7 on the test set: 0.8557692307692307
Accuracy for the fold no. 8 on the test set: 0.8618881118881119
Accuracy for the fold no. 9 on the test set: 0.8452797202797203


# Use Prediction Model on missing suicidal flags data

In [24]:
data_na_ori = data.loc[data['SuicideReferralFlagName'].isna()]

In [25]:
data_na_ori.shape

(3652, 26)

In [26]:
nan_summary_na = (data_na_ori.isna().sum()*100/len(data_na_ori))
print(nan_summary_na)

CompletionStatus                     23.986857
SuicideReferralFlagName             100.000000
LevelOfCareD8Calculated_PHN          23.986857
LevelOfCareAssessed_Prac             82.694414
GenderName                            2.026287
CountryOfBirthName                    3.751369
ProficiencySpokenEnglishName          1.506024
ATSIStatusName                       14.841183
LabourForceName                      93.838992
HomelessnessStatusName               40.279299
MaritalStatusName                    93.373494
PrincipalFocusName                   90.005476
PrincipalDiagnosisName               54.819277
AdditionalDiagnosisName              69.660460
LGBTIStatus                          54.846659
SexualityName                        99.863089
ContinuityOfCareClient               94.058050
PerinatalFlagName                    97.727273
OutOfHomeCareName                    99.041621
FamilyDomesticViolenceFlagName       98.795181
ComorbidAlcoholDrugConditionName     97.864184
Service Conta

In [27]:
data_na = convertCategoricalData(data_na_ori)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[column] = dataframe[column].astype("category").cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[column] = dataframe[column].astype("category").cat.codes
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe[column] = dataframe[column].astype("category").cat.codes
A va

In [28]:
features_na = data_na.drop("SuicideReferralFlagName", axis = 1)
classLabels_na = data_na["SuicideReferralFlagName"]

In [29]:
#verify the indices
assert all(features_na.index == classLabels_na.index)
Features_topredict, ClassLabels_topredict = features_na.iloc[:len(data_na)], classLabels_na.iloc[:len(data_na)]

In [30]:
#Use the model to predict on test set
predictions_na = classifier.predict(Features_topredict)

In [31]:
print(predictions_na)

[1 1 1 ... 1 1 1]


In [32]:
data_na_ori = data.loc[data['SuicideReferralFlagName'].isna()]

# Add a new column named 'Predictions' to the data_na DataFrame
data_na_ori['Predictions'] = predictions_na

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_na_ori['Predictions'] = predictions_na


In [33]:
data_na_ori.to_excel(r'C:\Users\MiloJoronen\OneDrive - CESPHN\Documents\Testing\LaTrobe\suicide_data_prevention_predicted.xlsx', index=False)