<a href="https://colab.research.google.com/github/lukasg1/dataexploration/blob/main/data_exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Data Exploration Project
Dataset: "Bullying in Schools", Author: Lukas Großerhode

### 1. Load the data and libraries

In [1]:
!wget "https://github.com/lukasg1/dataexploration/raw/main/21nn-model-manhattan.zip" --quiet
!unzip 21nn-model-manhattan.zip

Archive:  21nn-model-manhattan.zip
   creating: 354d6d59f4b04d43a24c16059a6569d4/
   creating: 354d6d59f4b04d43a24c16059a6569d4/metrics/
   creating: 354d6d59f4b04d43a24c16059a6569d4/artifacts/
  inflating: 354d6d59f4b04d43a24c16059a6569d4/.DS_Store  
  inflating: __MACOSX/354d6d59f4b04d43a24c16059a6569d4/._.DS_Store  
   creating: 354d6d59f4b04d43a24c16059a6569d4/tags/
   creating: 354d6d59f4b04d43a24c16059a6569d4/params/
  inflating: 354d6d59f4b04d43a24c16059a6569d4/meta.yaml  
  inflating: 354d6d59f4b04d43a24c16059a6569d4/metrics/false_negative  
  inflating: 354d6d59f4b04d43a24c16059a6569d4/metrics/false_positive  
  inflating: 354d6d59f4b04d43a24c16059a6569d4/metrics/accuracy  
  inflating: 354d6d59f4b04d43a24c16059a6569d4/metrics/true_negative  
  inflating: 354d6d59f4b04d43a24c16059a6569d4/metrics/true_positive  
   creating: 354d6d59f4b04d43a24c16059a6569d4/artifacts/knn_model/
  inflating: 354d6d59f4b04d43a24c16059a6569d4/tags/mlflow.user  
  inflating: 354d6d59f4b04d43a24c160

In [49]:
!pip install pandas --quiet
!pip install mlflow --quiet
!pip install plotly --quiet
!pip install scipy==1.10.0 --quiet
!pip install scikit-learn==1.2.1 --quiet

In [50]:
import pandas as pd
import mlflow
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from sklearn.metrics import accuracy_score, confusion_matrix

In [8]:
# Read the cleaned 2018 bullying data set straight from the project repository
# and preview its first ten rows.
df_cleaned = pd.read_csv(
    filepath_or_buffer="https://raw.githubusercontent.com/lukasg1/dataexploration/main/data/Bullying_2018_cleaned.csv"
)
df_cleaned.head(n=10)

Unnamed: 0,Custom_Age,Sex,Physically_attacked,Physical_fighting,Felt_lonely,Close_friends,Miss_school_no_permission,Other_students_kind_and_helpful,Parents_understand_problems,Most_of_the_time_or_always_felt_lonely,Missed_classes_or_school_without_permission,Bullied_at_least_once
0,13 years old,Female,0 times,0 times,Always,2,10 or more days,Never,Always,Yes,Yes,1
1,13 years old,Female,0 times,0 times,Never,3 or more,0 days,Sometimes,Always,No,No,0
2,14 years old,Male,0 times,0 times,Never,3 or more,0 days,Sometimes,Always,No,No,0
3,13 years old,Female,0 times,0 times,Rarely,3 or more,0 days,Most of the time,Most of the time,No,No,0
4,13 years old,Male,0 times,1 time,Never,3 or more,0 days,Most of the time,Always,No,No,0
5,14 years old,Female,1 time,0 times,Sometimes,3 or more,0 days,Most of the time,Always,No,No,0
6,12 years old,Female,0 times,0 times,Rarely,3 or more,0 days,Most of the time,Never,No,No,0
7,13 years old,Male,1 time,2 or 3 times,Never,3 or more,6 to 9 days,Most of the time,Most of the time,No,Yes,0
8,14 years old,Female,0 times,0 times,Always,0,0 days,Sometimes,Never,Yes,No,1
9,15 years old,Male,0 times,1 time,Never,3 or more,3 to 5 days,Most of the time,Always,No,Yes,0


In [5]:
# Summarise the frame: number of rows, column names, non-null counts and dtypes.
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51910 entries, 0 to 51909
Data columns (total 12 columns):
 #   Column                                       Non-Null Count  Dtype 
---  ------                                       --------------  ----- 
 0   Custom_Age                                   51910 non-null  object
 1   Sex                                          51910 non-null  object
 2   Physically_attacked                          51910 non-null  object
 3   Physical_fighting                            51910 non-null  object
 4   Felt_lonely                                  51910 non-null  object
 5   Close_friends                                51910 non-null  object
 6   Miss_school_no_permission                    51910 non-null  object
 7   Other_students_kind_and_helpful              51910 non-null  object
 8   Parents_understand_problems                  51910 non-null  object
 9   Most_of_the_time_or_always_felt_lonely       51910 non-null  object
 10  Missed_cla

### 2. Data Visualization

In [6]:
# Pie chart: share of students who were bullied at least once.
# value_counts() orders its result by frequency, not by label, so the counts are
# re-indexed explicitly (1 -> 'Yes', 0 -> 'No') to keep every slice matched to
# its label and colour regardless of which group is larger.
labels = ['Yes', 'No']
values = df_cleaned['Bullied_at_least_once'].value_counts().reindex([1, 0], fill_value=0)
colors = ['red', '#2E8B57']  # hex colours are written with a leading '#'

fig = go.Figure(data=[go.Pie(labels=labels, values=values, marker=dict(colors=colors))])
fig.update_layout(title='Bullied at least once')
fig.show()

In [7]:
# Pie chart: number of 'Male' vs. 'Female' rows in the data set.
# NOTE(review): every row is counted here, not only rows with
# Bullied_at_least_once == 1 - confirm that this is the intended chart.
sex_column = df_cleaned['Sex']
male_count = int((sex_column == 'Male').sum())
female_count = int((sex_column == 'Female').sum())

labels = ['Male', 'Female']
values = [male_count, female_count]
colors = ['blue', 'red']

sex_pie = go.Pie(labels=labels, values=values, marker={'colors': colors})
fig = go.Figure(data=[sex_pie])
fig.show()

In [20]:
# Line chart: number of bullying victims per age group.
# Only rows with Bullied_at_least_once == 1 are counted; the age groups are
# ordered by their label so the x-axis runs in label order.
is_bullied = df_cleaned['Bullied_at_least_once'] == 1
age_counts = df_cleaned.loc[is_bullied, 'Custom_Age'].value_counts().sort_index()

trace1 = go.Scatter(x=age_counts.index, y=age_counts.values, name='Age')
layout = go.Layout(
    title='Distribution of age and number of bullying victims',
    xaxis_title='Age',
    yaxis_title='Number',
)

fig = go.Figure(data=[trace1], layout=layout)
fig.show()

In [21]:
# Line chart: number of bullying victims per reported number of close friends.
# Rows are restricted to Bullied_at_least_once == 1 before counting.
bullied_rows = df_cleaned['Bullied_at_least_once'] == 1
friends_counts = (
    df_cleaned.loc[bullied_rows, 'Close_friends']
    .value_counts()
    .sort_index()
)

trace1 = go.Scatter(x=friends_counts.index, y=friends_counts.values, name='Friends')
layout = go.Layout(
    title='Distribution of close friends and number of bullying victims',
    xaxis_title='Number of close friends',
    yaxis_title='Number of bullying victims',
)

fig = go.Figure(data=[trace1], layout=layout)
fig.show()

In [26]:
# Bar chart: how often bullying victims say their parents understand their problems.
# Only rows with Bullied_at_least_once == 1 are counted; bars are ordered from the
# most to the least frequent answer.
understand_counts = df_cleaned.loc[df_cleaned['Bullied_at_least_once'] == 1, 'Parents_understand_problems'].value_counts().sort_index()
understand_df = pd.DataFrame({'parents_understand_problems': understand_counts.index,
                              'number_of_bullying_victims': understand_counts.values})
understand_df = understand_df.sort_values('number_of_bullying_victims', ascending=False)

# Trace name and title describe this chart (they previously still said "age",
# a leftover from the age chart).
trace1 = go.Bar(x=understand_df['parents_understand_problems'],
                y=understand_df['number_of_bullying_victims'],
                name='Bullying victims')
layout = go.Layout(title='Parents understanding problems and number of bullying victims',
                   xaxis_title='How often do your parents understand your personal problems?',
                   yaxis_title='Number of bullying victims')

fig = go.Figure(data=[trace1], layout=layout)
fig.show()

### 3. Evaluation of the model on the test set

In [7]:
# Read the held-out test data and load the stored 21-NN model.
df_testing = pd.read_csv('https://raw.githubusercontent.com/lukasg1/dataexploration/main/data/Bullying_2018_test_data.csv')

# The model directory is given relative to the working directory, which is where
# the unzip cell extracts the archive. This resolves to the same folder on Colab
# (working directory /content) and also works when the notebook runs elsewhere.
path = '354d6d59f4b04d43a24c16059a6569d4/artifacts/knn_model'
loaded_model = mlflow.pyfunc.load_model(path)

 - numpy (current: 1.22.4, required: numpy==1.23.5)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


In [21]:
# Separate the label column from the remaining (feature) columns of the test frame
y_test = df_testing['Bullied_at_least_once']
X_test = df_testing.drop(columns=['Bullied_at_least_once'])

# Let the loaded model predict a label for every test row
y_pred = loaded_model.predict(X_test)

# Print actual and predicted labels side by side
results = pd.DataFrame(dict(Actual=y_test, Predicted=y_pred))
print(results)

      Actual  Predicted
0        1.0        0.0
1        1.0        0.0
2        1.0        1.0
3        0.0        0.0
4        1.0        0.0
...      ...        ...
5186     0.0        0.0
5187     0.0        0.0
5188     0.0        1.0
5189     1.0        1.0
5190     0.0        0.0

[5191 rows x 2 columns]


In [22]:
# Accuracy of the predictions measured against the actual test labels
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(accuracy)

0.6420728183394336


In [62]:
# Confusion matrix (absolute counts), shown as an annotated heatmap.
# scikit-learn returns the matrix with rows = actual class and columns = predicted
# class (class 0 first): [[TN, FP], [FN, TP]].
matrix = confusion_matrix(y_test, y_pred)
print(matrix)

# The heatmap puts reality on the x-axis (columns) and the prediction on the
# y-axis (rows), so the matrix has to be transposed first; without the transpose
# false positives and false negatives end up in each other's cells.
# The row reorder then lists 'pred_bullied' before 'pred_not_bullied':
#   [[FP, TP],
#    [TN, FN]]
matrix = matrix.T[[1, 0], :]

class_names_real = ['actually_not_bullied', 'actually_bullied']
class_names_pred = ['pred_bullied', 'pred_not_bullied']

fig = ff.create_annotated_heatmap(
    z=matrix,
    x=class_names_real,
    y=class_names_pred,
    colorscale='Blues',
    showscale=True,
)

fig.update_layout(
    title='Confusion Matrix',
    xaxis_title='Reality',
    yaxis_title='Prediction',
    width=500,
    height=500,
)

fig.show()

[[2587  509]
 [1349  746]]


In [68]:
# Confusion matrix as percentages of all test rows, shown as an annotated heatmap.
# scikit-learn returns the matrix with rows = actual class and columns = predicted
# class (class 0 first): [[TN, FP], [FN, TP]].
matrix = confusion_matrix(y_test, y_pred)

total = matrix.sum()
percent_matrix = matrix / total * 100
print(percent_matrix)

# The heatmap puts reality on the x-axis (columns) and the prediction on the
# y-axis (rows), so the matrix has to be transposed first; without the transpose
# false positives and false negatives end up in each other's cells.
# The row reorder then lists 'pred_bullied' before 'pred_not_bullied'.
matrix = matrix.T[[1, 0], :]
percentages = matrix / total * 100

# class_names_real / class_names_pred are the axis labels defined in the
# confusion-matrix (values) cell.
fig = ff.create_annotated_heatmap(
    z=percentages,
    x=class_names_real,
    y=class_names_pred,
    colorscale='Blues',
    showscale=True,
    zmin=0,
    zmax=100,
    annotation_text=percentages.round(1),
    text=percentages.round(1),
    hoverinfo='text'
)

fig.update_layout(
    title='Confusion Matrix',
    xaxis_title='Reality',
    yaxis_title='Prediction',
    width=500,
    height=500,
)

fig.show()


[[49.83625506  9.80543248]
 [25.98728569 14.37102678]]
