# Exploratory Data Analysis for SFPD Web Contents

In [1]:
import pymongo
from pymongo import MongoClient

import pandas as pd
import numpy as np

import re

import plotly.express as px 

#### Connect to MongoDB

In [2]:
# Connect to the MongoDB server on this machine and select the collection
# that the later cells read from.
client = MongoClient(host='localhost', port=27017)
db = client['schoolSpider']
collection = db['outputItems']

In [3]:
# Materialise every document of the collection (empty filter = no restriction)
# into a DataFrame, one row per document, and preview the first rows.
data = pd.DataFrame(list(collection.find({})))
data.head(5)

Unnamed: 0,_id,url,text,school_id,depth,image_urls,file_urls,file_text,images,files
0,61a6873a99ee6c3a16cfb4ac,https://www.sanfranciscopolice.org/,Skip to main content File a Police Report onli...,130123000000.0,1,[https://www.sanfranciscopolice.org/themes/cus...,[],[],[{'url': 'https://www.sanfranciscopolice.org/s...,[]
1,61a6873a99ee6c3a16cfb4ae,https://www.sanfranciscopolice.org/your-sfpd/e...,Skip to main content File a Police Report onli...,130123000000.0,1,[https://www.sanfranciscopolice.org/themes/cus...,[],[],[{'url': 'https://www.sanfranciscopolice.org/s...,[]
2,61a6873a99ee6c3a16cfb4b0,https://www.sanfranciscopolice.org/your-sfpd/e...,Skip to main content File a Police Report onli...,130123000000.0,1,[https://www.sanfranciscopolice.org/themes/cus...,[],[],[{'url': 'https://www.sanfranciscopolice.org/s...,[]
3,61a6873a99ee6c3a16cfb4b2,https://www.sanfranciscopolice.org/your-sfpd/e...,Skip to main content File a Police Report onli...,130123000000.0,1,[https://www.sanfranciscopolice.org/themes/cus...,[],[],[{'url': 'https://www.sanfranciscopolice.org/s...,[]
4,61a6873a99ee6c3a16cfb4b4,https://www.sanfranciscopolice.org/your-sfpd/e...,Skip to main content File a Police Report onli...,130123000000.0,1,[https://www.sanfranciscopolice.org/themes/cus...,[],[],[{'url': 'https://www.sanfranciscopolice.org/s...,[]


#### Separate URL subpaths

In [4]:
# Capture everything after the site root. The scheme ("http"/"https"), the
# "www." prefix and the slash after the host are all optional so that variant
# spellings of the same site are parsed instead of crashing the cell.
_site_path_re = re.compile(r'https?://(?:www\.)?sanfranciscopolice\.org(?:/(.*))?')


def _split_subpaths(url):
    """Return the '/'-separated path segments of ``url`` below the site root.

    The site root itself (with or without a trailing slash) yields ``['']``.
    Values that are not strings, or URLs that do not point at the site, yield
    an empty list -- which becomes a row of missing values in the resulting
    frame -- instead of raising ``TypeError`` / ``IndexError``.
    """
    found = _site_path_re.search(url) if isinstance(url, str) else None
    if found is None:
        return []
    # group(1) is None when the URL stops right after the host name.
    return (found.group(1) or '').split('/')


subpaths = data.url.apply(_split_subpaths)

# One column per path depth (level_0, level_1, ...); shorter paths are padded
# with missing values. The index of `data` is reused so that the later join
# aligns row-for-row even when `data` does not have a default RangeIndex.
subpaths_df = pd.DataFrame(subpaths.to_list(), index=subpaths.index).add_prefix('level_')
subpaths_df

Unnamed: 0,level_0,level_1,level_2,level_3
0,,,,
1,your-sfpd,explore-department,professional-standards,
2,your-sfpd,explore-department,mounted-patrol,
3,your-sfpd,explore-department,k-9-unit,
4,your-sfpd,explore-department,investigations,
...,...,...,...,...
1451,news,park-station-newsletter-february-23-2018,,
1452,news,sfpd-kicks-summer-engagement-education-and-emp...,,
1453,news,sfpd-capture-two-outstanding-escapees-orange-c...,,
1454,news,park-station-newsletter-may-25-2018,,


In [5]:
# Attach the path-level columns to the crawled pages (aligned on the index).
# Any level_* columns left over from a previous run of this cell are dropped
# first: joining them again would raise "columns overlap but no suffix
# specified", so without this the cell could only be executed once per kernel.
data = data.drop(columns=subpaths_df.columns, errors='ignore').join(subpaths_df)
data.head()

Unnamed: 0,_id,url,text,school_id,depth,image_urls,file_urls,file_text,images,files,level_0,level_1,level_2,level_3
0,61a6873a99ee6c3a16cfb4ac,https://www.sanfranciscopolice.org/,Skip to main content File a Police Report onli...,130123000000.0,1,[https://www.sanfranciscopolice.org/themes/cus...,[],[],[{'url': 'https://www.sanfranciscopolice.org/s...,[],,,,
1,61a6873a99ee6c3a16cfb4ae,https://www.sanfranciscopolice.org/your-sfpd/e...,Skip to main content File a Police Report onli...,130123000000.0,1,[https://www.sanfranciscopolice.org/themes/cus...,[],[],[{'url': 'https://www.sanfranciscopolice.org/s...,[],your-sfpd,explore-department,professional-standards,
2,61a6873a99ee6c3a16cfb4b0,https://www.sanfranciscopolice.org/your-sfpd/e...,Skip to main content File a Police Report onli...,130123000000.0,1,[https://www.sanfranciscopolice.org/themes/cus...,[],[],[{'url': 'https://www.sanfranciscopolice.org/s...,[],your-sfpd,explore-department,mounted-patrol,
3,61a6873a99ee6c3a16cfb4b2,https://www.sanfranciscopolice.org/your-sfpd/e...,Skip to main content File a Police Report onli...,130123000000.0,1,[https://www.sanfranciscopolice.org/themes/cus...,[],[],[{'url': 'https://www.sanfranciscopolice.org/s...,[],your-sfpd,explore-department,k-9-unit,
4,61a6873a99ee6c3a16cfb4b4,https://www.sanfranciscopolice.org/your-sfpd/e...,Skip to main content File a Police Report onli...,130123000000.0,1,[https://www.sanfranciscopolice.org/themes/cus...,[],[],[{'url': 'https://www.sanfranciscopolice.org/s...,[],your-sfpd,explore-department,investigations,


In [16]:
# Show the extracted text of the files attached to each page as a plain list
# (one entry per page). NOTE(review): this prints the whole column, and some
# entries hold the full text of a document -- consider previewing a slice.
data['file_text'].tolist()

[[],
 [],
 [],
 [],
 [],
 ['', '', '', '', '', '', '', '', '', '', '', ''],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 ['', '', ''],
 [],
 [],
 ['', ''],
 ['',
  'Bayview Station Newsletter \n\n 1 \n\n      Inside This Issue \nCaptain’s Message \n\n1 \n\nCommunity Meeting   2 \n\nOfficer of  the Month  3 \n\nDefinition of            \nSuspicious activity  \n\nDefinition of          \nCommunity Policing \n\nCrime Definition \n\nCommunity             \nEngagement  \n\nWeekly Recap \n\nSafety Tips - Crime \n\n4 \n\n5 \n\n6 \n\n7 \n\n8-9 \n\n10-19 \n\nSafety Tips - SFMTA  20-21 \n\nCrime Data Maps \n\nIncidents of Interest \n\nMonthly Comparison \nof Crime  \n\n22-24 \n\n25-28 \n\n29-30 \n\nSummaries of         \nSupreme Courts Case \nLaws \n\n31 \n\nDepartment News   \nRelease  \n\nIn the Community \n\nADs  w/ Police      \nPartners \n\n32-35 \n\n36-51 \n\n52-55 \n\nCommunity Feedback  56-57 \n\nCPAB Info \n\nVision and Values  \n\n \n\nContacts \n\n58 \n\n59-61 \n\n62 \n\n  Captain’s Message  \

## Visualize Website Hierarchy

In [6]:
# View the path levels as a hierarchical index: the first three levels become
# the index, leaving level_3 as the only remaining column. The result is only
# displayed; subpaths_df itself is not modified.
subpaths_df.set_index(keys=['level_0', 'level_1', 'level_2'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,level_3
level_0,level_1,level_2,Unnamed: 3_level_1
,,,
your-sfpd,explore-department,professional-standards,
your-sfpd,explore-department,mounted-patrol,
your-sfpd,explore-department,k-9-unit,
your-sfpd,explore-department,investigations,
...,...,...,...
news,park-station-newsletter-february-23-2018,,
news,sfpd-kicks-summer-engagement-education-and-employment-san,,
news,sfpd-capture-two-outstanding-escapees-orange-county-jail,,
news,park-station-newsletter-may-25-2018,,


In [7]:
# Treemap of every crawled page, nested by the first three path levels and
# coloured by the top-level section. Missing levels are replaced with "."
# before plotting.
fig1 = px.treemap(
    data_frame=subpaths_df.fillna(value="."),
    path=['level_0', 'level_1', 'level_2'],
    color='level_0',
)
fig1.show(renderer="browser")

#### Looking at pages with subpages

In [8]:
# Keep only the pages whose URL has a second path level (level_1 present).
subpaths_df_child = subpaths_df.loc[subpaths_df['level_1'].notna()]

In [9]:
# Same treemap as above, restricted to pages that have a second path level.
fig2 = px.treemap(
    data_frame=subpaths_df_child.fillna(value='.'),
    path=['level_0', 'level_1', 'level_2'],
    color='level_0',
)
fig2.show(renderer="browser")

#### Ignore News Pages

In [10]:
# Pages that have a second path level and do not sit under the "news" section.
subpaths_df_child_no_news = subpaths_df.loc[
    subpaths_df['level_1'].notna() & subpaths_df['level_0'].ne('news')
]

In [18]:
# Treemap of the site hierarchy with the "news" section left out.
fig3 = px.treemap(
    data_frame=subpaths_df_child_no_news.fillna(value='.'),
    path=['level_0', 'level_1', 'level_2'],
    color='level_0',
)
fig3.show(renderer="browser")