# Inspecting a presentation with python-pptx

In [13]:
# Import libraries
from pptx import Presentation
import pandas as pd
import pptx
from pptx.util import Inches, Pt
from pptx.dml.color import RGBColor
# from pptx.enum.shapes import MSO_SHAPE
from pptx.enum.shapes import MSO_SHAPE_TYPE
# from pptx.enum.shapes import MSO_CONNECTOR
from pptx.enum.shapes import MSO_CONNECTOR_TYPE

# import placeholders
# from pptx.enum.shapes import PP_PLACEHOLDER
from pptx.enum.shapes import PP_PLACEHOLDER_TYPE

In [14]:
# Location of the deck under inspection, relative to the notebook's working directory
path = '../pres/Classification_Challenge.pptx'
# Parse the .pptx package into a python-pptx Presentation object
prs = Presentation(pptx=path)

In [15]:
# Count the slides in the loaded presentation and report the total
number_of_slides: int = len(prs.slides)
print('Number of slides: ', number_of_slides)

Number of slides:  12


In [16]:
# Count the slide layouts exposed by the presentation and report the total
number_of_layouts: int = len(prs.slide_layouts)
print('Number of layouts: ', number_of_layouts)

Number of layouts:  25


In [19]:
# List every slide together with its placeholders.
# enumerate() supplies the slide position directly, which avoids calling
# prs.slides.index(slide) twice per slide (each call is a linear search).
for slide_index, slide in enumerate(prs.slides):
    print(f"----- Slide {slide_index} -----")
    print(type(slide), slide.name, 'slide_index [', slide_index, ']')
    for placeholder in slide.placeholders:
        # placeholder_format carries the idx and type values printed below
        placeholder_format = placeholder.placeholder_format
        print('index [', placeholder_format.idx, ']', placeholder_format.type, placeholder.name)

----- Slide 0 -----
<class 'pptx.slide.Slide'>  slide_index [ 0 ]
index [ 1 ] SUBTITLE (4) Subtitle 3
index [ 22 ] PICTURE (18) Picture Placeholder 7
----- Slide 1 -----
<class 'pptx.slide.Slide'>  slide_index [ 1 ]
index [ 0 ] TITLE (1) Title 2
index [ 17 ] BODY (2) Text Placeholder 1
index [ 21 ] BODY (2) Text Placeholder 3
----- Slide 2 -----
<class 'pptx.slide.Slide'>  slide_index [ 2 ]
index [ 0 ] TITLE (1) Title 2
index [ 17 ] BODY (2) Text Placeholder 1
index [ 21 ] BODY (2) Text Placeholder 3
----- Slide 3 -----
<class 'pptx.slide.Slide'>  slide_index [ 3 ]
index [ 0 ] TITLE (1) Title 2
index [ 17 ] BODY (2) Text Placeholder 1
index [ 21 ] BODY (2) Text Placeholder 3
----- Slide 4 -----
<class 'pptx.slide.Slide'>  slide_index [ 4 ]
index [ 0 ] TITLE (1) Title 2
index [ 17 ] BODY (2) Text Placeholder 1
index [ 21 ] BODY (2) Text Placeholder 3
----- Slide 5 -----
<class 'pptx.slide.Slide'>  slide_index [ 5 ]
index [ 0 ] TITLE (1) Title 2
index [ 17 ] BODY (2) Text Placeholder 1
i

In [31]:
# Collect one title string and one body string per slide.
# slide_titles and slide_body are parallel lists indexed by slide position.
slide_titles = []
slide_body = []

# Extract slide text
for slide in prs.slides:
    title = None
    body = []

    for shape in slide.shapes:
        # Shapes without a text frame contribute no text
        if not shape.has_text_frame:
            continue

        shape_text = shape.text_frame.text

        # The placeholder with idx 0 is treated as the slide title
        if shape.is_placeholder and shape.placeholder_format.idx == 0:
            title = shape_text
        else:
            # Every other text-bearing shape is body content. The previous
            # condition also dropped a shape whose text equalled the title,
            # but only when the title shape had already been visited, so the
            # result depended on shape order.
            body.append(shape_text)

    # Slides without a title placeholder get an empty title
    slide_titles.append(title or '')
    # Joining an empty list already yields ''
    slide_body.append('\n'.join(body))

# Print slide titles and body
for i, (title, body) in enumerate(zip(slide_titles, slide_body)):
    print(f"Slide {i + 1}:")
    print(f"Title: {title}")
    print(f"Body: {body}")
    print("\n")


Slide 1:
Title: 
Body: Classification Challenge  |   Nils Jennissen



Slide 2:
Title: ## Task
Body: 
Your company, DS Pros, would like to win a contract with a big city council as it would give us great PR. To do so you think it would be a great idea to proactively browse in the open data sets of this city (the one you choose, total freedom here) identify a situation that could be solved or improved using classification algorithms and present it to the technical office of that city council.

You need to prepare the following:

- A presentation describing the solution you try to solve, how classification will solve it and a summary of the solution proposed
- A well documented and visually appealing notebook where you try different models, explain the steps followed and chose one particular algorithm and hyperparameters (explaining why)
- You should also export that model, once trained, using pickle or similar so it can be reused.
- You should implement a .py script that loads the expor

In [29]:
# Display the collected slide titles, one entry per slide ('' where no title text was captured)
slide_titles

['',
 '## Task',
 '## 0. Introduction & Proposal',
 '## 1. Key Features',
 '## 2. Benefits for the Council',
 '## 3. Setup and tool import',
 '## 4. The Data',
 '## 5. Data Preprocessing',
 '## 6. The Models',
 '## 6. The Models',
 '## 7. Evaluation',
 '## 8. Usage of the .py file for predictions']

In [32]:
# Display the collected body text, one newline-joined string per slide
slide_body

['Classification Challenge  |   Nils Jennissen\n',
 '\nYour company, DS Pros, would like to win a contract with a big city council as it would give us great PR. To do so you think it would be a great idea to proactively browse in the open data sets of this city (the one you choose, total freedom here) identify a situation that could be solved or improved using classification algorithms and present it to the technical office of that city council.\n\nYou need to prepare the following:\n\n- A presentation describing the solution you try to solve, how classification will solve it and a summary of the solution proposed\n- A well documented and visually appealing notebook where you try different models, explain the steps followed and chose one particular algorithm and hyperparameters (explaining why)\n- You should also export that model, once trained, using pickle or similar so it can be reused.\n- You should implement a .py script that loads the exported model, accepts a file with samples t

In [33]:
# Build one newline-joined string per slide from the paragraphs of every
# text-bearing shape; a slide with no such paragraphs yields ''.
slide_paragraphs = [
    '\n'.join(
        paragraph.text
        for shape in slide.shapes
        if shape.has_text_frame
        for paragraph in shape.text_frame.paragraphs
    )
    for slide in prs.slides
]

# Report the paragraphs slide by slide, numbering from 1
for i, paragraphs in enumerate(slide_paragraphs):
    print(f"Slide {i + 1}:")
    print(f"Paragraphs: {paragraphs}")
    print("\n")

Slide 1:
Paragraphs: Classification Challenge  |   Nils Jennissen



Slide 2:
Paragraphs: 
## Task
Your company, DS Pros, would like to win a contract with a big city council as it would give us great PR. To do so you think it would be a great idea to proactively browse in the open data sets of this city (the one you choose, total freedom here) identify a situation that could be solved or improved using classification algorithms and present it to the technical office of that city council.

You need to prepare the following:

- A presentation describing the solution you try to solve, how classification will solve it and a summary of the solution proposed
- A well documented and visually appealing notebook where you try different models, explain the steps followed and chose one particular algorithm and hyperparameters (explaining why)
- You should also export that model, once trained, using pickle or similar so it can be reused.
- You should implement a .py script that loads the exported

### DataFrame for a presentation

In [44]:
# Build a per-slide table: title, body text (one string per shape) and
# run-level body text (one line per text run).
# NOTE(review): the saved output of this cell is an empty DataFrame although
# the deck was reported to have 12 slides. `prs` was presumably rebound to a
# different presentation before this cell last ran; confirm with
# Restart Kernel -> Run All.
slide_titles = []
slide_body = []
slide_body_text = []

# Extract slide information
for slide in prs.slides:
    title = None
    body = []
    body_text = []

    for shape in slide.shapes:
        # Shapes without a text frame contribute no text
        if not shape.has_text_frame:
            continue

        text_frame = shape.text_frame

        # The placeholder with idx 0 is treated as the slide title
        if shape.is_placeholder and shape.placeholder_format.idx == 0:
            title = text_frame.text
            continue

        # Every other text-bearing shape is body content. The previous
        # condition also dropped a shape whose text equalled the title, but
        # only when the title shape had already been visited, so the result
        # depended on shape order.
        body.append(text_frame.text)

        # Run-level text: each run of each paragraph becomes its own entry
        body_text.extend(
            run.text
            for paragraph in text_frame.paragraphs
            for run in paragraph.runs
        )

    # Joining an empty list already yields '', so no explicit fallback is needed
    slide_titles.append(title or '')
    slide_body.append('\n'.join(body))
    slide_body_text.append('\n'.join(body_text))

# Create a dataframe with the slide information (one row per slide)
df = pd.DataFrame({'Slide': slide_titles, 'Body': slide_body, 'Body_text': slide_body_text})

# Sanity check: the table must have exactly one row per slide in the deck
assert len(df) == len(prs.slides), "expected one DataFrame row per slide"

# Last expression of the cell, so Jupyter renders the frame as a rich table
df.head()

Empty DataFrame
Columns: [Slide, Body, Body_text]
Index: []
