In [1]:
import os
import logging
import docx
# import pdfkit
import pandas as pd
from tqdm import tqdm

import numpy as np
from glob import glob
from typing import Callable, List
from datetime import datetime
import pandas as pd

In [2]:
# !which python

In [3]:
def get_file_list(data_path: str) -> List[str]:
    """Return the ``.xlsx`` files located directly inside ``data_path``.

    Args:
        data_path: Directory to search. The pattern is not recursive, so
            sub-directories are not searched.

    Returns:
        Matching file paths in sorted order; an empty list when nothing
        matches.
    """
    # os.path.join avoids the pitfalls of joining with os.sep by hand: an
    # empty ``data_path`` no longer becomes a pattern rooted at the filesystem
    # root, and a trailing separator is not doubled.
    pattern = os.path.join(data_path, "*.xlsx")
    # glob gives no ordering guarantee; sort so repeated runs load the files
    # in the same order.
    return sorted(glob(pattern))


def read_data(data_path: str) -> pd.DataFrame:
    """Load every ``.xlsx`` file found in ``data_path`` into one DataFrame.

    Each workbook is read without a header row and its columns are named
    ``tekst`` and ``wartosc``. The per-file row index is preserved, so index
    labels may repeat across files.

    Args:
        data_path: Directory containing the Excel files.

    Returns:
        The rows of all files stacked in file order. When no file matches, an
        empty frame that still carries the ``tekst`` / ``wartosc`` columns.
    """
    columns = ["tekst", "wartosc"]
    files_xls = get_file_list(data_path)
    logging.info("Extracting data from {} files".format(len(files_xls)))

    # Collect the frames and concatenate once: DataFrame.append was removed in
    # pandas 2.0, and growing a frame inside the loop copies it every time.
    frames = [
        pd.read_excel(file, header=None, names=columns)
        for file in tqdm(files_xls)
    ]
    if not frames:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame(columns=columns)
    return pd.concat(frames)


def export_to_docx(data: pd.DataFrame, destination: str, n_samples: int) -> None:
    """Write ``data`` as a table into ``results.docx`` inside ``destination``.

    The table has one header row holding the column labels, followed by one
    row per DataFrame row. Every value is written as its ``str()`` form.

    Args:
        data: Frame to export.
        destination: Directory in which ``results.docx`` is saved.
        n_samples: Sample count reported in the log message only; it does not
            limit how many rows are written.
    """
    n_rows, n_cols = data.shape
    doc = docx.Document()
    t = doc.add_table(n_rows + 1, n_cols)

    path = os.path.join(destination, "results.docx")

    logging.info(
        "Writing {} data samples to Word document in {}".format(n_samples, destination)
    )

    # Header row. Labels are stringified because column labels are not
    # guaranteed to be strings (e.g. integer labels).
    for j, column in enumerate(data.columns):
        t.cell(0, j).text = str(column)

    # Materialise the values once; accessing ``data.values`` inside the loop
    # would rebuild the array for every single cell.
    values = data.values
    for i in tqdm(range(n_rows)):
        for j in range(n_cols):
            t.cell(i + 1, j).text = str(values[i, j])

    doc.save(path)


def export_to_pdf(data: pd.DataFrame, destination: str) -> None:
    """Render ``data`` as an HTML table and save it as ``results.pdf``.

    Args:
        data: Frame to export.
        destination: Directory in which ``results.pdf`` is saved.

    Raises:
        ImportError: If the optional ``pdfkit`` package is not installed.
    """
    # Imported here because the module-level import is commented out; without
    # it the call below fails with a NameError.
    import pdfkit

    logging.info("Writing data to PDF document in {}".format(destination))
    path = os.path.join(destination, "results.pdf")
    pdfkit.from_string(data.to_html(), path)

In [9]:
def get_samples(data: pd.DataFrame, n_samples: str, random_state=None) -> pd.DataFrame:
    """Draw ``n_samples`` random rows from ``data`` without replacement.

    Args:
        data: Frame to sample from.
        n_samples: Number of rows to draw. Converted with ``int()``, so both
            integers and numeric strings are accepted.
        random_state: Optional seed forwarded to ``DataFrame.sample``. Pass a
            fixed value to get the same rows on every run; the default
            ``None`` keeps the previous unseeded behaviour.

    Returns:
        A new frame holding the sampled rows with their original index labels.

    Raises:
        ValueError: If ``n_samples`` cannot be converted to an integer, or if
            it exceeds the number of rows in ``data`` (raised by pandas).
    """
    return data.sample(int(n_samples), random_state=random_state)

In [4]:
# Directory searched for *.xlsx workbooks.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
data_path = "C:\\Users\\Michal\\Downloads"
# Number of rows drawn at random by get_samples().
n_samples = 100
# Rows whose "wartosc" equals this value are kept after sampling.
filter_value = 5

In [35]:
# Echo the configured input directory.
data_path

'C:\\Users\\Michal\\Downloads'

In [38]:
# Ad-hoc check of the search pattern: list the *.xlsx files directly in data_path.
separator = os.sep
glob(separator.join([data_path, "*.xlsx"]))

['C:\\Users\\Michal\\Downloads\\Arkusz kalkulacyjny bez tytułu - Copy (1).xlsx',
 'C:\\Users\\Michal\\Downloads\\Arkusz kalkulacyjny bez tytułu - Copy (10).xlsx',
 'C:\\Users\\Michal\\Downloads\\Arkusz kalkulacyjny bez tytułu - Copy (2).xlsx',
 'C:\\Users\\Michal\\Downloads\\Arkusz kalkulacyjny bez tytułu - Copy (3).xlsx',
 'C:\\Users\\Michal\\Downloads\\Arkusz kalkulacyjny bez tytułu - Copy (4).xlsx',
 'C:\\Users\\Michal\\Downloads\\Arkusz kalkulacyjny bez tytułu - Copy (5).xlsx',
 'C:\\Users\\Michal\\Downloads\\Arkusz kalkulacyjny bez tytułu - Copy (6).xlsx',
 'C:\\Users\\Michal\\Downloads\\Arkusz kalkulacyjny bez tytułu - Copy.xlsx',
 'C:\\Users\\Michal\\Downloads\\Arkusz kalkulacyjny bez tytułu.xlsx',
 'C:\\Users\\Michal\\Downloads\\Test11.xlsx',
 'C:\\Users\\Michal\\Downloads\\Test22.xlsx',
 'C:\\Users\\Michal\\Downloads\\Test33.xlsx']

In [5]:
# Resolve the configured directory to an absolute path and load every workbook in it.
inputs = os.path.abspath(data_path)
# NOTE(review): exl_files is not used by any visible cell; read_data() builds its own file list.
exl_files = get_file_list(inputs)
data = read_data(inputs)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:00<00:00, 102.63it/s]


In [7]:
# Peek at the last rows of the combined frame.
data.tail()

Unnamed: 0,tekst,wartosc
19,65Lorem Ipsum is simply dummy text of the prin...,4
20,66Lorem Ipsum is simply dummy text of the prin...,5
21,67Lorem Ipsum is simply dummy text of the prin...,1
22,68Lorem Ipsum is simply dummy text of the prin...,2
23,69Lorem Ipsum is simply dummy text of the prin...,3


In [20]:
# Always draw the sample so that ``samples`` and ``filtered_data`` exist for
# the cells below, even when no filter value is configured.
samples = get_samples(data, n_samples)
if filter_value:
    # Keep only the sampled rows whose "wartosc" matches the configured value.
    filtered_data = samples.loc[samples["wartosc"] == filter_value]
else:
    # No filter configured: fall back to the unfiltered sample.
    filtered_data = samples

In [21]:
# Total number of rows loaded by read_data().
len(data)

297

In [22]:
# Number of sampled rows that passed the "wartosc" filter.
len(filtered_data)

26

In [38]:
# Display the filtered sample.
filtered_data

Unnamed: 0,tekst,wartosc
20,66Lorem Ipsum is simply dummy text of the prin...,5
5,Jeden,5
16,Trzy,5
15,Siedem,5
15,pd.DataFrame(),5
5,pd.DataFrame(),5
4,Piec,5
10,56Lorem Ipsum is simply dummy text of the prin...,5
15,38Lorem Ipsum is simply dummy text of the prin...,5
17,pd.DataFrame(),5


In [24]:
# ``filtered_data2`` is not defined by any visible cell, so this cell failed on
# a fresh kernel. Select the "tekst" column from ``filtered_data`` directly,
# which matches the output recorded for this cell.
filtered_data[["tekst"]].head()

Unnamed: 0,tekst
20,66Lorem Ipsum is simply dummy text of the prin...
5,Jeden
16,Trzy
15,Siedem
15,pd.DataFrame()


In [46]:
# Print the raw HTML markup for the "tekst" column only, without the index and with border="0".
print(filtered_data.to_html(columns=['tekst'], index=False, border=0))

<table border="0" class="dataframe">
  <thead>
    <tr style="text-align: right;">
      <th>tekst</th>
    </tr>
  </thead>
  <tbody>
    <tr>
      <td>66Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum.</td>
    </tr>
    <tr>
      <td>Jeden</td>
    </tr>
    <tr>
      <td>Trzy</td>
    </tr>
    <tr>
      <td>Siedem</td>
    </tr>
    <tr>
      <td>pd.DataFrame()</td>
    </tr>
    <tr>
      <td>pd.DataFrame()</td>
    </tr>
    <tr>
      <td>Piec</td>
    </t