# Test notebook: Meteorites

In [16]:
# Standard Library Imports
import tempfile
from pathlib import Path

# Installed packages
import numpy as np
import pandas as pd
import requests

# Testing
from IPython.display import display
from IPython.utils.capture import capture_output

# Our package
import pandas_profiling
from pandas_profiling.utils.cache import cache_file


In [17]:
# Fetch the NASA meteorite landings CSV. `cache_file` receives the local file
# name and the download URL; whatever it returns is what `pd.read_csv` loads.
file_name = cache_file(
    "meteorites.csv",
    "https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD",
)

df = pd.read_csv(file_name)

# Seed NumPy's global RNG so the synthetic columns below come out identical on
# every "Restart Kernel -> Run All".
np.random.seed(0)

# Note: pandas' nanosecond timestamps only cover roughly the years 1677-2262.
# With errors='coerce', values that cannot be parsed or fall outside that range
# become NaT, so those rows are effectively ignored for this analysis.
df['year'] = pd.to_datetime(df['year'], errors='coerce')

# Example: Constant variable
df['source'] = "NASA"

# Example: Boolean variable
df['boolean'] = np.random.choice([True, False], df.shape[0])

# Example: Mixed with base types
# NOTE(review): NumPy converts the list [1, "A"] to a string array before
# sampling, so this column likely holds the strings "1" and "A" rather than a
# genuine int/str mix - confirm whether that is the intended test input.
df['mixed'] = np.random.choice([1, "A"], df.shape[0])

# Example: Highly correlated variables
df['reclat_city'] = df['reclat'] + np.random.normal(scale=5, size=(len(df)))

# Example: Duplicate observations
# Take an explicit copy of the first ten rows so that renaming them does not
# write into a slice of `df` (avoids SettingWithCopyWarning).
duplicates_to_add = df.iloc[0:10].copy()
duplicates_to_add['name'] = duplicates_to_add['name'] + " copy"

# `DataFrame.append` was removed in pandas 2.0; `pd.concat` is the supported
# equivalent and produces the same row-wise result with a fresh index.
df = pd.concat([df, duplicates_to_add], ignore_index=True)

# Show a preview of the rows that were appended.
duplicates_to_add.head()

Unnamed: 0,name,id,nametype,recclass,mass (g),fall,year,reclat,reclong,GeoLocation,source,boolean,mixed,reclat_city
0,Aachen copy,1,Valid,L5,21.0,Fell,1880-01-01,50.775,6.08333,"(50.775, 6.08333)",NASA,True,1,55.719338
1,Aarhus copy,2,Valid,H6,720.0,Fell,1951-01-01,56.18333,10.23333,"(56.18333, 10.23333)",NASA,False,1,54.285342
2,Abee copy,6,Valid,EH4,107000.0,Fell,1952-01-01,54.21667,-113.0,"(54.21667, -113.0)",NASA,True,1,48.227642
3,Acapulco copy,10,Valid,Acapulcoite,1914.0,Fell,1976-01-01,16.88333,-99.9,"(16.88333, -99.9)",NASA,False,A,21.106733
4,Achiras copy,370,Valid,L6,780.0,Fell,1902-01-01,-33.16667,-64.95,"(-33.16667, -64.95)",NASA,False,1,-31.456914


In [9]:
# Build a minimal report and display it inline (nothing is written to disk),
# capturing everything IPython would have rendered.
with capture_output() as out:
    pr = df.profile_report(
        sort='None',
        html={'style': {'full_width': True}},
        progress_bar=False,
        minimal=True,
    )
    display(pr)

# Exactly two outputs are expected: the HTML report, then an empty one.
assert len(out.outputs) == 2

html_output, trailing_output = out.outputs
assert html_output.data['text/plain'] == '<IPython.core.display.HTML object>'
# The HTML payload must embed an iframe and carry the generator footer text.
for expected_snippet in ('<iframe', 'Profile report generated with the `pandas-profiling`'):
    assert expected_snippet in html_output.data['text/html']
assert trailing_output.data['text/plain'] == ''

In [10]:
# There should also be 2 progress bars in minimal mode.
# NOTE(review): `lazy=False` presumably makes the report compute inside this
# capture block rather than on first use - confirm against the library docs.
with capture_output() as out:
    pfr = df.profile_report(
        html={'style': {'full_width': True}},
        minimal=True,
        progress_bar=True,
        lazy=False,
    )

# Every captured output must be a progress-bar widget ...
for captured in out.outputs:
    assert "FloatProgress" in captured.data['text/plain']
# ... and there must be exactly two of them.
assert len(out.outputs) == 2

In [11]:
# Write the existing report to an HTML file in the platform's temporary
# directory. A hard-coded "/tmp" path does not exist on every operating system,
# so the location is resolved through `tempfile.gettempdir()` instead.
output_path = Path(tempfile.gettempdir()) / "example.html"

with capture_output() as out:
    # Pass a plain string, matching the type the original call used.
    pfr.to_file(str(output_path))

# Exporting should emit exactly two outputs, both progress-bar widgets.
assert all("FloatProgress" in s.data['text/plain'] for s in out.outputs)
assert len(out.outputs) == 2

In [12]:
# Display the already-built ProfileReport object inline and capture the result.
with capture_output() as out:
    display(pfr)

# Exactly two outputs are expected: the HTML report, then an empty one.
assert len(out.outputs) == 2

html_output, trailing_output = out.outputs
assert html_output.data['text/plain'] == '<IPython.core.display.HTML object>'
# The HTML payload must embed an iframe and carry the generator footer text.
for expected_snippet in ('<iframe', 'Profile report generated with the `pandas-profiling`'):
    assert expected_snippet in html_output.data['text/html']
assert trailing_output.data['text/plain'] == ''

In [24]:
# Profile the dataset with default settings, render the report as notebook
# widgets, then export the same report to an HTML file.
report_file = "output.html"

prof = df.profile_report()
prof.to_widgets()
prof.to_file(output_file=report_file)

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=23.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render widgets', max=1.0, style=ProgressStyle(description…

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

HBox(children=(FloatProgress(value=0.0, description='Render HTML', max=1.0, style=ProgressStyle(description_wi…




HBox(children=(FloatProgress(value=0.0, description='Export report to file', max=1.0, style=ProgressStyle(desc…




In [19]:
# Build a minimal report without progress bars and export it to an HTML file.
minimal_report_options = dict(
    sort='None',
    html={'style': {'full_width': True}},
    progress_bar=False,
    minimal=True,
)
pr = df.profile_report(**minimal_report_options)
pr.to_file(output_file="output.html")



In [22]:
# Profile a small synthetic frame (100 rows x 5 uniform-random columns) and
# render the report as notebook widgets.
# Seed NumPy's global RNG first so the generated data, and therefore the
# report, is reproducible across kernel restarts.
np.random.seed(0)

data = pd.DataFrame(
    np.random.rand(100, 5),
    columns=["a", "b", "c", "d", "e"],
)
profile = data.profile_report()
profile.to_widgets()


HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=14.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render widgets', max=1.0, style=ProgressStyle(description…

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…