## Pandas Profiling: NASA Meteorites example
Source of data: https://data.nasa.gov/Space-Science/Meteorite-Landings/gh4g-9sfh

### Import libraries

In [1]:
from pathlib import Path

import requests
import numpy as np
import pandas as pd

import pandas_profiling
#from pandas_profiling.utils.cache import cache_file
#import requests

In [2]:
url = "https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD"

### Load and prepare example dataset
We add some fake variables for illustrating pandas-profiling capabilities

In [4]:

"""file_name = cache_file(
    "meteorites.csv",
    "https://data.nasa.gov/api/views/gh4g-9sfh/rows.csv?accessType=DOWNLOAD",
)"""
    
df = pd.read_csv(url)
    
# Note: Pandas does not support dates before 1880, so we ignore these for this analysis
df['year'] = pd.to_datetime(df['year'], errors='coerce')

# Example: Constant variable
df['source'] = "NASA"

# Example: Boolean variable
df['boolean'] = np.random.choice([True, False], df.shape[0])

# Example: Mixed with base types
df['mixed'] = np.random.choice([1, "A"], df.shape[0])

# Example: Highly correlated variables
df['reclat_city'] = df['reclat'] + np.random.normal(scale=5,size=(len(df)))

# Example: Duplicate observations
duplicates_to_add = pd.DataFrame(df.iloc[0:10])
duplicates_to_add[u'name'] = duplicates_to_add[u'name'] + " copy"

df = df.append(duplicates_to_add, ignore_index=True)

In [13]:
df

Unnamed: 0,name,id,nametype,recclass,mass (g),fall,year,reclat,reclong,GeoLocation,source,boolean,mixed,reclat_city
0,Aachen,1,Valid,L5,21.0,Fell,1880-01-01,50.77500,6.08333,"(50.775, 6.08333)",NASA,True,1,56.649104
1,Aarhus,2,Valid,H6,720.0,Fell,1951-01-01,56.18333,10.23333,"(56.18333, 10.23333)",NASA,False,A,64.695927
2,Abee,6,Valid,EH4,107000.0,Fell,1952-01-01,54.21667,-113.00000,"(54.21667, -113.0)",NASA,True,1,41.296535
3,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976-01-01,16.88333,-99.90000,"(16.88333, -99.9)",NASA,False,1,17.980718
4,Achiras,370,Valid,L6,780.0,Fell,1902-01-01,-33.16667,-64.95000,"(-33.16667, -64.95)",NASA,True,A,-31.949594
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45721,Adhi Kot copy,379,Valid,EH4,4239.0,Fell,1919-01-01,32.10000,71.80000,"(32.1, 71.8)",NASA,True,1,34.414417
45722,Adzhi-Bogdo (stone) copy,390,Valid,LL3-6,910.0,Fell,1949-01-01,44.83333,95.16667,"(44.83333, 95.16667)",NASA,False,A,42.771430
45723,Agen copy,392,Valid,H5,30000.0,Fell,1814-01-01,44.21667,0.61667,"(44.21667, 0.61667)",NASA,False,A,38.970772
45724,Aguada copy,398,Valid,L6,1620.0,Fell,1930-01-01,-31.60000,-65.23333,"(-31.6, -65.23333)",NASA,False,A,-33.064737


### Inline report without saving object

In [18]:
report = df.profile_report(sort='None', html={'style':{'full_width': True}}, progress_bar=False)
report

AttributeError: 'DataFrame' object has no attribute 'profile_report'

### Save report to file

In [None]:
profile_report = df.profile_report(html={'style': {'full_width': True}})
profile_report.to_file("/tmp/example.html")

### More analysis (Unicode) and Print existing ProfileReport object inline

In [None]:
profile_report = df.profile_report(explorative=True, html={'style': {'full_width': True}})
profile_report

### Notebook Widgets

In [None]:
profile_report.to_widgets()