In [6]:
import numpy as np
import pandas as pd

In [7]:
def between_delimiter():
    """Print a horizontal rule of dashes with one blank line above and below it."""
    rule = '----------------------------------------------------------------------------------------------'
    # Blank line, rule, blank line -- one print call per output line.
    for line in ('', rule, ''):
        print(line)

In [8]:
def eda(data_path):
    """Load a CSV file and print a quick exploratory report about it.

    The report is emitted in this order:

    1. the first 10 rows;
    2. the dtype of every column;
    3. ``describe()`` summaries for the numeric, object and datetime columns
       (each section is printed only when columns of that kind exist), plus
       up to 10 distinct values of every object column;
    4. for every column that has missing values, the rows where it is missing
       (or ``None`` when nothing is missing).

    Column names are normalised before reporting: surrounding whitespace is
    stripped, spaces become underscores and the name is lower-cased.

    Parameters
    ----------
    data_path
        Location of the CSV file; passed unchanged to ``pandas.read_csv``.

    Returns
    -------
    None
        Everything is printed / displayed; nothing is returned.

    Notes
    -----
    * ``display`` is called without being imported here, so it must already be
      defined where this function runs (as it is in an IPython/Jupyter kernel).
    * ``display.max_colwidth`` and ``display.max_rows`` are set to ``None``
      through ``pd.set_option``; these are global pandas options and remain in
      effect after the call.
    """
    # Show full cell contents and every row in the tables displayed below.
    pd.set_option('display.max_colwidth', None)
    pd.set_option('display.max_rows', None)

    data = pd.read_csv(data_path)

    # Normalise headers: trim, spaces -> underscores, lower-case.
    data.columns = [str(col).strip().replace(' ', '_').lower() for col in data.columns]

    print('Sample of the data:')
    display(data.head(10))
    between_delimiter()

    print('Data columns types:')
    display(data.dtypes)
    between_delimiter()

    # Let pandas classify the columns instead of comparing dtypes to a short
    # list of names: this also catches int8/int16/uint*/nullable numerics and
    # datetime64[ns] (which never compares equal to the bare 'datetime64').
    numeric_cols = data.select_dtypes(include='number')
    object_cols = data.select_dtypes(include='object')
    datetime_cols = data.select_dtypes(include=['datetime', 'datetimetz'])

    # Test the number of columns (not ``.empty``) so a file with a header but
    # no rows still gets its sections.
    if numeric_cols.shape[1] > 0:
        print('Quantitative variables:')
        display(numeric_cols.describe())
        between_delimiter()

    if object_cols.shape[1] > 0:
        print('Object variables:')
        display(data.describe(include=[object]))
        for col, series in object_cols.items():
            # unique() keeps NaN as a value, so base the truncation marker on
            # the same array that is printed rather than on nunique().
            values = series.unique()
            print(f"- '{col}' column values:")
            print("  ", values[:10])
            if len(values) > 10:
                print('  .\n  .\n  .')
            print()
        between_delimiter()

    # Only reachable when the frame actually holds datetime columns; the plain
    # read_csv call above does not parse dates, so this is for future loaders.
    if datetime_cols.shape[1] > 0:
        print('Datetime variables:')
        try:
            # pandas 1.1 - 1.5 need the keyword to get min/mean/max statistics.
            datetime_summary = datetime_cols.describe(datetime_is_numeric=True)
        except TypeError:
            # Keyword not accepted by this pandas version (removed in 2.0).
            datetime_summary = datetime_cols.describe()
        display(datetime_summary)
        between_delimiter()

    print('Missing data:')
    any_missing = False
    for col, series in data.items():
        missing_rows = data[series.isna()]
        if len(missing_rows) > 0:
            print(f"- '{col}' column:")
            display(missing_rows)
            any_missing = True
    if not any_missing:
        print('  None')

In [9]:
# Run the EDA report on 'test.csv'; the recorded output below reports no missing data for it.
eda('test.csv')

Sample of the data:


Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552
5,31,female,25.74,0,no,southeast,3756.6216
6,46,female,33.44,1,no,southeast,8240.5896
7,37,female,27.74,3,no,northwest,7281.5056
8,37,male,29.83,2,no,northeast,6406.4107
9,60,female,25.84,0,no,northwest,28923.13692



----------------------------------------------------------------------------------------------

Data columns types:


age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object


----------------------------------------------------------------------------------------------

Quantitative variables:


Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801



----------------------------------------------------------------------------------------------

Object variables:


Unnamed: 0,sex,smoker,region
count,1338,1338,1338
unique,2,2,4
top,male,no,southeast
freq,676,1064,364


- 'sex' column values:
   ['female' 'male']

- 'smoker' column values:
   ['yes' 'no']

- 'region' column values:
   ['southwest' 'southeast' 'northwest' 'northeast']


----------------------------------------------------------------------------------------------

Missing data:
  None


In [10]:
# Run the EDA report on 'test2.csv'; in the recorded output below every column except 'show_number' is of object dtype.
eda('test2.csv')

Sample of the data:


Unnamed: 0,show_number,air_date,round,category,value,question,answer
0,4680,2004-12-31,Jeopardy!,HISTORY,$200,"For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory",Copernicus
1,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$200,"No. 2: 1912 Olympian; football star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & Braves",Jim Thorpe
2,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$200,"The city of Yuma in this state has a record average of 4,055 hours of sunshine each year",Arizona
3,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$200,"In 1963, live on ""The Art Linkletter Show"", this company served its billionth burger",McDonald's
4,4680,2004-12-31,Jeopardy!,EPITAPHS & TRIBUTES,$200,"Signer of the Dec. of Indep., framer of the Constitution of Mass., second President of the United States",John Adams
5,4680,2004-12-31,Jeopardy!,3-LETTER WORDS,$200,"In the title of an Aesop fable, this insect shared billing with a grasshopper",the ant
6,4680,2004-12-31,Jeopardy!,HISTORY,$400,"Built in 312 B.C. to link Rome & the South of Italy, it's still in use today",the Appian Way
7,4680,2004-12-31,Jeopardy!,ESPN's TOP 10 ALL-TIME ATHLETES,$400,"No. 8: 30 steals for the Birmingham Barons; 2,306 steals for the Bulls",Michael Jordan
8,4680,2004-12-31,Jeopardy!,EVERYBODY TALKS ABOUT IT...,$400,"In the winter of 1971-72, a record 1,122 inches of snow fell at Rainier Paradise Ranger Station in this state",Washington
9,4680,2004-12-31,Jeopardy!,THE COMPANY LINE,$400,This housewares store was named for the packaging its merchandise came in & was first displayed on,Crate & Barrel



----------------------------------------------------------------------------------------------

Data columns types:


show_number     int64
air_date       object
round          object
category       object
value          object
question       object
answer         object
dtype: object


----------------------------------------------------------------------------------------------

Quantitative variables:


Unnamed: 0,show_number
count,216930.0
mean,4264.238519
std,1386.296335
min,1.0
25%,3349.0
50%,4490.0
75%,5393.0
max,6300.0



----------------------------------------------------------------------------------------------

Object variables:


Unnamed: 0,air_date,round,category,value,question,answer
count,216930,216930,216930,216930,216930,216928
unique,3640,4,27995,150,216124,88268
top,1997-05-19,Jeopardy!,BEFORE & AFTER,$400,[audio clue],China
freq,62,107384,547,42244,17,216


- 'air_date' column values:
   ['2004-12-31' '2010-07-06' '2000-12-18' '2000-07-19' '2006-02-06'
 '2009-05-08' '1996-12-06' '2010-12-07' '2007-05-30' '1997-11-10']
  .
  .
  .

- 'round' column values:
   ['Jeopardy!' 'Double Jeopardy!' 'Final Jeopardy!' 'Tiebreaker']

- 'category' column values:
   ['HISTORY' "ESPN's TOP 10 ALL-TIME ATHLETES" 'EVERYBODY TALKS ABOUT IT...'
 'THE COMPANY LINE' 'EPITAPHS & TRIBUTES' '3-LETTER WORDS'
 'DR. SEUSS AT THE MULTIPLEX' 'PRESIDENTIAL STATES OF BIRTH'
 'AIRLINE TRAVEL' 'THAT OLD-TIME RELIGION']
  .
  .
  .

- 'value' column values:
   ['$200' '$400' '$600' '$800' '$2,000' '$1000' '$1200' '$1600' '$2000'
 '$3,200']
  .
  .
  .

- 'question' column values:
   ["For the last 8 years of his life, Galileo was under house arrest for espousing this man's theory"
 'No. 2: 1912 Olympian; football star at Carlisle Indian School; 6 MLB seasons with the Reds, Giants & Braves'
 'The city of Yuma in this state has a record average of 4,055 hours of sunshine ea

Unnamed: 0,show_number,air_date,round,category,value,question,answer
94817,4346,2003-06-23,Jeopardy!,"GOING ""N""SANE",$200,"It often precedes ""and void""",
143297,6177,2011-06-21,Double Jeopardy!,NOTHING,$400,"This word for ""nothing"" precedes ""and void"" to mean ""not valid""",
