In [4]:
import numpy as np
import pandas as pd

# Reading from CSV

## Ignoring header and assigning column names

In [5]:
df = pd.read_csv("c2_file.csv", header=None, names=["column 1", "column 2", "column 3", "column 4"])
df

Unnamed: 0,column 1,column 2,column 3,column 4
0,a,b,c,d
1,yellow,10,2,3.2
2,green,2,3,8.1
3,blue,7,1,0.4


## Checking data types

In [11]:
df = pd.read_csv("c2_file.csv")
df.dtypes

Unnamed: 0,a,b,c,d
0,yellow,10,2,3.2
1,green,2,3,8.1
2,blue,7,1,0.4


## Assigning datatypes

In [13]:
df2 = pd.read_csv("c2_file.csv", dtype={"b": np.float64})
df2.dtypes

a     object
b    float64
c      int64
d    float64
dtype: object

## Loading partial data

In [14]:
pd.read_csv("c2_file.csv", usecols=["a", "b"])

Unnamed: 0,a,b
0,yellow,10
1,green,2
2,blue,7


# Reading from Excel

In [6]:
pd.read_excel("c2_data.xls")

Unnamed: 0,varA,varB,varC
0,0.391723,-0.155122,0.381104
1,0.575125,-0.105817,0.232245
2,0.672305,0.424688,-0.694795
3,0.766115,-0.79135,-0.028739
4,0.677259,-0.817543,-0.537088
5,-0.029702,-0.891848,-0.682719
6,-0.161366,-0.6596,-0.727898
7,0.031672,0.016607,-0.940479
8,0.833212,-0.503236,-0.88721
9,0.907753,0.265177,-0.390762


In [7]:
pd.read_excel("c2_data.xls", sheet_name="Sheet2")

Unnamed: 0,varD,varE,varF
0,0.907753,0.265177,-0.390762
1,0.755019,-0.768056,-0.528307
2,0.850692,-0.537159,-0.601387
3,0.131663,0.941327,0.240073
4,0.5744,0.091735,-0.395277
5,0.81663,0.875612,-0.880044
6,0.536732,0.175428,-0.473053
7,-0.084641,-0.042827,0.053344
8,0.268271,-0.010628,-0.090952
9,0.166792,-0.872579,-0.556899


# Reading from JSON


In [11]:
import pandas as pd

pd.read_json("c2_frame.json")

Unnamed: 0,col1,col2,col3,col4
row1,0,1,2,3
row2,4,5,6,7
row3,8,9,10,11
row4,12,13,14,15


In [12]:
pd.read_json("c2_books.json")

Unnamed: 0,books
0,"{'isbn': '9781593275846', 'title': 'Eloquent J..."
1,"{'isbn': '9781449331818', 'title': 'Learning J..."
2,"{'isbn': '9781449365035', 'title': 'Speaking J..."


## Parse JSON file into columns

In [13]:
import json
from pandas import json_normalize

In [15]:
with open("c2_books.json", "r") as f:
    json_string = f.read()
    dictionary = json.loads(json_string)
    
json_normalize(dictionary, 'books')

Unnamed: 0,isbn,title,subtitle,author,published,publisher,pages,description,website
0,9781593275846,"Eloquent JavaScript, Second Edition",A Modern Introduction to Programming,Marijn Haverbeke,2014-12-14T00:00:00.000Z,No Starch Press,472,JavaScript lies at the heart of almost every m...,http://eloquentjavascript.net/
1,9781449331818,Learning JavaScript Design Patterns,A JavaScript and jQuery Developer's Guide,Addy Osmani,2012-07-01T00:00:00.000Z,O'Reilly Media,254,"With Learning JavaScript Design Patterns, you'...",http://www.addyosmani.com/resources/essentialj...
2,9781449365035,Speaking JavaScript,An In-Depth Guide for Programmers,Axel Rauschmayer,2014-02-01T00:00:00.000Z,O'Reilly Media,460,"Like it or not, JavaScript is everywhere these...",http://speakingjs.com/


# HTML files

In [4]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
page = requests.get(
    "https://web.archive.org/web/20180908144902/en.proverbia.net/shortfamousquotes.asp"
)

In [25]:
page.text[0:100]

'\n<!DOCTYPE html>\n\n<html lang="en" xml:lang="en">\n<head><script type="text/javascript" src="/_static/'

In [26]:
page.status_code

200

In [27]:
soup = BeautifulSoup(page.text, "html.parser")

In [33]:
# Find quotes in HTML

quotes = soup.find_all("blockquote")

quote_list = []
for quote in quotes:
    string = quote.text
    quote_list.append(string)
    
df = pd.DataFrame(quote_list, columns=["Quote"])
df

Unnamed: 0,Quote
0,There is a natural aristocracy among men. The ...
1,All our words from loose using have lost their...
2,"God couldn't be everywhere, so he created moth..."
3,"Be not afraid of going slowly, be afraid only ..."
4,"Learn from yesterday, live for today, hope for..."
5,Do not confine your children to your own learn...
6,"I hear and I forget, I see and I remember. I d..."
7,In teaching others we teach ourselves.
8,Happiness will never come to those who fail to...
9,"Without His love I can do nothing, with His lo..."


In [36]:
# Find authors in HTML

authors = soup.find_all("p", class_="a")

author_list = []
for author in authors:
    string = author.text[1:-1]
    author_list.append(string)
df["Author"] = author_list
df

Unnamed: 0,Quote,Author
0,There is a natural aristocracy among men. The ...,Thomas Jefferson (1743-1826) Third president o...
1,All our words from loose using have lost their...,Ernest Hemingway (1898-1961) American Writer.
2,"God couldn't be everywhere, so he created moth...",Jewish proverb
3,"Be not afraid of going slowly, be afraid only ...",Chinese proverb
4,"Learn from yesterday, live for today, hope for...",Unknown Source
5,Do not confine your children to your own learn...,Chinese proverb
6,"I hear and I forget, I see and I remember. I d...",Chinese proverb
7,In teaching others we teach ourselves.,Proverb
8,Happiness will never come to those who fail to...,Unknown Source
9,"Without His love I can do nothing, with His lo...",Unknown Source


## Scrapping of tables

## Read single table

In [15]:
tables = pd.read_html("https://world.openfoodfacts.org/additives")
print(len(tables))  # 1
print(tables[0].head())

1
                   Additive  Products   * Risk
0        E330 - Citric acid    154999 NaN  NaN
1          E322 - Lecithins    105226 NaN  NaN
2          E322i - Lecithin     93115 NaN  NaN
3  E500 - Sodium carbonates     65663 NaN  NaN
4        E415 - Xanthan gum     56828 NaN  NaN


## Read multiple tables

In [12]:
tables = pd.read_html(
    "https://en.wikipedia.org/wiki/World_record_progression_50_metres_freestyle"
)
print(len(tables))
print(tables[4].head())
print(tables[-2].head())

9
   Pos   Time                   Swimmer              Date        Venue   Ref
0    1  20.91         César Cielo (BRA)  18 December 2009    São Paulo   NaN
1    2  20.94  Frederick Bousquet (FRA)     26 April 2009  Montpellier   NaN
2    3  21.04      Caeleb Dressel (USA)      27 July 2019      Gwangju   NaN
3    3  21.04      Caeleb Dressel (USA)      20 June 2021        Omaha  [19]
4    4  21.11      Benjamin Proud (GBR)     8 August 2018      Glasgow   NaN
   Pos   Time                    Swimmer              Date         Venue   Ref
0    1  22.93  Ranomi Kromowidjojo (NED)     7 August 2017       Germany   NaN
1    2  23.00       Sarah Sjöström (SWE)     7 August 2017       Germany   NaN
2    3  23.04          Emma McKeon (AUS)  17 December 2022     Melbourne  [37]
3    4  23.10     Katarzyna Wasick (POL)   3 November 2022  Indianapolis  [38]
4    5  23.19        Cate Campbell (AUS)   27 October 2017        Russia   NaN


## Filter tables to be read

In [13]:
tables = pd.read_html(
    "https://en.wikipedia.org/wiki/World_record_progression_50_metres_freestyle",
    match="Switzerland",
)
print(len(tables))  # 1
print(tables[0][10:15][["Time", "Name", "Nationality"]])

1
     Time          Name    Nationality
10  22.54   Robin Leamy  United States
11  22.52  Dano Halsall    Switzerland
12  22.40     Tom Jager  United States
13  22.33   Matt Biondi  United States
14  22.33   Matt Biondi  United States
