# Using advanced options while reading data from csv files

## Import Module(s)

In [1]:
import pandas as pd

## The most basic read

In [2]:
df = pd.read_csv('IMDB.csv', encoding = "ISO-8859-1")
df.head()

Unnamed: 0,X,Title,Rating,TotalVotes,Genre1,Genre2,Genre3,MetaCritic,Budget,Runtime,...,Votes45A,Votes45AM,Votes45AF,VotesIMDB,Votes1000,VotesUS,VotesnUS,Domestic,Foreign,Worldwide
0,1,12 Years a SlaveÂ (2013),8.1,496092,Biography,Drama,History,96.0,20000000.0,134 min,...,7.8,7.8,8.1,8.0,7.7,8.3,8.0,"$56,671,993",131061209,"$187,733,202"
1,2,127 HoursÂ (2010),7.6,297075,Adventure,Biography,Drama,82.0,18000000.0,94 min,...,7.3,7.3,7.5,7.6,7.0,7.7,7.6,"$18,335,230",42403567,"$60,738,797"
2,3,50/50Â (2011),7.7,283935,Comedy,Drama,Romance,72.0,8000000.0,100 min,...,7.4,7.4,7.5,7.4,7.0,7.9,7.6,"$35,014,192",4173591,"$39,187,783"
3,4,About TimeÂ (2013),7.8,225412,Comedy,Drama,Fantasy,,12000000.0,123 min,...,7.6,7.5,7.8,7.7,6.9,7.8,7.7,"$15,322,921",71777528,"$87,100,449"
4,5,AmourÂ (2012),7.9,76121,Drama,Romance,,94.0,8900000.0,127 min,...,7.9,7.8,8.1,6.6,7.2,7.9,7.8,"$6,739,492",13100000,"$19,839,492"


## Advanced read options

## Manipulating Column & Index Locations and Names

### No header or column names

In [3]:
df = pd.read_csv('IMDB.csv', encoding = "ISO-8859-1", header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,X,Title,Rating,TotalVotes,Genre1,Genre2,Genre3,MetaCritic,Budget,Runtime,...,Votes45A,Votes45AM,Votes45AF,VotesIMDB,Votes1000,VotesUS,VotesnUS,Domestic,Foreign,Worldwide
1,1,12 Years a SlaveÂ (2013),8.1,496092,Biography,Drama,History,96,20000000,134 min,...,7.8,7.8,8.1,8,7.7,8.3,8,"$56,671,993",131061209,"$187,733,202"
2,2,127 HoursÂ (2010),7.6,297075,Adventure,Biography,Drama,82,18000000,94 min,...,7.3,7.3,7.5,7.6,7,7.7,7.6,"$18,335,230",42403567,"$60,738,797"
3,3,50/50Â (2011),7.7,283935,Comedy,Drama,Romance,72,8000000,100 min,...,7.4,7.4,7.5,7.4,7,7.9,7.6,"$35,014,192",4173591,"$39,187,783"
4,4,About TimeÂ (2013),7.8,225412,Comedy,Drama,Fantasy,,12000000,123 min,...,7.6,7.5,7.8,7.7,6.9,7.8,7.7,"$15,322,921",71777528,"$87,100,449"


### Specify a different row as header

In [4]:
df = pd.read_csv('IMDB.csv', encoding = "ISO-8859-1", header=2)
df.head()

Unnamed: 0,2,127 HoursÂ (2010),7.6,297075,Adventure,Biography,Drama,82,18000000,94 min,...,7.3,7.3.1,7.5.3,7.6.3,7,7.7.2,7.6.4,"$18,335,230",42403567,"$60,738,797"
0,3,50/50Â (2011),7.7,283935,Comedy,Drama,Romance,72.0,8000000.0,100 min,...,7.4,7.4,7.5,7.4,7.0,7.9,7.6,"$35,014,192",4173591,"$39,187,783"
1,4,About TimeÂ (2013),7.8,225412,Comedy,Drama,Fantasy,,12000000.0,123 min,...,7.6,7.5,7.8,7.7,6.9,7.8,7.7,"$15,322,921",71777528,"$87,100,449"
2,5,AmourÂ (2012),7.9,76121,Drama,Romance,,94.0,8900000.0,127 min,...,7.9,7.8,8.1,6.6,7.2,7.9,7.8,"$6,739,492",13100000,"$19,839,492"
3,6,ArgoÂ (2012),7.7,486840,Action,Biography,Drama,86.0,44500000.0,120 min,...,7.7,7.7,8.0,8.1,7.2,8.0,7.6,"$136,025,503",96300000,"$232,325,503"
4,7,ArrivalÂ (2016),8.0,370842,Drama,Mystery,Sci-Fi,81.0,47000000.0,116 min,...,7.6,7.6,7.7,8.3,7.3,8.0,7.9,"$100,546,139",102842047,"$203,388,186"


### Specify a column as index

In [5]:
df = pd.read_csv('IMDB.csv', encoding = "ISO-8859-1", index_col='Title')
df.head()

Unnamed: 0_level_0,X,Rating,TotalVotes,Genre1,Genre2,Genre3,MetaCritic,Budget,Runtime,CVotes10,...,Votes45A,Votes45AM,Votes45AF,VotesIMDB,Votes1000,VotesUS,VotesnUS,Domestic,Foreign,Worldwide
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12 Years a SlaveÂ (2013),1,8.1,496092,Biography,Drama,History,96.0,20000000.0,134 min,75556,...,7.8,7.8,8.1,8.0,7.7,8.3,8.0,"$56,671,993",131061209,"$187,733,202"
127 HoursÂ (2010),2,7.6,297075,Adventure,Biography,Drama,82.0,18000000.0,94 min,28939,...,7.3,7.3,7.5,7.6,7.0,7.7,7.6,"$18,335,230",42403567,"$60,738,797"
50/50Â (2011),3,7.7,283935,Comedy,Drama,Romance,72.0,8000000.0,100 min,28304,...,7.4,7.4,7.5,7.4,7.0,7.9,7.6,"$35,014,192",4173591,"$39,187,783"
About TimeÂ (2013),4,7.8,225412,Comedy,Drama,Fantasy,,12000000.0,123 min,38556,...,7.6,7.5,7.8,7.7,6.9,7.8,7.7,"$15,322,921",71777528,"$87,100,449"
AmourÂ (2012),5,7.9,76121,Drama,Romance,,94.0,8900000.0,127 min,11093,...,7.9,7.8,8.1,6.6,7.2,7.9,7.8,"$6,739,492",13100000,"$19,839,492"


### Choose only a subset of columns to be read

In [6]:
df = pd.read_csv('IMDB.csv', encoding = "ISO-8859-1", usecols=['Title', 'Genre1'])
df.head()

Unnamed: 0,Title,Genre1
0,12 Years a SlaveÂ (2013),Biography
1,127 HoursÂ (2010),Adventure
2,50/50Â (2011),Comedy
3,About TimeÂ (2013),Comedy
4,AmourÂ (2012),Drama


### Handling Missing and NA data

```
NaN: ”, ‘#N/A’, ‘#N/A N/A’, ‘#NA’, ‘-1.#IND’, ‘-1.#QNAN’, ‘-NaN’, ‘-nan’, ‘1.#IND’, ‘1.#QNAN’, ‘N/A’, ‘NA’, ‘NULL’, ‘NaN’, ‘nan’`.
```

In [7]:
df = pd.read_csv('IMDB.csv', encoding = "ISO-8859-1", na_values=[''])

### Choose whether to skip over blank rows or not

In [8]:
df = pd.read_csv('IMDB.csv', encoding = "ISO-8859-1", skip_blank_lines=False)

## Data Parsing options

### Skip rows

In [9]:
df = pd.read_csv('IMDB.csv', encoding = "ISO-8859-1", skiprows = [1,3,7])
df.head()

Unnamed: 0,X,Title,Rating,TotalVotes,Genre1,Genre2,Genre3,MetaCritic,Budget,Runtime,...,Votes45A,Votes45AM,Votes45AF,VotesIMDB,Votes1000,VotesUS,VotesnUS,Domestic,Foreign,Worldwide
0,2,127 HoursÂ (2010),7.6,297075,Adventure,Biography,Drama,82.0,18000000.0,94 min,...,7.3,7.3,7.5,7.6,7.0,7.7,7.6,"$18,335,230",42403567,"$60,738,797"
1,4,About TimeÂ (2013),7.8,225412,Comedy,Drama,Fantasy,,12000000.0,123 min,...,7.6,7.5,7.8,7.7,6.9,7.8,7.7,"$15,322,921",71777528,"$87,100,449"
2,5,AmourÂ (2012),7.9,76121,Drama,Romance,,94.0,8900000.0,127 min,...,7.9,7.8,8.1,6.6,7.2,7.9,7.8,"$6,739,492",13100000,"$19,839,492"
3,6,ArgoÂ (2012),7.7,486840,Action,Biography,Drama,86.0,44500000.0,120 min,...,7.7,7.7,8.0,8.1,7.2,8.0,7.6,"$136,025,503",96300000,"$232,325,503"
4,9,Before MidnightÂ (2013),7.9,106553,Drama,Romance,,94.0,3000000.0,109 min,...,7.3,7.4,7.2,8.5,7.0,8.0,7.9,"$8,114,627",3061842,"$11,176,469"


### Skip rows from footer or from end of the file

In [10]:
df.tail(2)
df = pd.read_csv('IMDB.csv', encoding = "ISO-8859-1", skipfooter=2, engine='python')
df.tail(2)

Unnamed: 0,X,Title,Rating,TotalVotes,Genre1,Genre2,Genre3,MetaCritic,Budget,Runtime,...,Votes45A,Votes45AM,Votes45AF,VotesIMDB,Votes1000,VotesUS,VotesnUS,Domestic,Foreign,Worldwide
113,115,Wreck-It RalphÂ (2012),7.7,295125,Animation,Adventure,Comedy,72.0,165000000.0,,...,7.4,7.4,7.5,7.4,7.2,7.9,7.6,"$189,422,889",281800000,"$471,222,889"
114,116,X-Men: Days of Future PastÂ (2014),8.0,560736,Action,Adventure,Sci-Fi,74.0,200000000.0,132 min,...,7.7,7.7,7.9,7.5,7.4,8.1,7.9,"$233,921,534",513941241,"$747,862,775"


### Reading only a subset of the file or a certain number of rows

In [11]:
df = pd.read_csv('IMDB.csv', encoding = "ISO-8859-1", nrows=100)
df.shape

(100, 58)

# Reading data from excel files

## Basic Excel read 

In [12]:
df = pd.read_excel('IMDB.xlsx')
df.head()

Unnamed: 0,X,Title,Rating,TotalVotes,Genre1,Genre2,Genre3,MetaCritic,Budget,Runtime,...,Votes45A,Votes45AM,Votes45AF,VotesIMDB,Votes1000,VotesUS,VotesnUS,Domestic,Foreign,Worldwide
0,1,12 Years a Slave (2013),8.1,496092,Biography,Drama,History,96.0,20000000.0,134 min,...,7.8,7.8,8.1,8.0,7.7,8.3,8.0,"$56,671,993",131061209,"$187,733,202"
1,2,127 Hours (2010),7.6,297075,Adventure,Biography,Drama,82.0,18000000.0,94 min,...,7.3,7.3,7.5,7.6,7.0,7.7,7.6,"$18,335,230",42403567,"$60,738,797"
2,3,50/50 (2011),7.7,283935,Comedy,Drama,Romance,72.0,8000000.0,100 min,...,7.4,7.4,7.5,7.4,7.0,7.9,7.6,"$35,014,192",4173591,"$39,187,783"
3,4,About Time (2013),7.8,225412,Comedy,Drama,Fantasy,,12000000.0,123 min,...,7.6,7.5,7.8,7.7,6.9,7.8,7.7,"$15,322,921",71777528,"$87,100,449"
4,5,Amour (2012),7.9,76121,Drama,Romance,,94.0,8900000.0,127 min,...,7.9,7.8,8.1,6.6,7.2,7.9,7.8,"$6,739,492",13100000,"$19,839,492"


## Advanced read options

```
pandas.read_excel(io, sheetname=0, header=0, skiprows=None, skip_footer=0, index_col=None, names=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, convert_float=True, has_index_names=None, converters=None, dtype=None, true_values=None, false_values=None, engine=None, squeeze=False, **kwds)
```

**from [Pandas Doc](http://pandas.pydata.org/pandas-docs/version/0.20/generated/pandas.read_excel.html)

### Which Sheet to read?

In [13]:
df = pd.read_excel('IMDB.xlsx', sheet_name=0)
df.head()

Unnamed: 0,X,Title,Rating,TotalVotes,Genre1,Genre2,Genre3,MetaCritic,Budget,Runtime,...,Votes45A,Votes45AM,Votes45AF,VotesIMDB,Votes1000,VotesUS,VotesnUS,Domestic,Foreign,Worldwide
0,1,12 Years a Slave (2013),8.1,496092,Biography,Drama,History,96.0,20000000.0,134 min,...,7.8,7.8,8.1,8.0,7.7,8.3,8.0,"$56,671,993",131061209,"$187,733,202"
1,2,127 Hours (2010),7.6,297075,Adventure,Biography,Drama,82.0,18000000.0,94 min,...,7.3,7.3,7.5,7.6,7.0,7.7,7.6,"$18,335,230",42403567,"$60,738,797"
2,3,50/50 (2011),7.7,283935,Comedy,Drama,Romance,72.0,8000000.0,100 min,...,7.4,7.4,7.5,7.4,7.0,7.9,7.6,"$35,014,192",4173591,"$39,187,783"
3,4,About Time (2013),7.8,225412,Comedy,Drama,Fantasy,,12000000.0,123 min,...,7.6,7.5,7.8,7.7,6.9,7.8,7.7,"$15,322,921",71777528,"$87,100,449"
4,5,Amour (2012),7.9,76121,Drama,Romance,,94.0,8900000.0,127 min,...,7.9,7.8,8.1,6.6,7.2,7.9,7.8,"$6,739,492",13100000,"$19,839,492"


## Reading data from multiple sheets in an excel file

## Find out the sheet list of the excel file

In [14]:
xls_file = pd.ExcelFile('IMDB.xlsx')
xls_file.sheet_names

['movies', 'by genre']

In [15]:
df1 = xls_file.parse('movies')
df2 = xls_file.parse('by genre')
df1.head()
df2.head()

Unnamed: 0,X,Title,Rating,TotalVotes,Genre1,Genre2,Genre3,MetaCritic,Budget,Runtime,...,Votes45A,Votes45AM,Votes45AF,VotesIMDB,Votes1000,VotesUS,VotesnUS,Domestic,Foreign,Worldwide
0,1,12 Years a Slave (2013),8.1,496092,Biography,Drama,History,96.0,20000000.0,134 min,...,7.8,7.8,8.1,8.0,7.7,8.3,8.0,"$56,671,993",131061209,"$187,733,202"
1,2,127 Hours (2010),7.6,297075,Adventure,Biography,Drama,82.0,18000000.0,94 min,...,7.3,7.3,7.5,7.6,7.0,7.7,7.6,"$18,335,230",42403567,"$60,738,797"
2,3,50/50 (2011),7.7,283935,Comedy,Drama,Romance,72.0,8000000.0,100 min,...,7.4,7.4,7.5,7.4,7.0,7.9,7.6,"$35,014,192",4173591,"$39,187,783"
3,4,About Time (2013),7.8,225412,Comedy,Drama,Fantasy,,12000000.0,123 min,...,7.6,7.5,7.8,7.7,6.9,7.8,7.7,"$15,322,921",71777528,"$87,100,449"
4,5,Amour (2012),7.9,76121,Drama,Romance,,94.0,8900000.0,127 min,...,7.9,7.8,8.1,6.6,7.2,7.9,7.8,"$6,739,492",13100000,"$19,839,492"


## Choose Header or column labels

In [16]:
df = pd.read_excel('IMDB.xlsx', sheet_name=1, header=3)
df.head()

Unnamed: 0,3,50/50 (2011),7.7,283935,Comedy,Drama,Romance,72,8000000,100 min,...,7.4,7.4.1,7.5,7.4.2,7,7.9.3,7.6.3,"$35,014,192",4173591,"$39,187,783"
0,4,About Time (2013),7.8,225412,Comedy,Drama,Fantasy,,12000000.0,123 min,...,7.6,7.5,7.8,7.7,6.9,7.8,7.7,"$15,322,921",71777528,"$87,100,449"
1,5,Amour (2012),7.9,76121,Drama,Romance,,94.0,8900000.0,127 min,...,7.9,7.8,8.1,6.6,7.2,7.9,7.8,"$6,739,492",13100000,"$19,839,492"
2,6,Argo (2012),7.7,486840,Action,Biography,Drama,86.0,44500000.0,120 min,...,7.7,7.7,8.0,8.1,7.2,8.0,7.6,"$136,025,503",96300000,"$232,325,503"
3,7,Arrival (2016),8.0,370842,Drama,Mystery,Sci-Fi,81.0,47000000.0,116 min,...,7.6,7.6,7.7,8.3,7.3,8.0,7.9,"$100,546,139",102842047,"$203,388,186"
4,9,Before Midnight (2013),7.9,106553,Drama,Romance,,94.0,3000000.0,109 min,...,7.3,7.4,7.2,8.5,7.0,8.0,7.9,"$8,114,627",3061842,"$11,176,469"


## No header

In [17]:
df = pd.read_excel('IMDB.xlsx', sheet_name=1, header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,X,Title,Rating,TotalVotes,Genre1,Genre2,Genre3,MetaCritic,Budget,Runtime,...,Votes45A,Votes45AM,Votes45AF,VotesIMDB,Votes1000,VotesUS,VotesnUS,Domestic,Foreign,Worldwide
1,1,12 Years a Slave (2013),8.1,496092,Biography,Drama,History,96,20000000,134 min,...,7.8,7.8,8.1,8,7.7,8.3,8,"$56,671,993",131061209,"$187,733,202"
2,2,127 Hours (2010),7.6,297075,Adventure,Biography,Drama,82,18000000,94 min,...,7.3,7.3,7.5,7.6,7,7.7,7.6,"$18,335,230",42403567,"$60,738,797"
3,3,50/50 (2011),7.7,283935,Comedy,Drama,Romance,72,8000000,100 min,...,7.4,7.4,7.5,7.4,7,7.9,7.6,"$35,014,192",4173591,"$39,187,783"
4,4,About Time (2013),7.8,225412,Comedy,Drama,Fantasy,,12000000,123 min,...,7.6,7.5,7.8,7.7,6.9,7.8,7.7,"$15,322,921",71777528,"$87,100,449"


## Skip Rows at the beginning of the file

In [18]:
df = pd.read_excel('IMDB.xlsx', sheet_name=1, skiprows=7)

## Skip rows from the end of the file

In [19]:
df = pd.read_excel('IMDB.xlsx', sheet_name=1, skipfooter=10)

## Choose Columns

In [20]:
df = pd.read_excel('IMDB.xlsx', sheet_name= 0, usecols=[0,1])
df.head()

Unnamed: 0,X,Title
0,1,12 Years a Slave (2013)
1,2,127 Hours (2010)
2,3,50/50 (2011)
3,4,About Time (2013)
4,5,Amour (2012)


## Column Names

In [21]:
df = pd.read_excel('IMDB.xlsx', sheet_name=0, usecols=[0,1,2], names=['X','Title', 'Rating'], )
df.head()

Unnamed: 0,X,Title,Rating
0,1,12 Years a Slave (2013),8.1
1,2,127 Hours (2010),7.6
2,3,50/50 (2011),7.7
3,4,About Time (2013),7.8
4,5,Amour (2012),7.9


## Set an Index while reading data

In [22]:
df = pd.read_excel('IMDB.xlsx', sheet_name=0, index_col='Title')
df.head()

Unnamed: 0_level_0,X,Rating,TotalVotes,Genre1,Genre2,Genre3,MetaCritic,Budget,Runtime,CVotes10,...,Votes45A,Votes45AM,Votes45AF,VotesIMDB,Votes1000,VotesUS,VotesnUS,Domestic,Foreign,Worldwide
Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12 Years a Slave (2013),1,8.1,496092,Biography,Drama,History,96.0,20000000.0,134 min,75556,...,7.8,7.8,8.1,8.0,7.7,8.3,8.0,"$56,671,993",131061209,"$187,733,202"
127 Hours (2010),2,7.6,297075,Adventure,Biography,Drama,82.0,18000000.0,94 min,28939,...,7.3,7.3,7.5,7.6,7.0,7.7,7.6,"$18,335,230",42403567,"$60,738,797"
50/50 (2011),3,7.7,283935,Comedy,Drama,Romance,72.0,8000000.0,100 min,28304,...,7.4,7.4,7.5,7.4,7.0,7.9,7.6,"$35,014,192",4173591,"$39,187,783"
About Time (2013),4,7.8,225412,Comedy,Drama,Fantasy,,12000000.0,123 min,38556,...,7.6,7.5,7.8,7.7,6.9,7.8,7.7,"$15,322,921",71777528,"$87,100,449"
Amour (2012),5,7.9,76121,Drama,Romance,,94.0,8900000.0,127 min,11093,...,7.9,7.8,8.1,6.6,7.2,7.9,7.8,"$6,739,492",13100000,"$19,839,492"


## Handle missing data while reading

In [23]:
df = pd.read_excel('IMDB.xlsx', sheet_name= 0, na_values=[' '])

# Reading data from some other popular formats

## Reading JSON data into Pandas

In [24]:
movies_json = pd.read_json('IMDB.json')
movies_json.head()

Unnamed: 0,X,Title,Rating,TotalVotes,Genre1,Genre2,Genre3,MetaCritic,Budget,Runtime,...,Votes45A,Votes45AM,Votes45AF,VotesIMDB,Votes1000,VotesUS,VotesnUS,Domestic,Foreign,Worldwide
0,1,12 Years a Slave�(2013),8.1,496092,Biography,Drama,History,96.0,20000000,134 min,...,7.8,7.8,8.1,8.0,7.7,8.3,8.0,"$56,671,993",131061209,"$187,733,202"
1,2,127 Hours�(2010),7.6,297075,Adventure,Biography,Drama,82.0,18000000,94 min,...,7.3,7.3,7.5,7.6,7.0,7.7,7.6,"$18,335,230",42403567,"$60,738,797"
2,3,50/50�(2011),7.7,283935,Comedy,Drama,Romance,72.0,8000000,100 min,...,7.4,7.4,7.5,7.4,7.0,7.9,7.6,"$35,014,192",4173591,"$39,187,783"
3,4,About Time�(2013),7.8,225412,Comedy,Drama,Fantasy,,12000000,123 min,...,7.6,7.5,7.8,7.7,6.9,7.8,7.7,"$15,322,921",71777528,"$87,100,449"
4,5,Amour�(2012),7.9,76121,Drama,Romance,,94.0,8900000,127 min,...,7.9,7.8,8.1,6.6,7.2,7.9,7.8,"$6,739,492",13100000,"$19,839,492"


## Reading HTML data

In [25]:
pd.read_html('IMDB.html')

[      0                                  1       2           3       4   \
 0      X                              Title  Rating  TotalVotes  Genre1   
 1      1             12 Years a Slave(2013)     8.1         496     092   
 2      2                   127 Hours (2010)     7.6         297     075   
 3      3                       50/50 (2011)     7.7         283     935   
 4      4                  About Time (2013)     7.8         225     412   
 ..   ...                                ...     ...         ...     ...   
 113  114                    Whiplash (2014)     8.5         492     285   
 114  115              Wreck-It Ralph (2012)     7.7         295     125   
 115  116  X-Men: Days of Future Past (2014)       8         560     736   
 116  117          X-Men: First Class (2011)     7.8         556     713   
 117  118                    Zootopia (2016)     8.1         309     474   
 
             5          6           7       8        9   ...        48  \
 0       Gen

## Read a pickle file

In [26]:
df = pd.read_pickle('IMDB.p')
df.head()

Unnamed: 0,X,Title,Rating,TotalVotes,Genre1,Genre2,Genre3,MetaCritic,Budget,Runtime,...,Votes45A,Votes45AM,Votes45AF,VotesIMDB,Votes1000,VotesUS,VotesnUS,Domestic,Foreign,Worldwide
0,1,12 Years a SlaveÂ (2013),8.1,496092,Biography,Drama,History,96.0,20000000.0,134 min,...,7.8,7.8,8.1,8.0,7.7,8.3,8.0,"$56,671,993",131061209,"$187,733,202"
1,2,127 HoursÂ (2010),7.6,297075,Adventure,Biography,Drama,82.0,18000000.0,94 min,...,7.3,7.3,7.5,7.6,7.0,7.7,7.6,"$18,335,230",42403567,"$60,738,797"
2,3,50/50Â (2011),7.7,283935,Comedy,Drama,Romance,72.0,8000000.0,100 min,...,7.4,7.4,7.5,7.4,7.0,7.9,7.6,"$35,014,192",4173591,"$39,187,783"
3,4,About TimeÂ (2013),7.8,225412,Comedy,Drama,Fantasy,,12000000.0,123 min,...,7.6,7.5,7.8,7.7,6.9,7.8,7.7,"$15,322,921",71777528,"$87,100,449"
4,5,AmourÂ (2012),7.9,76121,Drama,Romance,,94.0,8900000.0,127 min,...,7.9,7.8,8.1,6.6,7.2,7.9,7.8,"$6,739,492",13100000,"$19,839,492"


## Read SQL data

In [27]:
import sqlite3

In [28]:
conn = sqlite3.connect("IMDB.sqlite")
df = pd.read_sql_query("SELECT * FROM IMDB;", conn)
df.head()

Unnamed: 0,X,Title,Rating,TotalVotes,Genre1,Genre2,Genre3,MetaCritic,Budget,Runtime,...,Votes45A,Votes45AM,Votes45AF,VotesIMDB,Votes1000,VotesUS,VotesnUS,Domestic,Foreign,Worldwide
0,1,12 Years a Slave (2013),8.1,496092,Biography,Drama,History,96.0,20000000,134 min,...,7.8,7.8,8.1,8.0,7.7,8.3,8.0,"$56,671,993",131061209,"$187,733,202"
1,2,127 Hours (2010),7.6,297075,Adventure,Biography,Drama,82.0,18000000,94 min,...,7.3,7.3,7.5,7.6,7.0,7.7,7.6,"$18,335,230",42403567,"$60,738,797"
2,3,50/50 (2011),7.7,283935,Comedy,Drama,Romance,72.0,8000000,100 min,...,7.4,7.4,7.5,7.4,7.0,7.9,7.6,"$35,014,192",4173591,"$39,187,783"
3,4,About Time (2013),7.8,225412,Comedy,Drama,Fantasy,,12000000,123 min,...,7.6,7.5,7.8,7.7,6.9,7.8,7.7,"$15,322,921",71777528,"$87,100,449"
4,5,Amour (2012),7.9,76121,Drama,Romance,,94.0,8900000,127 min,...,7.9,7.8,8.1,6.6,7.2,7.9,7.8,"$6,739,492",13100000,"$19,839,492"


## Read data from clipboard

In [30]:
df = pd.read_clipboard()
df.head()

Unnamed: 0,X,Title,Rating,TotalVotes,Genre1,Genre2,Genre3,MetaCritic,Budget,Runtime,...,Votes45A,Votes45AM,Votes45AF,VotesIMDB,Votes1000,VotesUS,VotesnUS,Domestic,Foreign,Worldwide
0,1,12 Years a SlaveÂ (2013),8.1,496092,Biography,Drama,History,96.0,20000000.0,134 min,...,7.8,7.8,8.1,8.0,7.7,8.3,8.0,"$56,671,993",131061209,"$187,733,202"
1,2,127 HoursÂ (2010),7.6,297075,Adventure,Biography,Drama,82.0,18000000.0,94 min,...,7.3,7.3,7.5,7.6,7.0,7.7,7.6,"$18,335,230",42403567,"$60,738,797"
2,3,50/50Â (2011),7.7,283935,Comedy,Drama,Romance,72.0,8000000.0,100 min,...,7.4,7.4,7.5,7.4,7.0,7.9,7.6,"$35,014,192",4173591,"$39,187,783"
3,4,About TimeÂ (2013),7.8,225412,Comedy,Drama,Fantasy,,12000000.0,123 min,...,7.6,7.5,7.8,7.7,6.9,7.8,7.7,"$15,322,921",71777528,"$87,100,449"
4,5,AmourÂ (2012),7.9,76121,Drama,Romance,,94.0,8900000.0,127 min,...,7.9,7.8,8.1,6.6,7.2,7.9,7.8,"$6,739,492",13100000,"$19,839,492"
