# Importing and Managing Financial Data in Python

In [2]:
import pandas as pd

In [2]:
# Load the AMEX listings CSV using pandas' default parsing options
# (path is relative to the notebook's working directory).
amex = pd.read_csv('data/amex-listings.csv')

In [3]:
# Summarize the frame: column names, non-null counts, dtypes and memory usage.
amex.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360 entries, 0 to 359
Data columns (total 8 columns):
Stock Symbol             360 non-null object
Company Name             360 non-null object
Last Sale                346 non-null float64
Market Capitalization    360 non-null float64
IPO Year                 105 non-null float64
Sector                   238 non-null object
Industry                 238 non-null object
Last Update              360 non-null object
dtypes: float64(3), object(5)
memory usage: 22.6+ KB


### Deal with missing values

In [6]:
# Re-read the listings, explicitly declaring the string 'n/a' as a
# missing-value marker so such cells are parsed as NaN.
amex = pd.read_csv(
    'data/amex-listings.csv',
    na_values='n/a',
)

In [7]:
# Re-check non-null counts and dtypes after re-reading with na_values='n/a'.
amex.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360 entries, 0 to 359
Data columns (total 8 columns):
Stock Symbol             360 non-null object
Company Name             360 non-null object
Last Sale                346 non-null float64
Market Capitalization    360 non-null float64
IPO Year                 105 non-null float64
Sector                   238 non-null object
Industry                 238 non-null object
Last Update              360 non-null object
dtypes: float64(3), object(5)
memory usage: 22.6+ KB


### Parse dates properly

In [8]:
# Re-read the listings once more: keep 'n/a' as a missing-value marker and
# ask pandas to parse the 'Last Update' column as dates instead of strings.
amex = pd.read_csv(
    'data/amex-listings.csv',
    na_values='n/a',
    parse_dates=['Last Update'],
)

In [10]:
# Confirm the effect of parse_dates: 'Last Update' should now be datetime64[ns]
# rather than object.
amex.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360 entries, 0 to 359
Data columns (total 8 columns):
Stock Symbol             360 non-null object
Company Name             360 non-null object
Last Sale                346 non-null float64
Market Capitalization    360 non-null float64
IPO Year                 105 non-null float64
Sector                   238 non-null object
Industry                 238 non-null object
Last Update              360 non-null datetime64[ns]
dtypes: datetime64[ns](1), float64(3), object(4)
memory usage: 22.6+ KB


In [12]:
# Preview the first five rows of the parsed listings.
amex.head()

Unnamed: 0,Stock Symbol,Company Name,Last Sale,Market Capitalization,IPO Year,Sector,Industry,Last Update
0,XXII,"22nd Century Group, Inc",1.33,120628500.0,,Consumer Non-Durables,Farming/Seeds/Milling,2017-04-26
1,FAX,Aberdeen Asia-Pacific Income Fund Inc,5.0,1266333000.0,1986.0,,,2017-04-25
2,IAF,Aberdeen Australia Equity Fund Inc,6.15,139865300.0,,,,2017-04-23
3,CH,"Aberdeen Chile Fund, Inc.",7.2201,67563460.0,,,,2017-04-26
4,ABE,Aberdeen Emerging Markets Smaller Company Oppo...,13.36,128843000.0,,,,2017-04-25


### Import data from Excel

In [3]:
# Read only the 'amex' worksheet from the workbook, treating 'n/a' as missing.
# Uses `sheet_name`, the supported keyword (as in the other read_excel calls in
# this notebook); the old `sheetname` spelling was deprecated and is rejected
# by current pandas releases.
amex = pd.read_excel('data/listings.xlsx', sheet_name='amex', na_values='n/a')

  return func(*args, **kwargs)


In [4]:
# Inspect the frame loaded from the Excel worksheet (columns, non-null counts, dtypes).
amex.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 360 entries, 0 to 359
Data columns (total 7 columns):
Stock Symbol             360 non-null object
Company Name             360 non-null object
Last Sale                346 non-null float64
Market Capitalization    360 non-null float64
IPO Year                 105 non-null float64
Sector                   238 non-null object
Industry                 238 non-null object
dtypes: float64(3), object(4)
memory usage: 19.8+ KB


### Import data from two sheets

In [6]:
# Passing a list of sheet names makes read_excel return a dict that maps each
# sheet name to its DataFrame, rather than a single DataFrame.
listings = pd.read_excel('data/listings.xlsx', sheet_name=['amex', 'nasdaq'], na_values='n/a')

### `listings` is a dictionary
- Keys: sheet names
- Values: DataFrames

In [7]:
# Look up the 'nasdaq' DataFrame by its sheet-name key and summarize it.
listings['nasdaq'].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3167 entries, 0 to 3166
Data columns (total 7 columns):
Stock Symbol             3167 non-null object
Company Name             3167 non-null object
Last Sale                3165 non-null float64
Market Capitalization    3167 non-null float64
IPO Year                 1386 non-null float64
Sector                   2767 non-null object
Industry                 2767 non-null object
dtypes: float64(3), object(4)
memory usage: 173.3+ KB


### Get sheet names

In [8]:
# Open the workbook as a pd.ExcelFile object so its sheet names can be
# inspected before any worksheet is read into a DataFrame.
xls = pd.ExcelFile('data/listings.xlsx')

In [10]:
# List the workbook's worksheet names; each sheet holds one exchange's listings.
exchanges = xls.sheet_names
print(exchanges)

['amex', 'nasdaq', 'nyse']


### Combine the data from multiple worksheets
### Combine data frames

In [12]:
# Load the 'amex' worksheet as a DataFrame, treating 'n/a' as missing.
amex = pd.read_excel('data/listings.xlsx', sheet_name='amex', na_values='n/a')

In [13]:
# Load the 'nyse' worksheet as a DataFrame, treating 'n/a' as missing.
nyse = pd.read_excel('data/listings.xlsx', sheet_name='nyse', na_values='n/a')

In [14]:
# Stack the AMEX and NYSE rows into one frame and summarize it. concat keeps
# each input's own row labels, so index values repeat across the two frames
# unless ignore_index=True is passed.
pd.concat([amex, nyse]).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3507 entries, 0 to 3146
Data columns (total 7 columns):
Stock Symbol             3507 non-null object
Company Name             3507 non-null object
Last Sale                3425 non-null float64
Market Capitalization    3507 non-null float64
IPO Year                 1466 non-null float64
Sector                   2415 non-null object
Industry                 2415 non-null object
dtypes: float64(3), object(4)
memory usage: 219.2+ KB


### Add a reference column

In [15]:
# Tag every row with its source exchange so the origin is still identifiable
# after the frames are combined.
amex['Exchange'] = 'AMEX'
nyse['Exchange'] = 'NYSE'

### Automate combining DataFrames

In [19]:
# Build one DataFrame per worksheet, tagging each with the sheet it came from.
xls = pd.ExcelFile('data/listings.xlsx')
exchanges = xls.sheet_names
listings = []  # collects one DataFrame per exchange
for exchange in exchanges:
    # Read from the already-open ExcelFile; na_values='n/a' keeps the
    # missing-value handling consistent with the other read_excel calls
    # in this notebook.
    listing = pd.read_excel(xls, sheet_name=exchange, na_values='n/a')
    listing['Exchange'] = exchange  # reference column: source sheet name
    listings.append(listing)

In [20]:
# Stack the per-exchange DataFrames in the list into a single frame. Each
# frame's original row labels are kept, so index values repeat across exchanges.
combined_listings = pd.concat(listings)

In [21]:
# Verify the combined frame: total row count and the added 'Exchange' column.
combined_listings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6674 entries, 0 to 3146
Data columns (total 8 columns):
Stock Symbol             6674 non-null object
Company Name             6674 non-null object
Last Sale                6590 non-null float64
Market Capitalization    6674 non-null float64
IPO Year                 2852 non-null float64
Sector                   5182 non-null object
Industry                 5182 non-null object
Exchange                 6674 non-null object
dtypes: float64(3), object(5)
memory usage: 469.3+ KB
