In [1]:
import pandas as pd
from pathlib import Path

In [2]:
#Read the stock CSV into a DataFrame. The path is relative, so it is resolved
#against the notebook's current working directory.
csvpath = Path('stock_data.csv')
stock_data = pd.read_csv(filepath_or_buffer=csvpath)

In [3]:
#Show the DataFrame's dimensions as a (row_count, column_count) tuple.
#This is the baseline row count that the per-column counts below are compared against.
stock_data.shape

(504, 14)

In [5]:
#Preview the first five rows as a visual sanity check that the file parsed correctly.
stock_data.head(n=5)

Unnamed: 0,symbol,name,sector,price,price_per_earnings,dividend_yield,earnings_per_share,52_week_low,52_week_high,market_cap,ebitda,price_per_sales,price_per_book,sec_filings
0,MMM,3M Company,Industrials,$222.89,24.31,2.332862,$7.92,259.77,175.49,138721100000.0,9048000000.0,4.390271,11.34,http://www.sec.gov/cgi-bin/browse-edgar?action...
1,AOS,A.O. Smith Corp,Industrials,,,,,,,,,,,
2,ABT,Abbott Laboratories,Health Care,56.27,22.51,1.908982,0.26,64.6,42.28,102121000000.0,5744000000.0,3.74048,3.19,http://www.sec.gov/cgi-bin/browse-edgar?action...
3,ABBV,AbbVie Inc.,Health Care,108.48,19.41,2.49956,3.29,125.86,60.05,181386300000.0,10310000000.0,6.291571,26.14,http://www.sec.gov/cgi-bin/browse-edgar?action...
4,ATVI,Activision Blizzard,Information Technology,65.83,,0.431903,1.28,74.945,38.93,52518670000.0,2704000000.0,10.59512,5.16,http://www.sec.gov/cgi-bin/browse-edgar?action...


In [6]:
#Count the non-null values in every column. Any column whose count is lower
#than the row count reported by .shape contains missing values.
stock_data.notna().sum()

symbol                504
name                  502
sector                501
price                 500
price_per_earnings    497
dividend_yield        499
earnings_per_share    498
52_week_low           500
52_week_high          500
market_cap            500
ebitda                492
price_per_sales       500
price_per_book        492
sec_filings           500
dtype: int64

In [9]:
#Identify null records: the mean of the boolean null mask is the fraction of
#missing values per column; multiplying by 100 expresses it as a percentage.
stock_data.isna().mean().mul(100)

symbol                0.000000
name                  0.396825
sector                0.595238
price                 0.793651
price_per_earnings    1.388889
dividend_yield        0.992063
earnings_per_share    1.190476
52_week_low           0.793651
52_week_high          0.793651
market_cap            0.793651
ebitda                2.380952
price_per_sales       0.793651
price_per_book        2.380952
sec_filings           0.793651
dtype: float64

In [13]:
#Drop every row that has a null in at least one column.
#(axis=0 / how='any' are the defaults, spelled out here for clarity.)
stock_data = stock_data.dropna(axis=0, how='any')

In [14]:
#Confirm the drop worked: the per-column total of null values should be 0 everywhere.
stock_data.isna().sum()

symbol                0
name                  0
sector                0
price                 0
price_per_earnings    0
dividend_yield        0
earnings_per_share    0
52_week_low           0
52_week_high          0
market_cap            0
ebitda                0
price_per_sales       0
price_per_book        0
sec_filings           0
dtype: int64

In [15]:
#Replace any null ebitda value with 0, leaving all other columns untouched.
#Note: rows containing nulls were already removed by dropna() above, so at this
#point in the notebook this step does not change any values.
stock_data = stock_data.fillna({'ebitda': 0})

In [17]:
#Verify that ebitda has no nulls left: the sum of its null mask should be 0.
stock_data['ebitda'].isna().sum()

0

In [19]:
#Drop duplicate rows.
#drop_duplicates() returns a new DataFrame instead of modifying stock_data in
#place, so the result has to be assigned back; otherwise the duplicates are
#only removed from the displayed output and remain in every later cell.
stock_data = stock_data.drop_duplicates()
stock_data

Unnamed: 0,symbol,name,sector,price,price_per_earnings,dividend_yield,earnings_per_share,52_week_low,52_week_high,market_cap,ebitda,price_per_sales,price_per_book,sec_filings
0,MMM,3M Company,Industrials,$222.89,24.31,2.332862,$7.92,259.77,175.490,1.387211e+11,9.048000e+09,4.390271,11.34,http://www.sec.gov/cgi-bin/browse-edgar?action...
2,ABT,Abbott Laboratories,Health Care,56.27,22.51,1.908982,0.26,64.60,42.280,1.021210e+11,5.744000e+09,3.740480,3.19,http://www.sec.gov/cgi-bin/browse-edgar?action...
3,ABBV,AbbVie Inc.,Health Care,108.48,19.41,2.499560,3.29,125.86,60.050,1.813863e+11,1.031000e+10,6.291571,26.14,http://www.sec.gov/cgi-bin/browse-edgar?action...
5,AYI,Acuity Brands Inc,Industrials,108.48,18.22,0.351185,7.43,225.36,142.000,6.242378e+09,5.878000e+08,1.795347,3.55,http://www.sec.gov/cgi-bin/browse-edgar?action...
6,ADBE,Adobe Systems Inc,Information Technology,185.16,52.31,0.000000,3.39,204.45,114.451,9.455021e+10,2.538040e+09,13.092818,11.06,http://www.sec.gov/cgi-bin/browse-edgar?action...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499,XYL,Xylem Inc.,Industrials,70.24,30.94,1.170079,1.83,76.81,46.860,1.291502e+10,7.220000e+08,2.726209,5.31,http://www.sec.gov/cgi-bin/browse-edgar?action...
500,YUM,Yum! Brands Inc,Consumer Discretionary,76.3,27.25,1.797080,4.07,86.93,62.850,2.700330e+10,2.289000e+09,6.313636,212.08,http://www.sec.gov/cgi-bin/browse-edgar?action...
501,ZBH,Zimmer Biomet Holdings,Health Care,115.53,14.32,0.794834,9.01,133.49,108.170,2.445470e+10,2.007400e+09,3.164895,2.39,http://www.sec.gov/cgi-bin/browse-edgar?action...
502,ZION,Zions Bancorp,Financials,50.71,17.73,1.480933,2.6,55.61,38.430,1.067068e+10,0.000000e+00,3.794579,1.42,http://www.sec.gov/cgi-bin/browse-edgar?action...


In [20]:
#With nulls and duplicates handled, the next clean-up target is the price column:
#strip the $ currency symbols from it, then cast it to float with astype.
#Start by looking at the raw values to see the formatting problem.
stock_data['price'].head(n=5)

0    $222.89
2      56.27
3     108.48
5     108.48
6     185.16
Name: price, dtype: object

In [24]:
#Strip the $ currency symbol from price so the column can be cast to float.
#This line must actually run: if it is left commented out, a fresh
#Restart & Run All leaves '$222.89' in the column and the astype cast below fails.
#regex=False treats '$' as a literal character (as a regex it is the
#end-of-string anchor), and replacing with '' rather than ' ' avoids leaving
#stray whitespace in the values.
stock_data['price'] = stock_data['price'].str.replace('$', '', regex=False)
stock_data['price']

0       222.89
2        56.27
3       108.48
5       108.48
6       185.16
        ...   
499      70.24
500       76.3
501     115.53
502      50.71
503      71.51
Name: price, Length: 478, dtype: object

In [26]:
#Look up the current dtype of the price column (object means it is still stored as strings).
stock_data.dtypes["price"]

dtype('O')

In [27]:
#Cast the price column to float; the other columns keep their existing dtypes.
stock_data = stock_data.astype({"price": "float"})

In [28]:
#Confirm the cast took effect: price should now report a float dtype.
stock_data.dtypes["price"]

dtype('float64')