# Spring Cleaning!

Harold's stock data is a mess! Help him clean up his data before the auditors arrive!

In [35]:
# Import Libraries
import pandas as pd
from pathlib import Path

### Load CSV data into Pandas using `read_csv`

In [36]:
# Build the relative CSV location with pathlib, then read it into a DataFrame.
csv_path = Path("../../Resources/stock_data.csv")
df = pd.read_csv(csv_path)

### Identify the number of rows and columns (shape) in the DataFrame.

In [37]:
# Display the DataFrame's dimensions as a (rows, columns) tuple.
df.shape

(504, 14)

### Generate a sample of the data to visually confirm that it has been loaded correctly.

In [38]:
# Show five randomly chosen rows; no seed is set, so the rows differ per run.
df.sample(n=5)

Unnamed: 0,symbol,name,sector,price,price_per_earnings,dividend_yield,earnings_per_share,52_week_low,52_week_high,market_cap,ebitda,price_per_sales,price_per_book,sec_filings
404,SCG,SCANA Corp,Utilities,35.6,8.75,6.683033,4.16,71.28,35.31,5229449000.0,1459000000.0,1.651705,0.92,http://www.sec.gov/cgi-bin/browse-edgar?action...
184,XOM,Exxon Mobil Corp.,Energy,76.07,21.37,4.00312,1.88,89.3,76.05,326148700000.0,39052000000.0,1.770194,1.85,http://www.sec.gov/cgi-bin/browse-edgar?action...
316,MNST,Monster Beverage,Consumer Staples,61.99,42.17,0.0,1.19,70.215,41.02,36403830000.0,1229478000.0,14.152587,9.56,http://www.sec.gov/cgi-bin/browse-edgar?action...
237,HUM,Humana Inc.,Health Care,262.37,22.39,0.604001,4.06,293.35,189.01,36973620000.0,0.0,0.92569,3.33,http://www.sec.gov/cgi-bin/browse-edgar?action...
2,ABT,Abbott Laboratories,Health Care,56.27,22.51,1.908982,0.26,64.6,42.28,102121000000.0,5744000000.0,3.74048,3.19,http://www.sec.gov/cgi-bin/browse-edgar?action...


### Identify the number of records in the DataFrame, and compare it with the number of rows in the original file.

In [39]:
# Row count of the DataFrame.
len(df)

504

### Identify null records

In [40]:
# Count the missing values in each column.
df.isnull().sum()

symbol                 0
name                   2
sector                 3
price                  4
price_per_earnings     7
dividend_yield         5
earnings_per_share     6
52_week_low            4
52_week_high           4
market_cap             4
ebitda                12
price_per_sales        4
price_per_book        12
sec_filings            4
dtype: int64

### Drop Null Records

In [41]:
# Discard every row (axis 0) that has a missing value in any column.
df = df.dropna(axis=0, how="any")

### Validate nulls have been dropped

In [42]:
# Per-column null counts, then their grand total across the whole frame.
remaining_nulls_per_column = df.isna().sum()
remaining_nulls_per_column.sum()

0

### Default null `ebitda` values to 0. Then, validate no records are null for ebitda.

In [47]:
# Default any missing `ebitda` values to 0. Rows containing nulls were already
# dropped earlier in this notebook, so no values are expected to change here;
# the step is kept so the cell does what its heading describes.
# Passing a column -> value mapping to DataFrame.fillna returns a new frame,
# avoiding `inplace=True` on a column selection.
df = df.fillna({"ebitda": 0})

# Validate: number of null `ebitda` values remaining.
df["ebitda"].isna().sum()

### Drop Duplicates

In [48]:
# Remove fully duplicated rows, retaining the first occurrence of each.
df = df.drop_duplicates(keep="first")

In [49]:
# Display the DataFrame as it stands after the drop_duplicates step.
df

Unnamed: 0,symbol,name,sector,price,price_per_earnings,dividend_yield,earnings_per_share,52_week_low,52_week_high,market_cap,ebitda,price_per_sales,price_per_book,sec_filings
0,MMM,3M Company,Industrials,$222.89,24.31,2.332862,$7.92,259.77,175.490,1.387211e+11,9.048000e+09,4.390271,11.34,http://www.sec.gov/cgi-bin/browse-edgar?action...
2,ABT,Abbott Laboratories,Health Care,56.27,22.51,1.908982,0.26,64.60,42.280,1.021210e+11,5.744000e+09,3.740480,3.19,http://www.sec.gov/cgi-bin/browse-edgar?action...
3,ABBV,AbbVie Inc.,Health Care,108.48,19.41,2.499560,3.29,125.86,60.050,1.813863e+11,1.031000e+10,6.291571,26.14,http://www.sec.gov/cgi-bin/browse-edgar?action...
5,AYI,Acuity Brands Inc,Industrials,108.48,18.22,0.351185,7.43,225.36,142.000,6.242378e+09,5.878000e+08,1.795347,3.55,http://www.sec.gov/cgi-bin/browse-edgar?action...
6,ADBE,Adobe Systems Inc,Information Technology,185.16,52.31,0.000000,3.39,204.45,114.451,9.455021e+10,2.538040e+09,13.092818,11.06,http://www.sec.gov/cgi-bin/browse-edgar?action...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499,XYL,Xylem Inc.,Industrials,70.24,30.94,1.170079,1.83,76.81,46.860,1.291502e+10,7.220000e+08,2.726209,5.31,http://www.sec.gov/cgi-bin/browse-edgar?action...
500,YUM,Yum! Brands Inc,Consumer Discretionary,76.3,27.25,1.797080,4.07,86.93,62.850,2.700330e+10,2.289000e+09,6.313636,212.08,http://www.sec.gov/cgi-bin/browse-edgar?action...
501,ZBH,Zimmer Biomet Holdings,Health Care,115.53,14.32,0.794834,9.01,133.49,108.170,2.445470e+10,2.007400e+09,3.164895,2.39,http://www.sec.gov/cgi-bin/browse-edgar?action...
502,ZION,Zions Bancorp,Financials,50.71,17.73,1.480933,2.6,55.61,38.430,1.067068e+10,0.000000e+00,3.794579,1.42,http://www.sec.gov/cgi-bin/browse-edgar?action...


In [50]:
# Row count remaining after the preceding cleaning steps.
len(df)

478