# Spring Cleaning!

Harold's stock data is a mess! Help him clean up his data before the auditors arrive!

In [28]:
# Import Libraries
import pandas as pd
from pathlib import Path

### Load CSV data into Pandas using `read_csv`

In [29]:
# Locate the raw stock export (relative to the notebook's working directory)
# and read it into the DataFrame that the cells below operate on.
csv_path = Path('stock_data.csv')
df = pd.read_csv(filepath_or_buffer=csv_path)

### Identify the number of rows and columns (shape) in the DataFrame.

In [30]:
# (rows, columns) tuple for the loaded DataFrame
df.shape

(504, 14)

### Generate a sample of the data to visually confirm that it has been loaded correctly.

In [31]:
# Show five randomly chosen rows as a quick visual check of the load.
# A fixed random_state makes the same rows appear on every Restart & Run All,
# so the displayed output stays reproducible.
df.sample(n=5, random_state=42)

Unnamed: 0,symbol,name,sector,price,price_per_earnings,dividend_yield,earnings_per_share,52_week_low,52_week_high,market_cap,ebitda,price_per_sales,price_per_book,sec_filings
271,KSS,Kohl's Corp.,Consumer Discretionary,60.34,16.01,3.496504,3.12,69.14,35.16,10570860000.0,2286000000.0,0.855776,2.04,http://www.sec.gov/cgi-bin/browse-edgar?action...
247,ICE,Intercontinental Exchange,Financials,67.0,22.95,5.429864,2.37,76.1378,56.8,41373050000.0,3103000000.0,9.619987,2.62,http://www.sec.gov/cgi-bin/browse-edgar?action...
287,LYB,LyondellBasell,Materials,105.79,10.35,3.264714,12.25,121.95,78.01,43556650000.0,6851000000.0,1.303761,5.86,http://www.sec.gov/cgi-bin/browse-edgar?action...
24,LNT,Alliant Energy Corp,Utilities,37.14,19.86,3.573333,1.65,45.55,36.84,8670164000.0,1168400000.0,3.433148,2.13,http://www.sec.gov/cgi-bin/browse-edgar?action...
58,ADSK,Autodesk Inc,Information Technology,104.81,-77.07,0.0,-2.61,131.1,81.75,24348290000.0,-378100000.0,16.50682,224.13,http://www.sec.gov/cgi-bin/browse-edgar?action...


### Identify the number of records in the DataFrame, and compare it with the number of rows in the original file.

In [32]:
# Row count of the DataFrame (first entry of the shape tuple)
df.shape[0]

504

### Identify null records

In [33]:
# Count of missing values in each column (isna is the canonical name for isnull)
df.isna().sum()

symbol                 0
name                   2
sector                 3
price                  4
price_per_earnings     7
dividend_yield         5
earnings_per_share     6
52_week_low            4
52_week_high           4
market_cap             4
ebitda                12
price_per_sales        4
price_per_book        12
sec_filings            4
dtype: int64

### Drop Null Records

In [34]:
# Remove, in place, every row that has a missing value in at least one column.
# NOTE(review): this also removes rows that are missing `ebitda`, so the later
# `fillna(0)` on `ebitda` has nothing left to fill — confirm this order is intended.
df.dropna(axis=0, how='any', inplace=True)

### Validate nulls have been dropped

In [35]:
# Re-count missing values per column; every entry should now be zero
df.isna().sum()

symbol                0
name                  0
sector                0
price                 0
price_per_earnings    0
dividend_yield        0
earnings_per_share    0
52_week_low           0
52_week_high          0
market_cap            0
ebitda                0
price_per_sales       0
price_per_book        0
sec_filings           0
dtype: int64

### Default null `ebitda` values to 0. Then, validate no records are null for ebitda.

In [37]:
# Replace any missing `ebitda` entries with 0.
df['ebitda'] = df['ebitda'].fillna(0)

# Validate: number of null `ebitda` values remaining (expected to be 0).
# Left as the cell's last expression so the count is displayed.
df['ebitda'].isnull().sum()

### Drop Duplicates

In [None]:
# Discard rows that exactly repeat an earlier row, keeping the first occurrence
df = df.drop_duplicates(keep='first')