## (Part 9) - Cleaning Data - Casting Datatypes and Handling Missing Values

#### 

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Sample records mixing several "missing" markers: np.nan, None, and the
# placeholder strings 'NA' / 'Missing'. Ages are stored as strings.
people = dict(
    first=['Corey', 'Jane', 'John', 'Chris', np.nan, None, 'NA'],
    last=['Schafer', 'Doe', 'Doe', 'Schafer', np.nan, np.nan, 'Missing'],
    email=[
        'CoreyMSchafer@gmail.com',
        'JaneDoe@email.com',
        'JohnDoe@email.com',
        None,
        np.nan,
        'Anonymous@email.com',
        'NA',
    ],
    age=['33', '55', '63', '36', None, None, 'Missing'],
)


In [None]:
# Build the frame and convert the placeholder strings into real NaN values
# so pandas recognises them as missing.
df = (
    pd.DataFrame(people)
    .replace('NA', np.nan)
    .replace('Missing', np.nan)
)

In [None]:
# Display the frame after the placeholder strings were replaced with NaN.
df

In [None]:
# Preview the rows that have no missing value in any column (the defaults
# are axis='index', how='any'). The result is not assigned, so df is unchanged.
df.dropna()

In [None]:
# Preview: drop a row only when BOTH 'last' and 'email' are missing
# (how='all', evaluated over the subset columns only). df is unchanged.
df.dropna(axis='index', how='all', subset=['last','email'])

In [None]:
# Boolean mask of the same shape as df: True wherever a value is missing.
df.isna()

In [None]:
# Preview with every missing value replaced by 0. The result is not
# assigned, so df itself keeps its NaN values.
df.fillna(0)

In [None]:
# NOTE: 'age' still holds strings here (the cast to float happens in the
# next cell), so this is not a numeric mean yet -- presumably kept to show
# why the cast is needed; depending on the pandas version it may raise.
df['age'].mean()

In [None]:
# Cast 'age' from strings to float. Float rather than int because the
# column contains NaN, which is itself a float value.
df['age'] = df['age'].astype(float)

In [None]:
# Check the dtype of every column after the cast.
df.dtypes

In [None]:
# Mean age now that the column is numeric; NaN entries are skipped by default.
df['age'].mean()

## Working with StackOverflow Data

In [None]:
from datetime import datetime

In [None]:
# Load the survey responses (indexed by 'ResponseId') and the schema that
# describes each question (indexed by 'qname'). No date parsing is needed
# for these files, so no date parser is defined here.
dfs = pd.read_csv('../data/survey_results_public.csv', index_col='ResponseId')
dfs_schema = pd.read_csv('../data/survey_results_schema.csv', index_col='qname')


In [None]:
# Let the notebook show up to 85 columns and 85 rows before truncating.
pd.options.display.max_columns = 85
pd.options.display.max_rows = 85

In [None]:
# Peek at the first five survey responses.
dfs.head()

In [None]:
# First ten values of the 'YearsCode' column.
dfs['YearsCode'].head(10)

In [None]:
# List the distinct 'YearsCode' values to find entries that are not plain
# numbers before casting the column.
dfs['YearsCode'].unique()

In [None]:
# Map the text answer to a number. Assign the result back instead of using
# inplace=True on the selected column: an in-place call on dfs['YearsCode']
# is chained assignment, which does not update dfs under Copy-on-Write.
dfs['YearsCode'] = dfs['YearsCode'].replace('Less than 1 year', 0)

In [None]:
# Map the open-ended top answer to 51. Assign the result back instead of
# using inplace=True on the selected column: an in-place call on
# dfs['YearsCode'] is chained assignment, which does not update dfs under
# Copy-on-Write.
dfs['YearsCode'] = dfs['YearsCode'].replace('More than 50 years', 51)

In [None]:
# Cast 'YearsCode' to float (float so missing answers can stay NaN).
# Assumes the two text answers replaced in the preceding cells were the only
# non-numeric values -- otherwise this cast raises.
dfs['YearsCode'] = dfs['YearsCode'].astype(float)

In [None]:
# Average years of coding experience (NaN answers are skipped by default).
dfs['YearsCode'].mean()

In [None]:
# Median years of coding experience (NaN answers are skipped by default).
dfs['YearsCode'].median()

## (Part 10) - Working with Dates and Time Series Data

In [None]:
from datetime import datetime

In [None]:
# Layout of the 'Date' column: year-month-day, then the hour on a 12-hour
# clock followed by an AM/PM marker.
DATE_FORMAT = '%Y-%m-%d %I-%p'

# Read first, then convert with an explicit format. This avoids read_csv's
# deprecated `date_parser` argument and parses the whole column in one
# vectorised call instead of one strptime call per row.
dft = pd.read_csv('../data/ETH_1h.csv')
dft['Date'] = pd.to_datetime(dft['Date'], format=DATE_FORMAT)

In [None]:
# Peek at the first five rows of the hourly data.
dft.head()

In [None]:
# dft.loc[0, 'Date'].day_name()

In [None]:
# dft['Date'] = pd.to_datetime(dft['Date'], format='%Y-%m-%d %I-%p')

In [None]:
# dft['Date']

In [None]:
# Weekday name of the timestamp stored in the row labelled 0.
dft.loc[0, 'Date'].day_name()

In [None]:
# Weekday name for every row, via the .dt accessor on the datetime column.
dft['Date'].dt.day_name()

In [None]:
# Store the weekday name as a new 'DayOfWeek' column, then display the frame.
dft['DayOfWeek'] = dft['Date'].dt.day_name()
dft

In [None]:
# Earliest timestamp in the data.
dft['Date'].min()

In [None]:
# Latest timestamp in the data.
dft['Date'].max()

In [None]:
# Time span covered by the data (subtracting two timestamps gives a Timedelta).
dft['Date'].max() - dft['Date'].min()

In [None]:
# Select the rows dated within 2019: start inclusive, end exclusive.
year_start = pd.to_datetime('2019-01-01')
year_end = pd.to_datetime('2020-01-01')
on_or_after_start = dft['Date'] >= year_start
before_end = dft['Date'] < year_end
filt = on_or_after_start & before_end
dft.loc[filt]

In [None]:
# Make 'Date' the index (modifying dft in place) so rows can be selected
# and resampled by date in the cells that follow.
dft.set_index('Date', inplace=True)

In [None]:
# Display the frame, now indexed by 'Date'.
dft

In [None]:
# Partial-string indexing on the datetime index: all rows that fall in 2019.
dft.loc['2019']

In [None]:
# All rows from January through February 2020 (both months included).
dft.loc['2020-01':'2020-02']

In [None]:
# Average 'Close' over January-February 2020, selecting the rows and the
# column in a single .loc indexer.
dft.loc['2020-01':'2020-02', 'Close'].mean()

In [None]:
# Highest 'High' recorded on 2020-01-01, selecting the day's rows and the
# column in a single .loc indexer.
dft.loc['2020-01-01':'2020-01-01', 'High'].max()

In [None]:
# Downsample 'High' to one value per calendar day (the daily maximum),
# then look up the value for 2020-01-01.
highs = dft['High'].resample('D').max()
highs['2020-01-01']

In [None]:
#%matplotlib inline
#highs.plot()

In [None]:
# Weekly mean of the numeric columns. numeric_only=True is needed because
# the frame also holds text (the 'DayOfWeek' column added earlier), and
# pandas >= 2.0 raises a TypeError rather than silently dropping such columns.
# This is the cell's last expression so the notebook displays the result.
dft.resample('W').mean(numeric_only=True)

In [None]:
# Weekly resample with a different aggregation per column: mean of 'Close',
# max of 'High', min of 'Low', and total 'Volume'.
dft.resample('W').agg({'Close':'mean','High':'max', 'Low':'min', 'Volume':'sum'})

##  ########### END ########### ##