# Importing and inspecting data

In [None]:
import numpy as np
import pandas as pd

## Why Pandas?
NumPy is nice for handling homogeneous data types, but sometimes we need more flexibility as data become more complicated. We might also want a visually pleasing way to view the data.

In [None]:
# Sample data (made up employees): one record per employee as
# (ID, name, department, salary, hire date)
employee_records = [
    (101, 'John', 'Engineering', 60000, '2018-01-15'),
    (102, 'Jane', 'Engineering', 65000, '2017-05-12'),
    (103, 'Doe', 'HR', 55000, '2019-02-28'),
    (104, 'Alice', 'Marketing', 70000, '2016-11-20'),
    (105, 'Bob', 'HR', 60000, '2019-09-10'),
    (106, 'Eve', 'Marketing', 75000, '2017-04-05'),
]

# A NumPy array holds a single dtype, so the mixed ints and strings all end up as strings
employee_data = np.array(employee_records)
print(employee_data)

# Same data in a Pandas dataframe, this time with named columns
column_names = ['ID', 'Name', 'Department', 'Salary', 'Hire Date']
employee_df = pd.DataFrame(employee_data, columns=column_names).assign(
    # the salaries came over from NumPy as strings; turn them back into numbers
    Salary=lambda frame: pd.to_numeric(frame['Salary'])
)
employee_df

In [None]:
# Get the average salary by department (NumPy only)

# Column 2 holds the department names; find the unique ones
department_column = employee_data[:, 2]
unique_departments = np.unique(department_column)

# For each department: mask its rows, cast the salary strings (column 3) to float, and average
avg_salaries = [
    np.mean(employee_data[department_column == department, 3].astype(float))
    for department in unique_departments
]

print(unique_departments)
print(avg_salaries)

In [None]:
# Do the same task with Pandas: group the salaries by department, then average each group
salary_by_department = employee_df.groupby('Department')['Salary']
avg_salaries = salary_by_department.mean()
avg_salaries

## Import data

In [None]:
# read a csv in your working directory
df = 
df.head()

In [None]:
# read a csv online


In [None]:
# read in an excel file


In [None]:
# Read in an excel file online


### Inspecting data

In [None]:
# is the data frame empty?  returns true or false
df.

In [None]:
# what are the dimensions?  (returns as number of rows, number of columns)
df.

In [None]:
# what are the columns?  We know there are 26, but what are they?
df.

In [None]:
# inspect the top few rows
df.

In [None]:
# inspecting the last three rows
df.

In [None]:
# what data types are included?
df.

In [None]:
# info() gives more information, including the number of non-nulls
df.

## Methods

In [None]:
# obtain summary statistics for numeric columns
df.

In [None]:
# if we would like to just describe one column, such as mag (magnitude)
df.

In [None]:
# we can look for unique values in a column
df.

In [None]:
# Get the number in each category
df.

In [None]:
# mean of a column
df.

In [None]:
# median
df.

In [None]:
# quantile
df.

In [None]:
# sum of a column
df.

In [None]:
# min of a column
df.

In [None]:
# max of a column
df.

In [None]:
# method chaining (max of each type)
df.

### Subsetting Data

In [None]:
# attribute notation
df.

In [None]:
# dictionary notation
df

In [None]:
# selecting multiple columns
df

In [None]:
# chaining recursively to select specific columns and rows
df

In [None]:
# select all columns with object datatypes
df

In [None]:
# select all columns with numeric datatypes
df

### Indexing


In [None]:
# loc lets us make these selections without chaining
# note: the row slice is inclusive of its end label with loc
wanted_rows = slice(0, 3)          # rows 0 through 3
wanted_columns = ['mag', 'place']  # columns mag and place
df.loc[wanted_rows, wanted_columns]

In [None]:
# note: the row slice is exclusive of its end position with iloc
# NOTE(review): positions 8 and 13 presumably correspond to the mag and place columns -- confirm against df.columns
column_positions = [8, 13]
df.iloc[:4, column_positions]  # the first four rows (positions 0, 1, 2, 3)

### Filtering with conditions

In [None]:
# keep only the rows where this boolean statement is true (mag greater than or equal to 7)


In [None]:
# important columns for earthquakes with magnitude greater than or equal to 7 OR caused a tsunami


In [None]:
# Just get the earthquakes in California


In [None]:
# We might have missed some-- the USGS has tagged some locations as California and some as CA. USE REGEX!
cali_df 
cali_df

In [None]:
# if we just want the columns related to magnitude


### Finding the minimum and maximum
We might be interested in knowing the lowest and highest magnitude earthquakes which occurred in California during the time frame the data frame represents, and also knowing where and when they occurred.  Pandas lets us find the index of these extrema and then we can select the entire row.

In [None]:
# get the index of lowest and highest magnitude earthquakes in California
cali_df.mag..., cali_df.mag...

In [None]:
# ERROR! argmin()/argmax() give us the POSITION index (integer offsets within cali_df.mag),
# but loc looks rows up by index LABEL -- so these are the wrong kind of index to pass to loc
cali_df.loc[
    [cali_df.mag.argmin(), cali_df.mag.argmax()],
    ['mag', 'title', 'tsunami', 'place']
]

In [None]:
# get the index LABEL of the lowest and highest magnitude earthquakes in Cali
cali_df.mag..., cali_df.mag...

In [None]:
# idxmin()/idxmax() return index LABELS, which is what allows us to index with loc
extreme_labels = [cali_df.mag.idxmin(), cali_df.mag.idxmax()]
columns_of_interest = ['mag', 'title', 'tsunami', 'place']
cali_df.loc[extreme_labels, columns_of_interest]

The largest quake in California was in Trinidad! 

## Plotting with Pandas

In [None]:
# histograms


In [None]:
# line plots


In [None]:
# scatter plots


In [None]:
# bar charts


## Activity 

Consider the following jokes:

1. Q: Why don't scientists trust atoms?
    1. Because they make up everything.
2. Q: What do you call fake spaghetti?
    1. An impasta!
3. Q: Why did the scarecrow win an award?
    1. Because he was outstanding in his field.


Create a Pandas dataframe with the jokes in one column, their answers in another column, and your rating of the joke on a scale of 0-5 stars (ints) in another column. 

Compute your average rating of these jokes.

Access the question and answer of your highest rated joke. (The output should be a Pandas dataframe with one or more rows and two columns.)