In [None]:
# we'll import these three packages

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# pandas

## Importing Data

Usually tabular data is stored in csv files
- **C**omma **S**eparated **V**alues
- Can export Excel data as a csv file!

In [None]:
# read in a csv file using pandas
# (the path is relative, so it is looked up from the notebook's working directory)
df = pd.read_csv('data/marathon_results_2015.csv')

In [None]:
# Who's that object???
# type() reports the class of the object that pd.read_csv gave us

type(df)

# It's a DataFrame!

In [None]:
# Jupyter (and other software that reads ipynb files) displays a df nicely
# (a bare expression on the last line of a cell is shown as the cell's output)
df

In [None]:
# let's import a dataframe from the seaborn package
# (later cells that call sns.load_dataset rely on this import having been run)
import seaborn as sns

# NOTE: this rebinds `df` -- the marathon data loaded earlier is replaced by the diamonds data
df = sns.load_dataset('diamonds')

# if you don't have seaborn you can use the following command to pull in the same data
#df = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/diamonds.csv')

# see the first few rows
df.head()

In [None]:
# see the last few rows (the counterpart of head())
df.tail()

In [None]:
# Just one! The argument is the number of rows to show
df.head(1)

In [None]:
# let's learn some more about this data
# .columns holds the column labels
df.columns

In [None]:
# .index holds the row labels
df.index

In [None]:
# len() of a DataFrame is its number of rows
len(df)

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# summary statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

In [None]:
# what datatype is each column?
# (.dtypes lists one dtype per column)
df.dtypes

In [None]:
# we can create a dataframe from scratch in multiple ways

# here the data is described by a dictionary:
#   key   -> column name
#   value -> that column's values
column_spec = {
    "A": 1.0,
    "B": pd.Timestamp("20130102"),
    "C": pd.Series(1, index=list(range(4)), dtype="float32"),
    "D": np.array([3] * 4, dtype="int32"),
    "E": pd.Categorical(["test", "train", "test", "train"]),
    "F": "foo",
}

# what do you notice here about the output?
pd.DataFrame(column_spec)

In [None]:
# here's my favorite way

# a list of rows: each row is a dictionary where the keys are the column names
# and the values are that row's values
colors = ['red', 'blue', 'green']

rows = [
    {
        'Student_Id': idx,
        'Student Name': f'John{idx}',
        # cycle through the colors; using len(colors) instead of a hard-coded 3
        # keeps this correct if colors are added to or removed from the list
        'Favorite Color': colors[idx % len(colors)],
    }
    for idx in range(5)
]

# one DataFrame row per dictionary, one column per key
pd.DataFrame(rows)

## Selecting data from a DataFrame

In [None]:
# select a column
# (square brackets with a single column label)
df['carat']

In [None]:
# what kind of object is a single column?
type(df['carat'])

In [None]:
# can slice like a list
# (rows at positions 4 through 9 -- as with lists, the stop value is excluded)
df[4:10]

In [None]:
# Use loc to get individual rows/values by their index label
df.loc[10]

In [None]:
# row label first, then column label -> a single value
df.loc[10, 'carat']

In [None]:
# can pass lists to loc
# (a list of row labels selects several rows at once)
df.loc[[1, 3, 5]]

In [None]:
# lists for both the row labels and the column labels
df.loc[[1, 3,5], ['carat', 'cut']]

In [None]:
# can also use loc to slice
# (unlike list slicing, a loc slice is by label and includes the stop label, here 15)
df.loc[10:15]

In [None]:
# .iloc selects by integer position instead of by label
# (positional slices exclude the stop: row positions 10-14, column positions 0-2)
df.iloc[10:15, 0:3]

In [None]:
# take the rows at positions 5 through 19; the selected rows keep their original index labels
small_df = df.iloc[5:20]
# remember that these methods don't alter "df", but output a new DataFrame object
small_df

What is the difference between ```.loc[5]``` and ```.iloc[5]``` using ```small_df```? Why?

In [None]:
# iloc is positional: this is the sixth row of small_df, whatever its label is
small_df.iloc[5]

In [None]:
# Boolean indexing
# (keeps only the rows for which the condition inside the brackets is True)
df[ df['carat'] > 1.5 ]

In [None]:
# what is going on here?
# (this is the expression that was placed inside the brackets in the previous cell)
df['carat'] > 1.5

## Exploration

In [None]:
# switch back to the marathon data: `df` was holding the diamonds data until now
df = pd.read_csv('data/marathon_results_2015.csv')
df

In [None]:
# is a value missing?
# (checked here only for the 'Name' column, row labels 0 through 10 -- loc slices include the stop)
pd.isna(df.loc[0:10, ['Name']])

In [None]:
# apply to whole dataframe
# (the result has the same shape as df, with True wherever a value is missing)
df.isna()

In [None]:
# sum each column
# remember that F = 0, T = 1
# (so summing the True/False values counts the True ones)
# what does the output mean?
df.isna().sum()

In [None]:
# drop rows with missing values
# (this returns a new DataFrame; df itself is not modified because the result is not assigned)
df.dropna(axis=0)

In [None]:
# drop columns with missing values
# (again a new DataFrame is returned; df itself is not modified)
df.dropna(axis=1)

In [None]:
# nice built-in plotting functions
# (a histogram of the Age column, split into 5 bins)
df['Age'].hist(bins=5)

In [None]:
# transform the data
# (count how many times each distinct value of the 'M/F' column occurs)
df['M/F'].value_counts()

In [None]:
# mean of the Age, grouped by Country
# (select the two columns, group the rows by Country, then average within each group)
df[['Country','Age']].groupby('Country').mean()

In [None]:
# as_index=False keeps Country as a regular column instead of turning it into the index
df[['Country','Age']].groupby('Country', as_index=False).mean()

In [None]:
# The cells above only displayed their results; nothing was saved.
# To keep working with the grouped table, bind it to a variable name.
country_age = df[['Country', 'Age']]
age_avg = country_age.groupby('Country', as_index=False).mean()

In [None]:
# sort by average age!
# (returns a new, sorted DataFrame; age_avg itself keeps its original order)
age_avg.sort_values(by='Age')

A comparison such as ```df['carat'] > 1.5``` (see the Boolean indexing cells above) outputs a Series of True/False values the same length as the DataFrame.
- Look at ```small_df```
- Print out ```small_df['color'] == 'I'```
- Think about why ```small_df[small_df['color'] == 'I']``` outputs a DataFrame with rows where the color is labelled ```I```.

## Lab 1 Part 1: Pandas

Check out the penguins dataset for the following questions.

Load it with seaborn:

```df = sns.load_dataset('penguins')```

or, if you don't have seaborn, read the same data directly from the CSV file:

```df = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv')```


1. What are the variables in this dataset? Which are categorical? Which are continuous? Discrete? (Use pandas commands to answer these questions!)
2. Which columns have missing values? How many?
3. Create a new column giving the body mass in *pounds*. (Hint: you may need to use Google!)
4. How many species types are there? How many penguins are there for each species type?
5. Which species type, on average, weighs more? Does there seem to be a difference between male and female weight?
6. Sort the penguins by bill length.
7. Create a dataframe of only penguins of species "Adelie".
8. Generate a hypothesis about the data, then answer it! (see Q5 above for example)

In [None]:
# load the penguins data for the lab straight from the URL (this rebinds `df` once more)
df = pd.read_csv('https://raw.githubusercontent.com/mwaskom/seaborn-data/master/penguins.csv')

In [None]:
# display the penguins DataFrame
df

# matplotlib

- Let's start with some np.arrays!

In [None]:
# build a 1-D array from a Python list
x = np.array([1,2,3,4])
x

In [None]:
# what kind of object does np.array give us?
type(x)

In [None]:
# most useful np.array attribute!
# (shape is an attribute, not a method, so there are no parentheses)
x.shape

In [None]:
# a list of lists becomes a 2-D array: three inner lists of two numbers each
x = np.array([[1,2],[3,4],[5,6]])
x.shape

In [None]:
# display the 2-D array itself
x

In [None]:
# What is this doing?
# (hint: compare low, high and size with the values you get; run the cell more than once)
x = np.random.randint(low=1, high=10, size=10)
x

In [None]:
# and this?
# (hint: look at the range of the values and at how many there are)
x = np.random.rand(100)
x

Now let's do some plotting

In [None]:
# a simple line chart: the first list holds the x values, the second the y values
# (without plt.show() the cell also prints the object returned by plt.plot)
plt.plot([0,1,2,3,4], [2,3,5,10,15])

In [None]:
# better to use plt.show rather than "printing" the line chart
# each plt.show() finishes the current figure, so the two lines end up in two separate charts
x_vals = [0, 1, 2, 3, 4]

plt.plot(x_vals, [2, 3, 5, 10, 15])
plt.show()

plt.plot(x_vals, [1, 1, 1, 1, 1])
plt.show()

In [None]:
# scatter plot!
# the format string 'ro' asks for red ('r') circle markers ('o') with no connecting line
x_vals = [0, 1, 2, 3, 4]
y_vals = [2, 3, 5, 10, 15]
plt.plot(x_vals, y_vals, 'ro')
plt.show()

In [None]:
# plt takes in np arrays!
# np.arange(k) makes a sequence of k numbers
plt.plot(np.arange(5), np.random.randint(10, size=5))
plt.show()

# these three names are reused by the plotting cells that follow
start = 0
stop = 10
num_pts = 50
# linspace creates a sequence of points using a start, stop, and number of points option
# size=num_pts (rather than a hard-coded 50) keeps the number of y values equal to the
# number of x values, so changing num_pts cannot make the two lengths disagree
plt.plot(np.linspace(start, stop, num_pts), np.random.randint(10, size=num_pts), 'ro')
plt.show()

In [None]:
# start, stop and num_pts come from the previous cell
# size=num_pts (rather than a hard-coded 50) keeps the number of y values equal to the
# number of x values if num_pts is changed
plt.plot(np.linspace(start, stop, num_pts), np.random.randint(10, size=num_pts), 'ro')

# easy to format the graph!
plt.xlabel(f'{num_pts} numbers from {start} to {stop}')
plt.ylabel('Random integers!')
plt.title('This is a Scatter Plot')
plt.show()

In [None]:
# create a figure and one subplot
# (nothing is drawn on ax before plt.show(), so only the empty axes are shown)
fig, ax = plt.subplots()
plt.show()

In [None]:
# a figure with one subplot; ax is the object we draw on
fig, ax = plt.subplots()

# populate the subplot
# (start, stop and num_pts were defined in an earlier cell; size=num_pts keeps the
# number of y values equal to the number of x values if num_pts is changed)
ax.plot(np.linspace(start, stop, num_pts), np.random.randint(10, size=num_pts), 'ro')
ax.set_title('A single plot')

# show it
plt.show()

In [None]:
# multiple subplots
# with a single column, axes is a 1-D array holding one subplot per row
fig, axes = plt.subplots(nrows=5, ncols=1)

# loop over however many subplots were created instead of repeating the 5,
# and use size=num_pts so the y values always match the num_pts x values
# (start, stop and num_pts were defined in an earlier cell)
for idx in range(len(axes)):
    axes[idx].plot(np.linspace(start, stop, num_pts), np.random.randint(10, size=num_pts), 'ro')

plt.show()

In [None]:
# a little bigger now
# figsize = (width, height)
fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(10,20))

# axes is now a 2-D array indexed as axes[row, column]; take the loop bounds from
# its shape so they cannot drift away from nrows/ncols above
n_rows, n_cols = axes.shape

for idx in range(n_rows):
    for idx2 in range(n_cols):
        # can use scatter instead of plot too
        # (size=num_pts keeps the y values in step with the num_pts x values;
        # start, stop and num_pts were defined in an earlier cell)
        axes[idx, idx2].scatter(np.linspace(start, stop, num_pts), np.random.randint(10, size=num_pts))

plt.show()

In [None]:
# let's make some bar graphs
# back to the diamonds data (relies on the earlier `import seaborn as sns` cell having been run)
df = sns.load_dataset('diamonds')
# how many diamonds there are of each color
color_counts = df['color'].value_counts()
color_counts

In [None]:
# one bar per color: the index of color_counts holds the colors, the values hold the counts
plt.bar(color_counts.index, color_counts.values)
# as with the line charts above, plt.show() displays the figure without also
# printing the object returned by plt.bar
plt.show()

## Lab 1 Part 2: Matplotlib

1. Plot the line ```y=2x``` from ```x=0``` to ```x=30``` using ```plt.plot```.
2. Create a scatter plot where the x values are the first 100 integers in ascending order and the y values are the first 100 integers in descending order.

Use the penguins dataset from Part 1 for the following questions

3. Create *one* figure with four histograms, one for each of the numeric variables.
4. Create a bar chart to answer the question "How many penguins are there for each species type?"
5. Make the bar chart **horizontal**!
6. Make a scatter plot comparing two of the numeric variables. Is there any relationship here?

BONUS: Incorporate a *third* variable by coloring the scatter plot points according to ```island```.