In [None]:
# These lines import the NumPy and pandas modules.
import numpy as np
import pandas as pd

# These lines do some fancy plotting magic.
import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')
import warnings
warnings.simplefilter('ignore', FutureWarning)

# DATA/CS/Stat 11800 Intro to Data Science 1
## Fall 2021 - Now, with Pandas!

##  Demos for Lecture 3 - Arrays and Dataframes 


## Sequences: lists & arrays

In [None]:
#let's create a list of fruit - use square brackets
fruits = ['apple', 'banana','cherry','durian']
fruits

In [None]:
#you can select items from a list (starting with 0)
type(fruits[1])

In [None]:
#you can also extract a "slice" of a list
# up to but not including the end of the slice
fruits[1:3]

In [None]:
#slicing a sequence (a list here, but 1d arrays work the same way) is
#[start: end: step] with defaults 0, size, 1 - so this takes every second item
fruits[::2]

In [None]:
#you can change the size of a list
#append() adds one item to the end; note the new item here is an integer, not a string
fruits.append(5)
fruits

In [None]:
#insert() puts the new item at the given index and shifts later items to the right
fruits.insert(4,'tomato')
#show the updated list
fruits

## Arrays

In [None]:
#create an array - converting a python list to a numpy array
my_array = np.array([1,2,3,4])
print(my_array)

In [None]:
#Python's built-in sum() and the array's own sum() method give the same total
print(sum(my_array))
print(my_array.sum())

In [None]:
#add two arrays
my_array2 = np.array([5,6,7,8])
my_array + my_array2

In [None]:
#for a 1d array, the size attribute and len() report the same number of items
print(my_array.size)
print(len(my_array))

Arrays require items to all be of the same type.  Lists allow different types.

In [None]:
my_other_list = [1,2,3,4,'banana']
my_other_list

In [None]:
#mixing a float and a string: NumPy converts the float to a string
#so that every item in the array has the same type
my_other_array = np.array([1.0,'this is kinda confusing'])
print(my_other_array)

There are many methods that can be applied to arrays

In [None]:
np.count_nonzero(np.array([1,2,0,2,1,0,2]))

Ranges can be very handy for creating data.  The function `np.arange()` produces values over a half-open interval \[start,end) - the end value is not included

In [None]:
print(np.arange(4,10))

In [None]:
#if you leave out the start, the default is zero;
#if you leave out the step, the default is one
print(np.arange(10))

In [None]:
print(np.arange(1,31,2))

Here's an example of arithmetic on arrays (must be the same size)

In [None]:
# Element-wise arithmetic: each discount is taken off the price in the same position
prices = np.array([500, 210, 300])
discount = np.array([200, 10, 100])
final_price = np.subtract(prices, discount)

# Show the inputs, a blank separator line, then the result
print("prices:", prices)
print("discount:", discount)
print()
print(f"final prices are {final_price}")

In [None]:
#dividing one array by another also works element by element:
#this is the fraction of each original price that is still charged after the discount
discounted = final_price / prices 
#round to 3 decimal places to keep the printed output short
print("final prices are " + str(np.round(discounted,3)) + " of original")

## DataFrames (a.k.a. Tables)

In [None]:
df1 = pd.DataFrame(
    {"a": [4,5,6],
     "b": [7,8,9],
     "c": [10,11,12]},
    index = [1,2,3]
    )
df1

In [None]:
# or if you leave out the row index Pandas creates it by default
df2 = pd.DataFrame(
    {"a": [4,5,6],
     "b": [7,8,9],
     "c": [10,11,12]}
    )
df2

In [None]:
#alternative syntax
df3 = pd.DataFrame(
    [[4,5,6],
     [7,8,9],
     [10,11,12]],
    index = [1,2,3],
    columns=["a","b","c"]
    )
df3

In [None]:
cones = pd.read_csv('cones.csv')

In [None]:
#What columns are there in the table?
cones.columns

In [None]:
#Maybe we'd prefer a list?
cones.columns.tolist()

In [None]:
#how many rows?
len(cones)

In [None]:
#Or we could ask for both
cones.shape

In [None]:
#okay - lets look at the table contents
cones

In [None]:
#let's look at a column
cones.Flavor

In [None]:
#What's the average price of a cone?
print(cones.Price)
cones.Price.mean()

In [None]:
#Here's an alternative way to do it
cones['Flavor']

In [None]:
np.mean(cones['Price'])

In [None]:
#This notation is handy if you want to extract multiple columns
cones[['Color','Flavor']]

In [None]:
#Note, none of the above has altered our original cones dataframe
cones

In [None]:
#If you want to keep the result with fewer columns you need to assign it
#to a variable
cones_no_color = cones[['Flavor','Price']]

In [None]:
cones_no_color

### In the above we chose columns from the dataframe.  We can also choose rows

In [None]:
#You can also choose rows by slicing (here, the first three rows)
cones[:3]

## Choosing rows based on values can be done with "loc" 

In [None]:
cones.Flavor == 'chocolate'

In [None]:
#loc uses the above to keep only those rows that are "True"
cones.loc[cones.Flavor == 'chocolate']

In [None]:
#you can do row selection with multiple conditions
#(wrap each condition in parentheses and combine them with &)
cones.loc[(cones.Flavor == 'chocolate') & (cones.Price < 5.0)]

In [None]:
#You can also use this alternative syntax
cones.loc[cones['Flavor'] == 'chocolate']

## Sorting rows by values

In [None]:
#Default sort is ascending
cones.sort_values(by='Price')

In [None]:
#Can override that default if you like
cones.sort_values(by='Price', ascending=False)

In [None]:
#You don't have to sort by numeric values...
cones.sort_values(by='Flavor', ascending=False)

### NBA Salary Example


Let's use a different table - NBA player salaries (from 2015-16)

In [None]:
# This table can be found online: 
# https://www.statcrunch.com/app/index.php?dataid=1843341
# NBA players, 2015-2016 season
# I have it in my local directory
# You can use a shell command to look at the first few lines of the .csv
!head nba_salaries.csv

In [None]:
# Read the CSV into a DataFrame and preview the first 10 rows
nba = pd.read_csv('nba_salaries.csv')
nba[:10]

### That last column name is going to be hard to type, so let's change it

In [None]:
#rename() maps old column names to new ones and returns a new DataFrame,
#so the result is assigned back to nba
nba = nba.rename(columns = {"'15-'16 SALARY":"SALARY"})
#alternative (not run): replace all of the column names at once
#nba.columns = ['PLAYER','POSITION','TEAM','SALARY']
nba[:10]

In [None]:
#Keep only the rows for one team
nba.loc[nba.TEAM == 'Chicago Bulls']

In [None]:
nba.loc[nba.PLAYER == 'Stephen Curry']

In [None]:
#Who were the top 10 highest paid players
nba.sort_values(by='SALARY',ascending=False)[:10]

In [None]:
#Show the top 15 highest paid Point Guards in descending order of SALARY
nba.loc[nba.POSITION == 'PG'].sort_values(by='SALARY', ascending=False)[:15]

How complete is the data set? From nba.com/news/faq:

"How many players are on a NBA roster?
Each NBA team can have a maximum of 15 players, 13 of which can be active each game."

In [None]:
#So let's count players on teams
#One chain, no in-place mutation: count the rows for each TEAM, name the
#result COUNT, sort the largest teams first, then turn the TEAM index
#back into an ordinary column
df = (
    nba.groupby('TEAM')['TEAM']
    .count()
    .rename('COUNT')
    .sort_values(ascending=False)
    .reset_index()
)
df

Seems like it's not very complete - lots of teams with fewer than 15 (and then some with a bunch more)!!!

In [None]:
df.loc[df.COUNT < 15]

In [None]:
df.loc[df.COUNT > 15]

In [None]:
df.loc[df.COUNT == 15]

Seems like we need to understand this dataset a bit better before we draw too many conclusions from it!   We'll work on that next class.