# PART 2:

In [1]:
import pandas as pd
import numpy as np

np.random.seed(123)

In [3]:
students = ['Sally', 'Jane', 'Suzie', 'Billy', 'Ada', 'John', 'Thomas',
            'Marie', 'Albert', 'Richard', 'Isaac', 'Alan']

In [4]:
# np.random.randint excludes `high`, so each grade falls between 60 and 99
# one random grade per student for each subject
math_grades = np.random.randint(low=60, high=100, size=len(students))
english_grades = np.random.randint(low=60, high=100, size=len(students))
reading_grades = np.random.randint(low=60, high=100, size=len(students))

In [5]:
# build the example DataFrame: one row per student, one column per subject,
# plus a classroom column drawn at random from 'A' / 'B'
df = pd.DataFrame({'name': students,
                   'math': math_grades,
                   'english': english_grades,
                   'reading': reading_grades,
                   'classroom': np.random.choice(['A', 'B'], len(students))})

# SUBSETTING BY 2 COLUMNS

In [5]:
df[['name']]
# passing a LIST of column names returns a DataFrame (here with the single column 'name')

Unnamed: 0,name
0,Sally
1,Jane
2,Suzie
3,Billy
4,Ada
5,John
6,Thomas
7,Marie
8,Albert
9,Richard


In [5]:
df[['name', 'classroom']]

Unnamed: 0,name,classroom
0,Sally,A
1,Jane,B
2,Suzie,A
3,Billy,B
4,Ada,A
5,John,B
6,Thomas,A
7,Marie,A
8,Albert,A
9,Richard,A


In [7]:
#this will return a bool for whether or not the name starts with A
bools = df.name.str.startswith('A')
bools
#this is called "boolean mask"

0     False
1     False
2     False
3     False
4      True
5     False
6     False
7     False
8      True
9     False
10    False
11     True
Name: name, dtype: bool

In [8]:
df[bools]
# the boolean mask keeps only the ROWS where the mask is True
# (students whose name begins with A), with all columns

Unnamed: 0,name,math,english,reading,classroom
4,Ada,77,92,98,A
8,Albert,92,62,87,A
11,Alan,92,62,72,A


# . loc
- df.loc[row_indexer, column_indexer]

- slices are inclusive of the end label (all the way up to and INCLUDING), for both rows and columns
- indexers are labels (column names / index labels) or a boolean mask
- remember: Label = Loc

In [6]:
df.loc[:, 'math': 'reading']
#      ^row(all), ^columns
#INCLUSIVE so 'reading' is included in search

Unnamed: 0,math,english,reading
0,62,85,80
1,88,79,67
2,94,74,95
3,98,96,88
4,77,92,98
5,79,76,93
6,82,64,81
7,93,63,90
8,92,62,87
9,69,80,94


In [13]:
df.loc[1, 'math': 'reading']
#      ^row label 1, ^columns 'math' through 'reading' (inclusive)
# a single row label returns just that one row, as a Series

math       88
english    79
reading    67
Name: 1, dtype: object

In [8]:
df.loc[1:5, 'math':'reading']
#includes 5, and includes reading

Unnamed: 0,math,english,reading
1,88,79,67
2,94,74,95
3,98,96,88
4,77,92,98
5,79,76,93


In [14]:
df.loc[bools, 'name':'reading']
# the boolean mask "bools" selects the rows (names starting with A);
# the label slice selects columns 'name' through 'reading' (inclusive)

Unnamed: 0,name,math,english,reading
4,Ada,77,92,98
8,Albert,92,62,87
11,Alan,92,62,72


# .iloc
- SYNTAX: df.iloc[row_indexer, column_indexer]
- slices go up to but do NOT include the end position (exclusive), for both rows and columns
- positions start at 0
- remember: 'i'loc = integer location

In [15]:
df.iloc[:3]
#^ this only includes the row_indexer since there is no ","

Unnamed: 0,name,math,english,reading,classroom
0,Sally,62,85,80,A
1,Jane,88,79,67,B
2,Suzie,94,74,95,A


In [17]:
df.iloc[:3, 1:3]
#^ now this includes row_indexer AND column_indexer

Unnamed: 0,math,english
0,62,85
1,88,79
2,94,74


In [10]:
df.iloc[1:3, 1:4]

Unnamed: 0,math,english,reading
1,88,79,67
2,94,74,95


### df.iloc[:3, 1:3] returns the first 3 rows (positions 0, 1, 2) AND columns 1 and 2
### it does not include column 3, because the end of the slice [1:3] is EXCLUSIVE

### (ex): df.iloc[:3, 1:3]
- column 0 = name, 
- column 1= math, 
- column 2= english, 
- column 3 = reading

# AGGREGATING:
- .agg

In [13]:
df.agg(['min', 'max'])
# .agg on the whole DataFrame applies each aggregation to EVERY column
# (for text columns, min/max are alphabetical: 'Ada' ... 'Thomas')

Unnamed: 0,name,math,english,reading,classroom
min,Ada,62,62,67,A
max,Thomas,98,99,98,B


In [15]:
df.agg(['min', 'max']).T
#this is how you TRANSPOSE to help it make better sense

Unnamed: 0,min,max
name,Ada,Thomas
math,62,98
english,62,99
reading,67,98
classroom,A,B


In [17]:
df.describe()

Unnamed: 0,math,english,reading
count,12.0,12.0,12.0
mean,84.833333,77.666667,86.5
std,11.134168,13.371158,9.643651
min,62.0,62.0,67.0
25%,78.5,63.75,80.75
50%,90.0,77.5,89.0
75%,92.25,86.75,93.25
max,98.0,99.0,98.0


In [16]:
df.describe().T
#reshape your data

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
math,12.0,84.833333,11.134168,62.0,78.5,90.0,92.25,98.0
english,12.0,77.666667,13.371158,62.0,63.75,77.5,86.75,99.0
reading,12.0,86.5,9.643651,67.0,80.75,89.0,93.25,98.0


In [18]:
df.reading.agg('min')
#this will give "min" grade in reading column

67

In [20]:
df.math.agg('max')
#gives the "max" grade in math column

98

In [14]:
df.reading.agg(['min','max'])
#list will return as this

min    67
max    98
Name: reading, dtype: int64

In [22]:
df[['english', 'reading', 'math']].agg(['mean', 'min', 'max'])
#SUPER POWERFUL
#this returns mean, min and max in all three columns

Unnamed: 0,english,reading,math
mean,77.666667,86.5,84.833333
min,62.0,67.0,62.0
max,99.0,98.0,98.0


# GROUPBY
- .groupby

In [23]:
df.groupby('classroom').math.max()
#this groups by classroom. then gives the "math" max in EACH class

classroom
A    94
B    98
Name: math, dtype: int64

# GROUPBY WITH AGGREGATION

In [25]:
df.groupby('classroom').math.agg(['min', 'mean', 'max'])
#this groups by classroom then gives the min,mean and max of math for EACH class

Unnamed: 0_level_0,min,mean,max
classroom,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,62,82.625,94
B,79,89.25,98


## RENAME COLUMNS

In [19]:
groupby1 = df.groupby('classroom').math.agg(['min', 'mean', 'max'])
# assigning a list to .columns renames the columns by position,
# so the new names must be in the same order as ['min', 'mean', 'max']
groupby1.columns = ['math_min_grade', 'math_avg_grade', 'math_max_grade']
groupby1

Unnamed: 0_level_0,math_min_grade,math_avg_grade,math_max_grade
classroom,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,62,82.625,94
B,79,89.25,98


## using NP.WHERE
- create a new column and apply a condition at the same time
- create a new column based on data from another existing column
- SYNTAX: np.where(condition, this_where_True, this_where_False)

In [7]:
np.where(df.math < 70, 'failing', 'passing')

array(['failing', 'passing', 'passing', 'passing', 'passing', 'passing',
       'passing', 'passing', 'passing', 'failing', 'passing', 'passing'],
      dtype='<U7')

In [8]:
df['passing_math'] = np.where(df.math < 70, 'failing', 'passing')
# this means: create a new column called "passing_math"
# condition: math score is less than 70
# condition met (True) = 'failing'
# condition NOT met (False) = 'passing'

In [29]:
df

Unnamed: 0,name,math,english,reading,classroom,passing_math
0,Sally,62,85,80,A,failing
1,Jane,88,79,67,B,passing
2,Suzie,94,74,95,A,passing
3,Billy,98,96,88,B,passing
4,Ada,77,92,98,A,passing
5,John,79,76,93,B,passing
6,Thomas,82,64,81,A,passing
7,Marie,93,63,90,A,passing
8,Albert,92,62,87,A,passing
9,Richard,69,80,94,A,failing


In [34]:
grade_groups = df.groupby(['passing_math' , 'classroom']).reading.agg(['mean', 'count'])
grade_groups
# group by passing_math status AND classroom (one group per combination that occurs)
# THEN give the average reading grade for each group
# 'count' is the number of reading grades (i.e. students) in each group

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,count
passing_math,classroom,Unnamed: 2_level_1,Unnamed: 3_level_1
failing,A,87.0,2
passing,A,87.166667,6
passing,B,85.25,4


## TAKEAWAYS:
- this is where you put your data into basic english

## CLEAN UP COLUMN NAMES:

In [35]:
grade_groups.columns = ['avg_reading_grade', 'count_of_students']
grade_groups
#renames the columns into something more easily understood

Unnamed: 0_level_0,Unnamed: 1_level_0,avg_reading_grade,count_of_students
passing_math,classroom,Unnamed: 2_level_1,Unnamed: 3_level_1
failing,A,87.0,2
passing,A,87.166667,6
passing,B,85.25,4


## .TRANSFORM
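- returns one value per original row: each group's result is repeated for every row in that group (repetitious, but needed when you want to add the group value as a new column)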

In [36]:
df.assign(avg_math_score_by_classroom=df.groupby('classroom').math.transform('mean'))
# create new column using ".assign"
# does NOT save. would have to reassign variable to get it to save

Unnamed: 0,name,math,english,reading,classroom,passing_math,avg_math_score_by_classroom
0,Sally,62,85,80,A,failing,82.625
1,Jane,88,79,67,B,passing,89.25
2,Suzie,94,74,95,A,passing,82.625
3,Billy,98,96,88,B,passing,89.25
4,Ada,77,92,98,A,passing,82.625
5,John,79,76,93,B,passing,89.25
6,Thomas,82,64,81,A,passing,82.625
7,Marie,93,63,90,A,passing,82.625
8,Albert,92,62,87,A,passing,82.625
9,Richard,69,80,94,A,failing,82.625


In [24]:
df
# avg_math_score_by_classroom is not in df: ".assign" returns a NEW DataFrame
# and leaves the original unchanged, so the result must be assigned to a variable to keep it

Unnamed: 0,name,math,english,reading,classroom,passing_math
0,Sally,62,85,80,A,failing
1,Jane,88,79,67,B,passing
2,Suzie,94,74,95,A,passing
3,Billy,98,96,88,B,passing
4,Ada,77,92,98,A,passing
5,John,79,76,93,B,passing
6,Thomas,82,64,81,A,passing
7,Marie,93,63,90,A,passing
8,Albert,92,62,87,A,passing
9,Richard,69,80,94,A,failing


## combining GROUPBY and DESCRIBE():

In [38]:
df.groupby('classroom').reading.describe()
# this gives summary stats on reading grouped by classrooms
# meaning... avg, min, max, etc PER class

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
classroom,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A,8.0,87.125,8.88719,72.0,80.75,88.5,94.25,98.0
B,4.0,85.25,12.392874,67.0,82.75,90.5,93.0,93.0


# MERGING AND JOINING
- multiple ways to do this
- row wise = vertical concat
- SYNTAX: pd.concat([df1, df2], axis=0)
- if the list contains DataFrames, you get a DataFrame back
- DEFAULTS: join='outer', axis=0

## pd.concat
- takes a list or dict of Series or DataFrames and joins them along an axis

In [25]:
df1 = pd.DataFrame({'a': [1, 2, 3]})
df2 = pd.DataFrame({'a': [4, 5, 6]})
# creates 2 separate, 1-column DataFrames (both columns are named 'a')

In [42]:
df1

Unnamed: 0,a
0,1
1,2
2,3


In [43]:
df2

Unnamed: 0,a
0,4
1,5
2,6


In [26]:
pd.concat([df1, df2])
#concat by row = STACK them
#creates a stacked one column dataframe

#keeps original index (0-1, 1-2, 2-3 THEN 0-4, 1-5, 2-6)

Unnamed: 0,a
0,1
1,2
2,3
0,4
1,5
2,6


In [28]:
#to join them together continuously and IGNORE index
concat_df1 = pd.concat([df1, df2], ignore_index= True)
concat_df1

Unnamed: 0,a
0,1
1,2
2,3
3,4
4,5
5,6


In [29]:
concat_df2 = pd.DataFrame({'b': [1,2,3,4,5,6]})
concat_df2
# builds a one-column DataFrame: the dict key 'b' becomes the column name
# and the rows get the default index 0-5

Unnamed: 0,b
0,1
1,2
2,3
3,4
4,5
5,6


In [50]:
pd.concat([concat_df1, concat_df2], axis=1)
# passing a list of two DataFrames
# axis=1 places them side by side as columns, lining the rows up by index

Unnamed: 0,a,b
0,1,1
1,2,2
2,3,3
3,4,4
4,5,5
5,6,6


In [31]:
pd.concat([concat_df1, df1], axis=1)
#concat two DFs that are different sizes!
#concat_df1 has 6 rows
#df1 has 3 rows
# index values 3-5 exist only in concat_df1, so df1's column is filled with NaN there
# (which is why that column is shown as floats)

Unnamed: 0,a,a.1
0,1,1.0
1,2,2.0
2,3,3.0
3,4,
4,5,
5,6,


## MERGING:
- .merge

- SYNTAX: left_df.merge(right_df, how='inner', on=None, left_on=None, right_on=None, left_index=False, right_index=False, indicator=False)
- how == Type of merge to be performed.

- how=left: use only keys from left frame, similar to a SQL left outer join; preserve key order.

- how=right: use only keys from right frame, similar to a SQL right outer join; preserve key order.

- how=outer: use union of keys from both frames, similar to a SQL full outer join; sort keys lexicographically.

- how=inner: use intersection of keys from both frames, similar to a SQL inner join; preserve the order of the left keys.

In [32]:
# create users DataFrame
# np.nan marks users without a role; because of the NaN values role_id is stored as float
users = pd.DataFrame({
    'id': [1, 2, 3, 4, 5, 6],
    'name': ['bob', 'joe', 'sally', 'adam', 'jane', 'mike'],
    'role_id': [1, 2, 3, 3, np.nan, np.nan]
})
users

Unnamed: 0,id,name,role_id
0,1,bob,1.0
1,2,joe,2.0
2,3,sally,3.0
3,4,adam,3.0
4,5,jane,
5,6,mike,


In [33]:
#create roles DataFrame
roles = pd.DataFrame({
    'id': [1, 2, 3, 4],
    'name': ['admin', 'author', 'reviewer', 'commenter']
})
roles

Unnamed: 0,id,name
0,1,admin
1,2,author
2,3,reviewer
3,4,commenter


## full OUTER JOIN
### "role_id" on users has same info as "id" on roles

In [38]:
users.merge(roles, left_on='role_id', right_on='id', how='outer', indicator=True)
#users=left dataframe
#roles= right dataframe
# column names present in BOTH frames get a suffix: "_x" (left/users) and "_y" (right/roles)
# indicator=True adds the "_merge" column showing which frame(s) each row came from

Unnamed: 0,id_x,name_x,role_id,id_y,name_y,_merge
0,1.0,bob,1.0,1.0,admin,both
1,2.0,joe,2.0,2.0,author,both
2,3.0,sally,3.0,3.0,reviewer,both
3,4.0,adam,3.0,3.0,reviewer,both
4,5.0,jane,,,,left_only
5,6.0,mike,,,,left_only
6,,,,4.0,commenter,right_only


In [40]:
# SAME outer join as above, but "cleaner":
# - drop users' role_id, because id_y (the roles id) carries the same values
#   and is also filled in for roles that no user has
# - rename the suffixed columns (id_x, name_x, id_y, name_y) to descriptive names
temp = (users.merge(roles, 
            left_on='role_id', 
            right_on='id', 
            how='outer')
    .drop(columns='role_id')
    .rename(columns={'id_x': 'id', 
                     'name_x': 'employee',
                     'id_y': 'role_id',
                     'name_y': 'role'}
            )
)
temp

Unnamed: 0,id,employee,role_id,role
0,1.0,bob,1.0,admin
1,2.0,joe,2.0,author
2,3.0,sally,3.0,reviewer
3,4.0,adam,3.0,reviewer
4,5.0,jane,,
5,6.0,mike,,
6,,,4.0,commenter


In [41]:
temp.info()
# shows the data type and the non-null count of each column
# (the id columns are float64 because the outer join introduced missing values)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7 entries, 0 to 6
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   id        6 non-null      float64
 1   employee  6 non-null      object 
 2   role_id   5 non-null      float64
 3   role      5 non-null      object 
dtypes: float64(2), object(2)
memory usage: 280.0+ bytes


# PART III:

# RESHAPING:
- pd.crosstab

In [9]:
df

Unnamed: 0,name,math,english,reading,classroom,passing_math
0,Sally,62,85,80,A,failing
1,Jane,88,79,67,B,passing
2,Suzie,94,74,95,A,passing
3,Billy,98,96,88,B,passing
4,Ada,77,92,98,A,passing
5,John,79,76,93,B,passing
6,Thomas,82,64,81,A,passing
7,Marie,93,63,90,A,passing
8,Albert,92,62,87,A,passing
9,Richard,69,80,94,A,failing


In [10]:
pd.crosstab(df.passing_math, df.classroom)
#reshape into passing math by classroom

classroom,A,B
passing_math,Unnamed: 1_level_1,Unnamed: 2_level_1
failing,2,0
passing,6,4


In [13]:
pd.crosstab(df.passing_math, df.classroom, margins=True)
# margins=True adds an "All" row AND an "All" column holding the totals
# default for margins is False

classroom,A,B,All
passing_math,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
failing,2,0,2
passing,6,4,10
All,8,4,12


In [14]:
pd.crosstab(df.passing_math, df.classroom, normalize=True)
# normalize=True shows each cell as a fraction of the grand total instead of a count

classroom,A,B
passing_math,Unnamed: 1_level_1,Unnamed: 2_level_1
failing,0.166667,0.0
passing,0.5,0.333333


In [15]:
pd.crosstab(df.passing_math, df.classroom, normalize=True).round(3)
#cleans it up and rounds to 3 decimals

classroom,A,B
passing_math,Unnamed: 1_level_1,Unnamed: 2_level_1
failing,0.167,0.0
passing,0.5,0.333


### .pivot_table
- have to supply 3 things:
    - which values make up rows (the index)
    - which values will make up columns
    - values we are aggregating

In [16]:
df.pivot_table(index='classroom', columns='passing_math', values='math')
# by default pivot_table aggregates with the mean
# combinations with no rows (nobody failing in classroom B) show up as NaN

passing_math,failing,passing
classroom,Unnamed: 1_level_1,Unnamed: 2_level_1
A,65.5,88.333333
B,,89.25


In [17]:
# n = number of rows (orders) to generate
n = 40

# creating a DataFrame with two columns (drink and meal), each chosen at random
# 'Water' is listed twice, so it is drawn about twice as often as 'Tea'
orders = pd.DataFrame({
    'drink': np.random.choice(['Tea', 'Water', 'Water'], n),
    'meal': np.random.choice(['Curry', 'Yakisoba Noodle', 'Pad Thai'], n),
})

orders.sample(10) # <--- shows 10 randomly chosen rows

Unnamed: 0,drink,meal
0,Tea,Pad Thai
38,Water,Curry
11,Tea,Curry
4,Tea,Curry
16,Water,Curry
2,Tea,Yakisoba Noodle
22,Water,Yakisoba Noodle
5,Water,Pad Thai
19,Tea,Pad Thai
15,Water,Yakisoba Noodle


### .map
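- similar to numpy 'where' function, but looks up each value in a dictionary instead of testing one condition
- returns new values based on another column, which can be saved as a new column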

In [18]:
#create prices dictionary that holds prices for meal or drink
prices = {
    'Yakisoba Noodle': 9,
    'Curry': 11,
    'Pad Thai': 10,
    'Tea': 2,
    'Water': 0,
}

In [19]:
#create new column called bill
orders['bill'] = orders.drink.map(prices) + orders.meal.map(prices)

orders.sample(10)

Unnamed: 0,drink,meal,bill
6,Water,Pad Thai,10
31,Water,Curry,11
7,Water,Yakisoba Noodle,9
32,Water,Yakisoba Noodle,9
0,Tea,Pad Thai,12
22,Water,Yakisoba Noodle,9
38,Water,Curry,11
15,Water,Yakisoba Noodle,9
21,Tea,Yakisoba Noodle,11
29,Tea,Yakisoba Noodle,11


In [20]:
pd.crosstab(orders.drink, orders.meal)
#for each person who had a specific drink, how many had a specific meal

meal,Curry,Pad Thai,Yakisoba Noodle
drink,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Tea,3,4,5
Water,9,10,9


In [21]:
pd.crosstab(orders.drink, orders.meal, normalize=True, margins=True)
# same table as above, but as fractions of the grand total (normalize=True)
# with "All" row/column totals added (margins=True)

meal,Curry,Pad Thai,Yakisoba Noodle,All
drink,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Tea,0.075,0.1,0.125,0.3
Water,0.225,0.25,0.225,0.7
All,0.3,0.35,0.35,1.0


In [22]:
orders.pivot_table(index='drink', columns='meal', values='bill')
#find average bill amount

meal,Curry,Pad Thai,Yakisoba Noodle
drink,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Tea,13,12,11
Water,11,10,9


In [23]:
#another way to find average bill amount
orders.groupby(['drink', 'meal']).bill.mean()
# ^-- this is called multilevel groupby

drink  meal           
Tea    Curry              13
       Pad Thai           12
       Yakisoba Noodle    11
Water  Curry              11
       Pad Thai           10
       Yakisoba Noodle     9
Name: bill, dtype: int64

### TRANSPOSING:
- .T

In [24]:
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
name,Sally,Jane,Suzie,Billy,Ada,John,Thomas,Marie,Albert,Richard,Isaac,Alan
math,62,88,94,98,77,79,82,93,92,69,92,92
english,85,79,74,96,92,76,64,63,62,80,99,62
reading,80,67,95,88,98,93,81,90,87,94,93,72
classroom,A,B,A,B,A,B,A,A,A,A,B,A
passing_math,failing,passing,passing,passing,passing,passing,passing,passing,passing,failing,passing,passing


In [25]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
math,12.0,84.833333,11.134168,62.0,78.5,90.0,92.25,98.0
english,12.0,77.666667,13.371158,62.0,63.75,77.5,86.75,99.0
reading,12.0,86.5,9.643651,67.0,80.75,89.0,93.25,98.0
