In [26]:
import pandas as pd
import numpy as np

# Fix the global NumPy seed so the randomly generated grades and classroom
# assignments below come out the same on every fresh run.
SEED = 123
np.random.seed(SEED)

In [2]:
# Roster of the twelve students used to build the example grade book.
students = [
    'Sally', 'Jane', 'Suzie', 'Billy',
    'Ada', 'John', 'Thomas', 'Marie',
    'Albert', 'Richard', 'Isaac', 'Alan',
]

In [3]:
# One random integer grade per student and subject. randint's upper bound is
# exclusive, so grades fall in 60..99. The draw order (math, english, reading)
# is kept so the seeded values stay the same.
n_students = len(students)
math_grades = np.random.randint(60, 100, size=n_students)
english_grades = np.random.randint(60, 100, size=n_students)
reading_grades = np.random.randint(60, 100, size=n_students)

In [4]:
# Randomly place every student in classroom 'A' or 'B', then assemble the
# grade book with one row per student.
classrooms = np.random.choice(['A', 'B'], len(students))
grade_columns = {
    'name': students,
    'math': math_grades,
    'english': english_grades,
    'reading': reading_grades,
    'classroom': classrooms,
}
df = pd.DataFrame(grade_columns)

# SUBSETTING BY 2 COLUMNS

In [5]:
# Indexing with a list of column labels returns a DataFrame holding just
# those columns, in the order listed.
selected_columns = ['name', 'classroom']
df[selected_columns]

Unnamed: 0,name,classroom
0,Sally,A
1,Jane,B
2,Suzie,A
3,Billy,B
4,Ada,A
5,John,B
6,Thomas,A
7,Marie,A
8,Albert,A
9,Richard,A


In [7]:
# Boolean Series aligned with df's index: True where the student's name
# starts with 'A', False otherwise.
bools = df['name'].str.startswith('A')
bools

0     False
1     False
2     False
3     False
4      True
5     False
6     False
7     False
8      True
9     False
10    False
11     True
Name: name, dtype: bool

In [8]:
df[bools]
# indexing with a boolean mask keeps only the rows where the mask is True:
# here, the full rows (all columns) for students whose name begins with 'A'

Unnamed: 0,name,math,english,reading,classroom
4,Ada,77,92,98,A
8,Albert,92,62,87,A
11,Alan,92,62,72,A


# .loc
#### df.loc[row_indexer, column_indexer]
#### selects by LABEL; slices are inclusive of BOTH endpoints (all the way up to and INCLUDING the stop label), for rows and columns alike

In [12]:
df.loc[:, 'math': 'reading']
#      ^ rows: ":" selects every row
#         ^ columns: label slice from 'math' through 'reading', both included

Unnamed: 0,math,english,reading
0,62,85,80
1,88,79,67
2,94,74,95
3,98,96,88
4,77,92,98
5,79,76,93
6,82,64,81
7,93,63,90
8,92,62,87
9,69,80,94


In [13]:
df.loc[1, 'math': 'reading']
#      ^ row: only the row whose index label is 1 (not all rows)
#         ^ columns: 'math' through 'reading', both included
# a single row label returns a Series (one entry per selected column)

math       88
english    79
reading    67
Name: 1, dtype: object

In [14]:
df.loc[bools, 'name':'reading']
# rows: the boolean mask "bools" built earlier (True where the name starts with 'A')
# columns: 'name' through 'reading', both included

Unnamed: 0,name,math,english,reading
4,Ada,77,92,98
8,Albert,92,62,87
11,Alan,92,62,72


# .iloc
- SYNTAX: df.iloc[row_indexer, column_indexer]
- selects by integer POSITION; slices go up to but do NOT include the stop position (for rows and columns alike)
- positions start at 0

In [15]:
df.iloc[:3]
# ^ only a row_indexer is given (there is no ","), so every column is kept
#   for the rows at positions 0, 1 and 2

Unnamed: 0,name,math,english,reading,classroom
0,Sally,62,85,80,A
1,Jane,88,79,67,B
2,Suzie,94,74,95,A


In [17]:
df.iloc[:3, 1:3]
# ^ row_indexer AND column_indexer: rows at positions 0-2,
#   columns at positions 1-2 (the stop position 3 is excluded)

Unnamed: 0,math,english
0,62,85
1,88,79
2,94,74


### this is the first 3 rows (positions 0, 1, 2) AND the columns at positions 1 and 2
### column 3 is not included because the stop of the slice [1:3] is EXCLUSIVE

### (ex): df.iloc[:3, 1:3]
- column 0 = name, 
- column 1= math, 
- column 2= english, 
- column 3 = reading

# AGGREGATING:
- .agg

In [18]:
# Lowest grade found in the reading column.
df['reading'].agg('min')

67

In [20]:
# Highest grade found in the math column.
df['math'].agg('max')

98

In [22]:
# Passing a list of function names to .agg computes every statistic for
# every selected column in a single call: one output row per statistic,
# one output column per subject.
subjects = ['english', 'reading', 'math']
summary_stats = ['mean', 'min', 'max']
df[subjects].agg(summary_stats)

Unnamed: 0,english,reading,math
mean,77.666667,86.5,84.833333
min,62.0,67.0,62.0
max,99.0,98.0,98.0


# GROUPBY
- .groupby

In [23]:
# Split the rows by classroom, then take the highest math grade within
# each classroom.
df.groupby('classroom')['math'].max()

classroom
A    94
B    98
Name: math, dtype: int64

# GROUPBY WITH AGGREGATION

In [25]:
# Split the rows by classroom, then compute the min, mean and max math
# grade for each classroom (one output row per classroom).
math_stats = ['min', 'mean', 'max']
df.groupby('classroom')['math'].agg(math_stats)

Unnamed: 0_level_0,min,mean,max
classroom,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A,62,82.625,94
B,79,89.25,98


## using NP.WHERE
- create a new column and fill it based on a condition in a single step: np.where(condition, value_if_true, value_if_false)

In [28]:
# Add a "passing_math" column to df:
#   math grade less than 70 (condition met)  -> 'failing'
#   otherwise (condition NOT met)            -> 'passing'
is_failing_math = df['math'] < 70
df['passing_math'] = np.where(is_failing_math, 'failing', 'passing')

In [29]:
# display the frame to confirm the new passing_math column is present
df

Unnamed: 0,name,math,english,reading,classroom,passing_math
0,Sally,62,85,80,A,failing
1,Jane,88,79,67,B,passing
2,Suzie,94,74,95,A,passing
3,Billy,98,96,88,B,passing
4,Ada,77,92,98,A,passing
5,John,79,76,93,B,passing
6,Thomas,82,64,81,A,passing
7,Marie,93,63,90,A,passing
8,Albert,92,62,87,A,passing
9,Richard,69,80,94,A,failing


In [34]:
# Group the students by math status and classroom, then summarise the
# reading grades of each group:
#   mean  -> average reading grade in the group
#   count -> number of (non-null) reading grades, i.e. students, in the group
# Only combinations that actually occur in the data get a row.
group_keys = ['passing_math', 'classroom']
grade_groups = df.groupby(group_keys)['reading'].agg(['mean', 'count'])
grade_groups

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,count
passing_math,classroom,Unnamed: 2_level_1,Unnamed: 3_level_1
failing,A,87.0,2
passing,A,87.166667,6
passing,B,85.25,4


## CLEAN UP COLUMN NAMES:

In [35]:
# Give the aggregate columns names that are easier to understand.
# Renaming by label (instead of assigning a positional list to .columns)
# cannot silently mislabel a column if the aggregation list is ever
# reordered or extended, and re-running this cell is a harmless no-op.
grade_groups = grade_groups.rename(
    columns={'mean': 'avg_reading_grade', 'count': 'count_of_students'}
)
grade_groups

Unnamed: 0_level_0,Unnamed: 1_level_0,avg_reading_grade,count_of_students
passing_math,classroom,Unnamed: 2_level_1,Unnamed: 3_level_1
failing,A,87.0,2
passing,A,87.166667,6
passing,B,85.25,4


## .TRANSFORM

In [36]:
# transform('mean') returns one value per original row (the mean math grade
# of that row's classroom), so the result lines up with df's index and can
# be attached as a column. .assign returns a new frame; df is not modified.
classroom_math_mean = df.groupby('classroom')['math'].transform('mean')
df.assign(avg_math_score_by_classroom=classroom_math_mean)

Unnamed: 0,name,math,english,reading,classroom,passing_math,avg_math_score_by_classroom
0,Sally,62,85,80,A,failing,82.625
1,Jane,88,79,67,B,passing,89.25
2,Suzie,94,74,95,A,passing,82.625
3,Billy,98,96,88,B,passing,89.25
4,Ada,77,92,98,A,passing,82.625
5,John,79,76,93,B,passing,89.25
6,Thomas,82,64,81,A,passing,82.625
7,Marie,93,63,90,A,passing,82.625
8,Albert,92,62,87,A,passing,82.625
9,Richard,69,80,94,A,failing,82.625


## combining GROUPBY and DESCRIBE():

In [38]:
# Summary statistics of the reading grades (count, mean, std, min,
# quartiles, max), computed separately for each classroom.
df.groupby('classroom')['reading'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
classroom,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
A,8.0,87.125,8.88719,72.0,80.75,88.5,94.25,98.0
B,4.0,85.25,12.392874,67.0,82.75,90.5,93.0,93.0


# MERGING AND JOINING
- multiple ways to do this


## pd.concat
- takes a list or dict of Series or DataFrames and joins them along an axis

In [41]:
# Build two small single-column DataFrames that share the column name 'a';
# each gets the default 0..2 index.
df1, df2 = (
    pd.DataFrame({'a': values})
    for values in ([1, 2, 3], [4, 5, 6])
)

In [42]:
# display the first example frame (column 'a' holding 1, 2, 3)
df1

Unnamed: 0,a
0,1
1,2
2,3


In [43]:
# display the second example frame (column 'a' holding 4, 5, 6)
df2

Unnamed: 0,a
0,4
1,5
2,6


In [44]:
# Stack the frames on top of each other (default axis=0). Each frame keeps
# its own row labels, so the resulting index repeats: 0, 1, 2, 0, 1, 2.
frames = [df1, df2]
pd.concat(frames)

Unnamed: 0,a
0,1
1,2
2,3
0,4
1,5
2,6


In [46]:
#to join them together continuously and IGNORE index
concat_df1 = pd.concat([df1, df2], ignore_index= True)
concat_df1

Unnamed: 0,a
0,1
1,2
2,3
3,4
4,5
5,6


In [49]:
# A second six-row frame with a single column named 'b' (values 1..6) and
# the default 0..5 index, to be placed next to concat_df1.
concat_df2 = pd.DataFrame({'b': list(range(1, 7))})
concat_df2

Unnamed: 0,b
0,1
1,2
2,3
3,4
4,5
5,6


In [50]:
# axis=1 places the frames side by side instead of stacking them: rows are
# matched on their index labels and each frame contributes its own columns.
side_by_side = [concat_df1, concat_df2]
pd.concat(side_by_side, axis=1)

Unnamed: 0,a,b
0,1,1
1,2,2
2,3,3
3,4,4
4,5,5
5,6,6


## MERGING:
- .merge