# Imports

In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd


In [2]:
def showit(x):
    print(type(x))
    print()
    print(x)

## Read in the data

In [3]:
df = pd.read_csv("sample.csv")

In [4]:
df.sample(frac=.0001, random_state=42)

Unnamed: 0,size,age,team,win,date,prob,grade
33553,medium,70,red,False,2023-12-03,0.507487,79.609314
9427,big,26,blue,False,2023-10-23,0.505165,69.569005
199,small,50,red,True,2023-08-23,0.847352,88.857845
12447,big,62,red,True,2023-12-02,0.530678,90.695167
39489,medium,39,blue,False,2023-11-23,0.518361,80.950326


In [5]:
df.columns

Index(['size', 'age', 'team', 'win', 'date', 'prob', 'grade'], dtype='object')

In [6]:
df.index

RangeIndex(start=0, stop=50000, step=1)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   size    50000 non-null  object 
 1   age     50000 non-null  int64  
 2   team    50000 non-null  object 
 3   win     50000 non-null  bool   
 4   date    50000 non-null  object 
 5   prob    50000 non-null  float64
 6   grade   50000 non-null  float64
dtypes: bool(1), float64(2), int64(1), object(3)
memory usage: 2.3+ MB


In [8]:
df.describe()

Unnamed: 0,age,prob,grade
count,50000.0,50000.0,50000.0
mean,50.97078,0.466941,82.408342
std,19.32833,0.289434,7.452343
min,18.0,0.0,51.414152
25%,34.0,0.21522,77.383642
50%,51.0,0.467223,82.391035
75%,68.0,0.714926,87.382418
max,84.0,1.0,113.617684


In [9]:
df[['team']].describe()

Unnamed: 0,team
count,50000
unique,4
top,green
freq,12583


In [10]:
df.shape

(50000, 7)

## Filter rows

In [11]:
df.iloc[2,2]

'green'

In [12]:
df.iloc[5]

size            big
age              51
team         yellow
win            True
date     2023-12-14
prob       0.708568
grade     81.645377
Name: 5, dtype: object

In [13]:
df.iloc[[5,26]]

Unnamed: 0,size,age,team,win,date,prob,grade
5,big,51,yellow,True,2023-12-14,0.708568,81.645377
26,big,44,yellow,False,2023-12-06,0.87749,77.498612


In [14]:
df['size'] == 'medium'

0        False
1        False
2        False
3         True
4        False
         ...  
49995    False
49996     True
49997    False
49998    False
49999    False
Name: size, Length: 50000, dtype: bool

In [15]:
df[df['size'] == 'medium']

Unnamed: 0,size,age,team,win,date,prob,grade
3,medium,73,blue,False,2023-10-16,0.982599,86.724554
8,medium,60,yellow,False,2024-04-12,0.678985,79.311407
13,medium,52,green,True,2023-11-14,0.177618,84.018867
14,medium,36,red,False,2024-03-11,0.312485,68.640865
15,medium,52,green,True,2024-03-06,0.362879,75.582323
...,...,...,...,...,...,...,...
49981,medium,20,green,False,2024-03-15,0.024315,79.965449
49986,medium,59,green,False,2024-05-08,0.602568,97.559009
49987,medium,75,green,False,2023-11-06,0.737011,83.910104
49994,medium,48,yellow,False,2023-09-22,0.721019,84.810945


In [16]:
df.loc[(df['size'] == 'medium') * (df['win'] == False)]

Unnamed: 0,size,age,team,win,date,prob,grade
3,medium,73,blue,False,2023-10-16,0.982599,86.724554
8,medium,60,yellow,False,2024-04-12,0.678985,79.311407
14,medium,36,red,False,2024-03-11,0.312485,68.640865
16,medium,65,yellow,False,2023-12-04,0.529683,84.926477
25,medium,43,green,False,2023-10-26,0.702130,81.054229
...,...,...,...,...,...,...,...
49980,medium,80,red,False,2024-01-22,0.131547,76.762536
49981,medium,20,green,False,2024-03-15,0.024315,79.965449
49986,medium,59,green,False,2024-05-08,0.602568,97.559009
49987,medium,75,green,False,2023-11-06,0.737011,83.910104


In [17]:
# df.query('grade >= 80 and grade < 90 and team = "red" and date >= "2024-01-01"')

In [18]:
df[['age']].quantile([0.25,0.3, 0.35])

Unnamed: 0,age
0.25,34.0
0.3,38.0
0.35,41.0


In [19]:
df[['age','grade','prob']].agg(['mean',"std"]).iloc[0]

age      50.970780
grade    82.408342
prob      0.466941
Name: mean, dtype: float64

In [20]:
df[['size']].nunique

<bound method DataFrame.nunique of          size
0         big
1         big
2         big
3      medium
4         big
...       ...
49995     big
49996  medium
49997   small
49998     big
49999   small

[50000 rows x 1 columns]>

In [21]:
df['size'].value_counts()

size
medium    16774
big       16666
small     16560
Name: count, dtype: int64

## Groupby

In [24]:
df_agg = df.groupby('team')[['grade','prob']].agg(['mean', 'std'])

In [25]:
df_agg

Unnamed: 0_level_0,grade,grade,prob,prob
Unnamed: 0_level_1,mean,std,mean,std
team,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
blue,82.47878,7.502601,0.509014,0.287375
green,82.37494,7.396862,0.429645,0.284609
red,82.34968,7.418476,0.500834,0.289893
yellow,82.42989,7.491316,0.428714,0.285885


In [29]:
df_agg.columns[1][1]

'std'

In [None]:
for c in df_agg.columns:
    print(c)

In [30]:
df_agg.columns = ['_'.join(c) for c in df_agg.columns]

['grade_mean', 'grade_std', 'prob_mean', 'prob_std']

In [35]:
df_agg.loc['green']['grade','mean']

82.3749398467787

## calculated columns