In [1]:

import pandas as pd
import numpy as np

In [2]:
# Load the salary data from 'salaries.csv' into a DataFrame, then display it.
df = pd.read_csv('salaries.csv')
df

Unnamed: 0,Name,Salary,Age
0,John,50000,34
1,Sally,120000,45
2,Alyssa,80000,27


In [3]:
# You can select a single column with a bracket call; the result is a Series.
print(df['Name'])

0      John
1     Sally
2    Alyssa
Name: Name, dtype: object


In [4]:
# Select the Salary column the same way; its values are integers (dtype int64).
print(df['Salary'])

0     50000
1    120000
2     80000
Name: Salary, dtype: int64


In [5]:
# Select multiple columns by passing a list of column names.
# The outer [] is the indexing operator and the inner [] is the list itself,
# which is why you see two sets of brackets.
print(df[['Name','Salary']])

     Name  Salary
0    John   50000
1   Sally  120000
2  Alyssa   80000


In [6]:
# Smallest value in the Salary column.
print(df["Salary"].min())

50000


In [7]:
# Comparing a column with a scalar works element-wise and yields a boolean
# Series: one True/False value per row.
print(df["Age"] >30)

0     True
1     True
2    False
Name: Age, dtype: bool


In [8]:
# Indexing the DataFrame with a boolean Series keeps only the rows where it is True.
print(df[df["Age"] >30])

    Name  Salary  Age
0   John   50000   34
1  Sally  120000   45


In [9]:
# As with NumPy arrays, aggregation methods such as min(), max() and mean()
# can be called on a DataFrame column.

print(df['Age'].mean())

35.333333333333336


In [10]:
# Just like NumPy, we can use conditional filtering to select rows that meet
# certain criteria, such as rows where the Age value is greater than 30.
# The comparison itself produces a boolean Series with one entry per row.

ser_of_bool = df['Age'] > 30
print(ser_of_bool)

0     True
1     True
2    False
Name: Age, dtype: bool


In [11]:
# Store the boolean Series so it can be used as a row filter.

age_filter = df['Age'] > 30

In [12]:
# Pass the boolean filter to the DataFrame to keep only the rows where it is True.
print(df[age_filter])

    Name  Salary  Age
0   John   50000   34
1  Sally  120000   45


In [13]:
# More commonly the comparison and the row selection are written in one step:
df[df['Age'] > 30]

Unnamed: 0,Name,Salary,Age
0,John,50000,34
1,Sally,120000,45


In [14]:
# There are lots of other commands you can use with pandas!
# For now we'll just cover a few more, and introduce the rest
# as we continue through the course :)

df['Age'].unique() # NumPy array (not a Python list) of the distinct Age values

In [15]:
df['Age'].nunique() # number of unique values

3

In [16]:
df.info() # Summary of the dataframe: index range, columns, non-null counts, dtypes, memory usage

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    3 non-null      object
 1   Salary  3 non-null      int64 
 2   Age     3 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 200.0+ bytes


In [17]:
df.describe() # Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns

Unnamed: 0,Salary,Age
count,3.0,3.0
mean,83333.333333,35.333333
std,35118.845843,9.073772
min,50000.0,27.0
25%,65000.0,30.5
50%,80000.0,34.0
75%,100000.0,39.5
max,120000.0,45.0


In [18]:
df.columns # Index object holding the column labels

Index(['Name', 'Salary', 'Age'], dtype='object')

In [19]:
df.index # The dataframe's existing row index (here a RangeIndex); nothing new is created

RangeIndex(start=0, stop=3, step=1)

In [14]:
# Build a 5x10 NumPy matrix holding the integers 0..49; it is converted to a
# DataFrame in the following cell. The assignment itself displays nothing.
mat = np.arange(50).reshape(5, 10)


In [20]:
# Wrap the NumPy matrix in a DataFrame; rows and columns get default integer labels.
pd.DataFrame(data=mat)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,1,2,3,4,5,6,7,8,9
1,10,11,12,13,14,15,16,17,18,19
2,20,21,22,23,24,25,26,27,28,29
3,30,31,32,33,34,35,36,37,38,39
4,40,41,42,43,44,45,46,47,48,49


In [24]:
# A NumPy matrix can also be converted with explicit column labels:
# reshape the integers 0..9 into 5 rows of 2 and name the columns "A" and "B".
mat1 = np.arange(10).reshape(5, 2)
pd.DataFrame(data=mat1, columns=["A", "B"])

Unnamed: 0,A,B
0,0,1
1,2,3
2,4,5
3,6,7
4,8,9
