In [1]:
import numpy as np
import pandas as pd

In [2]:
# Small employee table; the explicit index labels the rows 1..3 instead of 0..2.
df = pd.DataFrame(
    dict(
        Name=['John Smith', 'Jane Doe', 'Mary Hampton'],
        Gender=['M', 'F', 'F'],
        Salary=np.array([70000, 55000, 87000]),
    ),
    index=[1, 2, 3],
)
df

Unnamed: 0,Name,Gender,Salary
1,John Smith,M,70000
2,Jane Doe,F,55000
3,Mary Hampton,F,87000


In [3]:
# Re-store the salaries as 32-bit floats, then display the updated frame.
df['Salary'] = df['Salary'].astype('float32')
df

Unnamed: 0,Name,Gender,Salary
1,John Smith,M,70000.0
2,Jane Doe,F,55000.0
3,Mary Hampton,F,87000.0


In [4]:
# pd.DataFrame.from_dict(data) is an alternative constructor for dict input.
# Its extra `orient` argument decides how the dict keys are used:
#   orient='columns' (the default) -> keys become the column labels
#   orient='index'                 -> keys become the row labels

data = dict(A=[1, 2, 3], B=[4, 5, 6], C=[7, 8, 9])
df1 = pd.DataFrame.from_dict(data, orient='columns')
df2 = pd.DataFrame.from_dict(data, orient='index')

In [5]:
df1

Unnamed: 0,A,B,C
0,1,4,7
1,2,5,8
2,3,6,9


In [6]:
df2

Unnamed: 0,0,1,2
A,1,2,3
B,4,5,6
C,7,8,9


In [7]:
# For DataFrame.add(series, axis=...): axis=0 ('index') aligns the Series with
# the row labels, axis=1 ('columns') aligns it with the column labels.

df = pd.DataFrame(
    [[scale * value for value in (1, 2, 3, 4, 5)] for scale in (1, 10, 100)],
    index=list('XYZ'),
    columns=list('ABCDE'),
)
df

Unnamed: 0,A,B,C,D,E
X,1,2,3,4,5
Y,10,20,30,40,50
Z,100,200,300,400,500


In [8]:
# axis='index' (same as axis=0): the Series labels X/Y/Z are matched to the
# row labels, so every row gets its own offset.
series1 = pd.Series({'X': 5, 'Y': 10, 'Z': 20})
df.add(series1, axis='index')

Unnamed: 0,A,B,C,D,E
X,6,7,8,9,10
Y,20,30,40,50,60
Z,120,220,320,420,520


In [9]:
# axis='columns' (same as axis=1): the Series labels A..E are matched to the
# column labels, so every column gets its own offset.
series2 = pd.Series(range(5, 30, 5), index=list('ABCDE'))
df.add(series2, axis='columns')

Unnamed: 0,A,B,C,D,E
X,6,12,18,24,30
Y,15,30,45,60,75
Z,105,210,315,420,525


In [10]:
# To get access to display options, use pd.set_option() or pd.options.display.
# Style 1: function API -- each option is addressed by its dotted string name.
pd.set_option('display.min_rows', 5)
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)
# Resets only display.min_rows; max_rows and max_columns are still 50 here.
pd.reset_option('display.min_rows')

# Style 2: attribute API -- the same three options set through pd.options.display.
pd.options.display.min_rows = 5
pd.options.display.max_rows = 50
pd.options.display.max_columns = 50

# Reset all three options so the values set above do not persist past this cell.
pd.reset_option('display.min_rows')
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')


In [11]:
# Seven-person sample table, keyed by the single-letter row labels A..G.
df = pd.DataFrame(
    data=dict(
        Name=['Alice Smith', 'Bob Johnson', 'Charlie Lee', 'David Brown', 'Eva White', 'Frank Black', 'Grace Green'],
        Age=[28, 34, 29, 42, 23, 36, 30],
        City=['New York', 'Los Angeles', 'Chicago', 'Houston', 'Phoenix', 'Philadelphia', 'San Antonio'],
        Salary=[70000, 80000, 65000, 120000, 50000, 90000, 75000],
    ),
    index=list('ABCDEFG'),
)
df

Unnamed: 0,Name,Age,City,Salary
A,Alice Smith,28,New York,70000
B,Bob Johnson,34,Los Angeles,80000
C,Charlie Lee,29,Chicago,65000
D,David Brown,42,Houston,120000
E,Eva White,23,Phoenix,50000
F,Frank Black,36,Philadelphia,90000
G,Grace Green,30,San Antonio,75000


In [12]:
df.loc['E']

Name      Eva White
Age              23
City        Phoenix
Salary        50000
Name: E, dtype: object

In [13]:
df.iloc[4]

Name      Eva White
Age              23
City        Phoenix
Salary        50000
Name: E, dtype: object

In [14]:
df.loc[['A', 'E']]

Unnamed: 0,Name,Age,City,Salary
A,Alice Smith,28,New York,70000
E,Eva White,23,Phoenix,50000


In [15]:
df[['Name', 'Salary']]

Unnamed: 0,Name,Salary
A,Alice Smith,70000
B,Bob Johnson,80000
C,Charlie Lee,65000
D,David Brown,120000
E,Eva White,50000
F,Frank Black,90000
G,Grace Green,75000


In [16]:
# One .loc call with (row label, column label) instead of chaining [] and .loc.
df.loc['D', 'Salary']

np.int64(120000)

In [17]:
# Row mask and column label in a single .loc call (no chained indexing):
# the Name of every row whose Salary equals the maximum.
df.loc[df['Salary'] == df['Salary'].max(), 'Name']

D    David Brown
Name: Name, dtype: object

In [18]:
# df.at['row', 'column']
df.at['D', 'Age']

np.int64(42)

In [19]:
df.iat[5, 2]

'Philadelphia'

In [20]:
# Draw 3 distinct rows at random. random_state pins the draw so that a
# Restart & Run All shows the same sample every time.
sample_df = df.sample(n=3, random_state=42)
sample_df

Unnamed: 0,Name,Age,City,Salary
A,Alice Smith,28,New York,70000
E,Eva White,23,Phoenix,50000
D,David Brown,42,Houston,120000


In [21]:
# Draw 4 rows with replacement (the same row may appear more than once).
# random_state pins the draw so the displayed sample is reproducible.
sample_df = df.sample(n=4, replace=True, random_state=42)
sample_df

Unnamed: 0,Name,Age,City,Salary
E,Eva White,23,Phoenix,50000
E,Eva White,23,Phoenix,50000
G,Grace Green,30,San Antonio,75000
B,Bob Johnson,34,Los Angeles,80000


In [22]:
df['Age'] < 30

A     True
B    False
C     True
D    False
E     True
F    False
G    False
Name: Age, dtype: bool

In [23]:
df[(df['Age'] < 30) & (df['Salary'] >= 70000)]

Unnamed: 0,Name,Age,City,Salary
A,Alice Smith,28,New York,70000


In [24]:
mask = df['City'] == 'New York'
df.loc[mask]

Unnamed: 0,Name,Age,City,Salary
A,Alice Smith,28,New York,70000


In [25]:
df.isin(['David Brown', 34])

Unnamed: 0,Name,Age,City,Salary
A,False,False,False,False
B,False,True,False,False
C,False,False,False,False
D,True,False,False,False
E,False,False,False,False
F,False,False,False,False
G,False,False,False,False


In [26]:
df[df.isin(['David Brown', 34]).any(axis=1)]

Unnamed: 0,Name,Age,City,Salary
B,Bob Johnson,34,Los Angeles,80000
D,David Brown,42,Houston,120000


In [27]:
df.where(df['Age'] < 30)

Unnamed: 0,Name,Age,City,Salary
A,Alice Smith,28.0,New York,70000.0
B,,,,
C,Charlie Lee,29.0,Chicago,65000.0
D,,,,
E,Eva White,23.0,Phoenix,50000.0
F,,,,
G,,,,


In [28]:
# Filter rows with the boolean mask directly. Routing the mask through
# df.where(...).any(axis=1) only works while every kept row has a truthy cell.
df[df['Age'] < 30]

Unnamed: 0,Name,Age,City,Salary
A,Alice Smith,28,New York,70000
C,Charlie Lee,29,Chicago,65000
E,Eva White,23,Phoenix,50000


In [29]:
df

Unnamed: 0,Name,Age,City,Salary
A,Alice Smith,28,New York,70000
B,Bob Johnson,34,Los Angeles,80000
C,Charlie Lee,29,Chicago,65000
D,David Brown,42,Houston,120000
E,Eva White,23,Phoenix,50000
F,Frank Black,36,Philadelphia,90000
G,Grace Green,30,San Antonio,75000


In [32]:
def salary_status_detector(salary, high=80000, low=65000):
    """Classify a salary as 'High', 'Medium' or 'Low'.

    Parameters
    ----------
    salary : int, float or str
        Any value accepted by ``int()``; it is converted once before comparing.
    high : int, default 80000
        Salaries at or above this threshold are 'High'.
    low : int, default 65000
        Salaries strictly below this threshold are 'Low'.

    Returns
    -------
    str
        'High', 'Low', or 'Medium' for everything in between.

    Raises
    ------
    ValueError, TypeError
        Propagated from ``int(salary)`` when the value cannot be converted.
    """
    amount = int(salary)  # convert once instead of once per comparison
    if amount >= high:
        return 'High'
    if amount < low:
        return 'Low'
    return 'Medium'

# Label every salary with its band and store the result as a new column.
df['Salary Status'] = df['Salary'].map(salary_status_detector)

df

Unnamed: 0,Name,Age,City,Salary,Salary Status
A,Alice Smith,28,New York,70000,Medium
B,Bob Johnson,34,Los Angeles,80000,High
C,Charlie Lee,29,Chicago,65000,Medium
D,David Brown,42,Houston,120000,High
E,Eva White,23,Phoenix,50000,Low
F,Frank Black,36,Philadelphia,90000,High
G,Grace Green,30,San Antonio,75000,Medium


In [73]:
# City -> state lookup used to derive the 'State' column.
mapping = {
    'New York': 'New York',
    'Los Angeles': 'California',
    'Chicago': 'Illinois',
    'Houston': 'Texas',
    'Phoenix': 'Arizona',
    'Philadelphia': 'Pennsylvania',
    'San Antonio': 'Texas',
}
# Series.map with a dict replaces each city by its looked-up value.
df['State'] = df['City'].map(mapping)

df

Unnamed: 0,Name,Age,City,State,Salary,Salary Status
A,Alice Smith,28,New York,New York,70000,Medium
B,Bob Johnson,34,Los Angeles,California,80000,High
C,Charlie Lee,29,Chicago,Illinois,65000,Medium
D,David Brown,42,Houston,Texas,120000,High
E,Eva White,23,Phoenix,Arizona,50000,Low
F,Frank Black,36,Philadelphia,Pennsylvania,90000,High
G,Grace Green,30,San Antonio,Texas,75000,Medium


In [74]:
# Reorder the columns by selecting them in the desired order.
df = df.loc[:, ['Name', 'Age', 'City', 'State', 'Salary', 'Salary Status']]
df

Unnamed: 0,Name,Age,City,State,Salary,Salary Status
A,Alice Smith,28,New York,New York,70000,Medium
B,Bob Johnson,34,Los Angeles,California,80000,High
C,Charlie Lee,29,Chicago,Illinois,65000,Medium
D,David Brown,42,Houston,Texas,120000,High
E,Eva White,23,Phoenix,Arizona,50000,Low
F,Frank Black,36,Philadelphia,Pennsylvania,90000,High
G,Grace Green,30,San Antonio,Texas,75000,Medium


In [75]:
s = pd.Series(['cat', 'dog', np.nan, 'rabbit'])
s

0       cat
1       dog
2       NaN
3    rabbit
dtype: object

In [76]:
s.map('I am a {}'.format)

0       I am a cat
1       I am a dog
2       I am a nan
3    I am a rabbit
dtype: object

In [77]:
s.map('I am a {}'.format, na_action='ignore')

0       I am a cat
1       I am a dog
2              NaN
3    I am a rabbit
dtype: object

In [78]:
# .agg([functions]) applies several functions at once to the rows or columns of a df.
# Demo frame: A = 1..5, B = 10 * A, C = 100 * A.
df1 = pd.DataFrame({
    label: [factor * step for step in range(1, 6)]
    for label, factor in zip('ABC', (1, 10, 100))
})
df1

Unnamed: 0,A,B,C
0,1,10,100
1,2,20,200
2,3,30,300
3,4,40,400
4,5,50,500


In [79]:
df1.agg(['sum', 'mean', 'max', 'min'])

Unnamed: 0,A,B,C
sum,15.0,150.0,1500.0
mean,3.0,30.0,300.0
max,5.0,50.0,500.0
min,1.0,10.0,100.0


In [80]:
df1.agg(['sum', 'mean', 'max', 'min'], axis=1)

Unnamed: 0,sum,mean,max,min
0,111.0,37.0,100.0,1.0
1,222.0,74.0,200.0,2.0
2,333.0,111.0,300.0,3.0
3,444.0,148.0,400.0,4.0
4,555.0,185.0,500.0,5.0


In [81]:
df

Unnamed: 0,Name,Age,City,State,Salary,Salary Status
A,Alice Smith,28,New York,New York,70000,Medium
B,Bob Johnson,34,Los Angeles,California,80000,High
C,Charlie Lee,29,Chicago,Illinois,65000,Medium
D,David Brown,42,Houston,Texas,120000,High
E,Eva White,23,Phoenix,Arizona,50000,Low
F,Frank Black,36,Philadelphia,Pennsylvania,90000,High
G,Grace Green,30,San Antonio,Texas,75000,Medium


In [82]:
df_salary_status = df['Salary'].agg(['mean', 'max', 'min']).round(2)
print(f'Salary Status:\n{df_salary_status}')

Salary Status:
mean     78571.43
max     120000.00
min      50000.00
Name: Salary, dtype: float64


In [83]:
df1

Unnamed: 0,A,B,C
0,1,10,100
1,2,20,200
2,3,30,300
3,4,40,400
4,5,50,500


In [84]:
# Apply different functions to different columns
df1.agg({
    'A': ['sum', 'mean'],
    'B': ['min', 'max'],
    'C': 'std'
})

Unnamed: 0,A,B,C
sum,15.0,,
mean,3.0,,
min,,10.0,
max,,50.0,
std,,,158.113883


Use `df.loc[:, something]` when you want to:
1. Select a subset of columns and possibly assign new columns later.
2. Select rows by label and columns at the same time.
3. Assign a new column safely after slicing or reordering (important).

This helps you avoid the `SettingWithCopyWarning` without creating a full copy.

In [3]:
# Series.value_counts() returns a Series indexed by the distinct values.
# To get the count of one specific value, index the result with [value].
s = pd.Series(list('AAABCD'))
s.value_counts()

A    3
B    1
C    1
D    1
Name: count, dtype: int64

In [4]:
s.value_counts()['A']

np.int64(3)

The best way to reorder a df's columns is to use `df.loc[:, [new order]]`.

Also, to avoid mistakes, use `df.columns.tolist()` to get the list of columns.

In [7]:
# NOTE(review): absolute, machine-specific path -- this cell only runs where the file exists.
# Load the people dataset and cast both name parts to str in one step.
email_df = (
    pd.read_csv('/home/mseifoori/Machine-Learning-Exercises/people-10000.csv')
    .astype({'First Name': 'str', 'Last Name': 'str'})
)
# Join first and last name with a single space.
email_df['Full Name'] = email_df['First Name'] + ' ' + email_df['Last Name']

In [8]:
email_df.columns.tolist()

['Index',
 'User Id',
 'First Name',
 'Last Name',
 'Sex',
 'Email',
 'Phone',
 'Date of birth',
 'Job Title',
 'Full Name']

In [9]:
# Place 'Full Name' right after 'Last Name'; every other column keeps its position.
email_df = email_df.loc[
    :,
    [
        'Index',
        'User Id',
        'First Name',
        'Last Name',
        'Full Name',
        'Sex',
        'Email',
        'Phone',
        'Date of birth',
        'Job Title',
    ],
]

In [10]:
email_df

Unnamed: 0,Index,User Id,First Name,Last Name,Full Name,Sex,Email,Phone,Date of birth,Job Title
0,1,5f10e9D33fC5f2b,Sara,Mcguire,Sara Mcguire,Female,tsharp@example.net,(971)643-6089x9160,1921-08-17,"Editor, commissioning"
1,2,751cD1cbF77e005,Alisha,Hebert,Alisha Hebert,Male,vincentgarrett@example.net,+1-114-355-1841x78347,1969-06-28,Broadcast engineer
2,3,DcEFDB2D2e62bF9,Gwendolyn,Sheppard,Gwendolyn Sheppard,Male,mercadojonathan@example.com,9017807728,1915-09-25,Industrial buyer
3,4,C88661E02EEDA9e,Kristine,Mccann,Kristine Mccann,Female,lindsay55@example.com,+1-607-333-9911x59088,1978-07-27,Multimedia specialist
4,5,fafF1aBDebaB2a6,Bobby,Pittman,Bobby Pittman,Female,blevinsmorgan@example.com,3739847538,1989-11-17,Planning and development surveyor
...,...,...,...,...,...,...,...,...,...,...
9995,9996,D66F0e4EdFc35e6,Tina,Sherman,Tina Sherman,Male,bartlettcolleen@example.org,(455)476-4044x5755,1974-07-28,"Scientist, physiological"
9996,9997,c753d8B9F5b6054,Earl,Jennings,Earl Jennings,Female,andreabenton@example.com,009.056.6505,1932-08-20,Warehouse manager
9997,9998,Fb2c7daAdD82dAE,Ellen,Dominguez,Ellen Dominguez,Female,michaelayoder@example.net,409-428-4297x469,1966-06-06,Lawyer
9998,9999,34D88Ffc743Ca5B,Emma,Clark,Emma Clark,Male,pstrickland@example.com,849-868-8653,1907-05-09,Accounting technician


In [11]:
# Cast to str so .str.startswith yields a plain boolean for every row.
email_df['Phone'] = email_df['Phone'].astype('str')
# Sum the boolean mask (True counts as 1). Unlike value_counts()[True], this
# gives 0 instead of raising KeyError when no number starts with '+1'.
us_numbers_plus1 = email_df['Phone'].str.startswith('+1').sum()

print(f'Count of +1 US phone numbers in this dataset:\n{us_numbers_plus1}')

Count of +1 US phone numbers in this dataset:
1595
