In [1]:
import pandas as pd
import numpy as np
pd.__version__

'2.2.3'

In [2]:
data = {
    "Name": ["Alice", "Bob", "Cathy", "Dan"], 
    "Age": [25, 30, 22, 35], 
    "Score": [88, 92, 79, 85]
}
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Score
0,Alice,25,88
1,Bob,30,92
2,Cathy,22,79
3,Dan,35,85


In [6]:
df.head()

Unnamed: 0,Name,Age,Score
0,Alice,25,88
1,Bob,30,92
2,Cathy,22,79
3,Dan,35,85


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    4 non-null      object
 1   Age     4 non-null      int64 
 2   Score   4 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 228.0+ bytes


In [8]:
df.describe()

Unnamed: 0,Age,Score
count,4.0,4.0
mean,28.0,86.0
std,5.715476,5.477226
min,22.0,79.0
25%,24.25,83.5
50%,27.5,86.5
75%,31.25,89.0
max,35.0,92.0


In [9]:
df.shape

(4, 3)

In [10]:
df.columns

Index(['Name', 'Age', 'Score'], dtype='object')

In [11]:
df["Age"]

0    25
1    30
2    22
3    35
Name: Age, dtype: int64

In [12]:
df[["Name", "Score"]]

Unnamed: 0,Name,Score
0,Alice,88
1,Bob,92
2,Cathy,79
3,Dan,85


In [16]:
df.loc[0]

Name     Alice
Age         25
Score       88
Name: 0, dtype: object

In [14]:
df.iloc[1:3]

Unnamed: 0,Name,Age,Score
1,Bob,30,92
2,Cathy,22,79


In [17]:
df[df["Age"] > 25]

Unnamed: 0,Name,Age,Score
1,Bob,30,92
3,Dan,35,85


In [18]:
df[(df["Age"] > 25) & (df["Score"] > 85)] #multiple conditions

Unnamed: 0,Name,Age,Score
1,Bob,30,92


In [24]:
df["Passed"] = df["Score"] > 80
df.loc[df["Name"] == "Cathy", "Score"] = 90
df

Unnamed: 0,Name,Age,Score,Passes,Passed
0,Alice,25,88,True,True
1,Bob,30,92,True,True
2,Cathy,22,90,False,True
3,Dan,35,85,True,True


In [26]:
df.loc[2, "Age"] = np.nan
df.isna().sum()
df["Age"].fillna(df["Age"].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Age"].fillna(df["Age"].mean(), inplace=True)


In [31]:
df = pd.DataFrame({
    "Name": ["Alice", "Bob", "Cathy"],
    "Age": [25, 30, 22], 
    "Score": [88, 92, 79]
})

df.to_csv("students.csv", index=False)
data = pd.read_csv("students.csv")
print(data)

    Name  Age  Score
0  Alice   25     88
1    Bob   30     92
2  Cathy   22     79


In [34]:
df["Score"].mean()
df.groupby("Passed")["Score"].mean()

KeyError: 'Passed'

In [35]:
data = {
    "Name": ["Alice", "Bob", "Cathy", "Dan", "Eli"],
    "Age": [25, np.nan, 22, 35, 28], 
    "Score": [88, 92, 79, 85, 95]
}

df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Score
0,Alice,25.0,88
1,Bob,,92
2,Cathy,22.0,79
3,Dan,35.0,85
4,Eli,28.0,95


In [38]:
high_scorers = df[df["Score"] > 85]
high_scorers

Unnamed: 0,Name,Age,Score
0,Alice,25.0,88
1,Bob,,92
4,Eli,28.0,95


In [39]:
df["Grade"] = np.where(df["Score"] >= 90, "A", "B")
df

Unnamed: 0,Name,Age,Score,Grade
0,Alice,25.0,88,B
1,Bob,,92,A
2,Cathy,22.0,79,B
3,Dan,35.0,85,B
4,Eli,28.0,95,A


In [40]:
df.isna().sum()

Name     0
Age      1
Score    0
Grade    0
dtype: int64

In [42]:
df["Age"] = df["Age"].fillna(df["Age"].mean())

In [43]:
df.to_csv("cleaned_students.csv", index=False)

In [44]:
cleaned = pd.read_csv("cleaned_students.csv")
cleaned

Unnamed: 0,Name,Age,Score,Grade
0,Alice,25.0,88,B
1,Bob,27.5,92,A
2,Cathy,22.0,79,B
3,Dan,35.0,85,B
4,Eli,28.0,95,A
