### Importing libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder,MinMaxScaler,StandardScaler

### DataFrame Creation

In [2]:
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eva', 'Frank', 'Alice'],
    'Age': [25, 30, 35, np.nan, 40, 28, 25],
    'Salary': [50000, 60000, 70000, 80000, 90000, 55000, 50000],
    'Department': ['HR', 'IT', 'IT', 'HR', 'Finance', 'IT', 'HR'],
    'Priority': ['Low', 'Medium', 'High', 'Medium', 'High', 'Low', 'Low']
}
df = pd.DataFrame(data)
df

Unnamed: 0,Name,Age,Salary,Department,Priority
0,Alice,25.0,50000,HR,Low
1,Bob,30.0,60000,IT,Medium
2,Charlie,35.0,70000,IT,High
3,David,,80000,HR,Medium
4,Eva,40.0,90000,Finance,High
5,Frank,28.0,55000,IT,Low
6,Alice,25.0,50000,HR,Low


### Basic Statistics

In [3]:
df.describe()

Unnamed: 0,Age,Salary
count,6.0,7.0
mean,30.5,65000.0
std,5.958188,15545.631755
min,25.0,50000.0
25%,25.75,52500.0
50%,29.0,60000.0
75%,33.75,75000.0
max,40.0,90000.0


In [4]:
df['Age'].value_counts()

Age
25.0    2
30.0    1
35.0    1
40.0    1
28.0    1
Name: count, dtype: int64

In [5]:
df['Salary'].mean()
df['Salary'].median()
df['Salary'].std()

15545.631755148024

### Taking only numeric columns

In [10]:
numeric_cols = df.select_dtypes(include=np.number).columns
df_numeric = df[numeric_cols]

### Missing values handling

In [8]:
df.isna().sum()

Name          0
Age           1
Salary        0
Department    0
Priority      0
dtype: int64

In [12]:
df_numeric.fillna(df_numeric.mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_numeric.fillna(df_numeric.mean(), inplace=True)


In [14]:
df_numeric.isna().sum()

Age       0
Salary    0
dtype: int64

In [15]:
df['Age'].fillna(df['Age'].mean(),inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(),inplace=True)


### Duplicate Check and Handling

In [16]:
df.duplicated().sum()

np.int64(1)

In [17]:
df.drop_duplicates(keep='first',inplace=True)

### Column drop and rename

In [18]:
df.drop(columns='Priority',axis=1,inplace=True)


In [19]:
df.rename(columns={'Salary':'Income'},inplace=True)

### Encoding categorical values

In [20]:
le = LabelEncoder()
df['Dept_encoded'] = le.fit_transform(df['Department'])

In [21]:
df.head()

Unnamed: 0,Name,Age,Income,Department,Dept_encoded
0,Alice,25.0,50000,HR,1
1,Bob,30.0,60000,IT,2
2,Charlie,35.0,70000,IT,2
3,David,30.5,80000,HR,1
4,Eva,40.0,90000,Finance,0


In [22]:
df_encoded = pd.get_dummies(df,columns=['Department'])
df_encoded

Unnamed: 0,Name,Age,Income,Dept_encoded,Department_Finance,Department_HR,Department_IT
0,Alice,25.0,50000,1,False,True,False
1,Bob,30.0,60000,2,False,False,True
2,Charlie,35.0,70000,2,False,False,True
3,David,30.5,80000,1,False,True,False
4,Eva,40.0,90000,0,True,False,False
5,Frank,28.0,55000,2,False,False,True


### Normalization / Scaling

In [23]:
scaler_std = StandardScaler()
df['Income_scaled'] = scaler_std.fit_transform(df[['Income']])

scaler_minmax  = MinMaxScaler()
df['Income_minmax'] = scaler_minmax.fit_transform(df[['Income']])

In [24]:
df

Unnamed: 0,Name,Age,Income,Department,Dept_encoded,Income_scaled,Income_minmax
0,Alice,25.0,50000,HR,1,-1.243933,0.0
1,Bob,30.0,60000,IT,2,-0.533114,0.25
2,Charlie,35.0,70000,IT,2,0.177705,0.5
3,David,30.5,80000,HR,1,0.888523,0.75
4,Eva,40.0,90000,Finance,0,1.599342,1.0
5,Frank,28.0,55000,IT,2,-0.888523,0.125


### Working with loc & iloc

In [25]:
# loc - label based
df.loc[0,'Name']

'Alice'

In [26]:
df.loc[:,['Name','Age']]

Unnamed: 0,Name,Age
0,Alice,25.0
1,Bob,30.0
2,Charlie,35.0
3,David,30.5
4,Eva,40.0
5,Frank,28.0


In [27]:
df.loc[df['Income']>60000]

Unnamed: 0,Name,Age,Income,Department,Dept_encoded,Income_scaled,Income_minmax
2,Charlie,35.0,70000,IT,2,0.177705,0.5
3,David,30.5,80000,HR,1,0.888523,0.75
4,Eva,40.0,90000,Finance,0,1.599342,1.0


In [28]:
# iloc - position based
df.iloc[0,1]

np.float64(25.0)

In [29]:
df.iloc[:,0:2]

Unnamed: 0,Name,Age
0,Alice,25.0
1,Bob,30.0
2,Charlie,35.0
3,David,30.5
4,Eva,40.0
5,Frank,28.0


### Grouping + Aggregation + Transform

In [30]:
df.groupby('Dept_encoded')['Income'].mean()

Dept_encoded
0    90000.000000
1    65000.000000
2    61666.666667
Name: Income, dtype: float64

In [31]:
# multiple aggregation
df.groupby('Dept_encoded').agg({'Income':['mean','max'],'Age':'mean'})

Unnamed: 0_level_0,Income,Income,Age
Unnamed: 0_level_1,mean,max,mean
Dept_encoded,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,90000.0,90000,40.0
1,65000.0,80000,27.75
2,61666.666667,70000,31.0


### Advanced Indexing

In [33]:
# boolean indexing
df[(df['Age']>30) & (df['Income']>60000)]

Unnamed: 0,Name,Age,Income,Department,Dept_encoded,Income_scaled,Income_minmax
2,Charlie,35.0,70000,IT,2,0.177705,0.5
3,David,30.5,80000,HR,1,0.888523,0.75
4,Eva,40.0,90000,Finance,0,1.599342,1.0


In [34]:
# using isin
df[df['Dept_encoded'].isin([0,2])] #select dept 0 and 2

Unnamed: 0,Name,Age,Income,Department,Dept_encoded,Income_scaled,Income_minmax
1,Bob,30.0,60000,IT,2,-0.533114,0.25
2,Charlie,35.0,70000,IT,2,0.177705,0.5
4,Eva,40.0,90000,Finance,0,1.599342,1.0
5,Frank,28.0,55000,IT,2,-0.888523,0.125


### Feature Engineering

In [36]:
# new feature : Age/Income ratio
df['Age_Income_Ratio'] = df['Age'] / df['Income']
df['Age_Income_Ratio']

0    0.000500
1    0.000500
2    0.000500
3    0.000381
4    0.000444
5    0.000509
Name: Age_Income_Ratio, dtype: float64

In [37]:
# Binning / Categorization
df['Age_group'] = pd.cut(df['Age'],bins=[0,30,40,50],labels=['Young','Mid','Old'])

In [38]:
df.head()

Unnamed: 0,Name,Age,Income,Department,Dept_encoded,Income_scaled,Income_minmax,Age_Income_Ratio,Age_group
0,Alice,25.0,50000,HR,1,-1.243933,0.0,0.0005,Young
1,Bob,30.0,60000,IT,2,-0.533114,0.25,0.0005,Young
2,Charlie,35.0,70000,IT,2,0.177705,0.5,0.0005,Mid
3,David,30.5,80000,HR,1,0.888523,0.75,0.000381,Mid
4,Eva,40.0,90000,Finance,0,1.599342,1.0,0.000444,Mid
