# Missing Data (NaN Values) and Group-By

In [3]:
import pandas as pd
import numpy as np

# Sample employee records. np.nan marks a field that is missing for that
# employee (name, salary, or both).
employee_details = [
    dict(idno=101, name="Ravi", salary=125000.00),
    dict(idno=102, name=np.nan, salary=np.nan),
    dict(idno=103, name=np.nan, salary=150000.00),
    dict(idno=104, name="Krishna", salary=105000.00),
    dict(idno=105, name="Prasad", salary=np.nan),
]
employee_details

[{'idno': 101, 'name': 'Ravi', 'salary': 125000.0},
 {'idno': 102, 'name': nan, 'salary': nan},
 {'idno': 103, 'name': nan, 'salary': 150000.0},
 {'idno': 104, 'name': 'Krishna', 'salary': 105000.0},
 {'idno': 105, 'name': 'Prasad', 'salary': nan}]

In [4]:
# Build a pandas DataFrame from the list of employee dicts:
# one row per dict, one column per key.

df = pd.DataFrame(data=employee_details)

df

Unnamed: 0,idno,name,salary
0,101,Ravi,125000.0
1,102,,
2,103,,150000.0
3,104,Krishna,105000.0
4,105,Prasad,


### DataFrame.dropna() — What does it do?

* Purpose: Remove rows or columns that contain missing values (NaN) from a DataFrame.

* Helps clean your data by getting rid of incomplete entries.

#### Basic syntax:

df.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)

#### Parameters explained:

| Parameter | Description                                                                                                   |
| --------- | ------------------------------------------------------------------------------------------------------------- |
| `axis`    | 0 or `'index'` (default): drop rows with missing values<br>1 or `'columns'`: drop columns with missing values |
| `how`     | `'any'` (default): drop if **any** value is `NaN`<br>`'all'`: drop only if **all** values are `NaN`           |
| `thresh`  | Minimum number of **non-NA values** a row/column must contain to be kept (rows/columns with fewer are dropped) |
| `subset`  | Limit checking to specific columns (if dropping rows) or rows (if dropping columns)                           |
| `inplace` | If `True`, do operation inplace and return `None`. Default is `False` (returns a new DataFrame)               |


In [5]:
# Examples of the dropna function.
# Display the original frame first so the results can be compared against it.

df

Unnamed: 0,idno,name,salary
0,101,Ravi,125000.0
1,102,,
2,103,,150000.0
3,104,Krishna,105000.0
4,105,Prasad,


In [7]:
df.dropna()  # Remove every row that contains at least one NaN value.

Unnamed: 0,idno,name,salary
0,101,Ravi,125000.0
3,104,Krishna,105000.0


In [11]:
# Drop every column that contains at least one NaN; only `idno` is fully populated.
df.dropna(axis='columns')

Unnamed: 0,idno
0,101
1,102
2,103
3,104
4,105


In [17]:
# thresh counts the NON-NaN values a row needs in order to be kept.
# Row 1 has only one non-NaN value (idno), so it is the only row dropped.
df.dropna(thresh=2)  # Keep only rows with at least 2 non-NaN values.

Unnamed: 0,idno,name,salary
0,101,Ravi,125000.0
2,103,,150000.0
3,104,Krishna,105000.0
4,105,Prasad,


### DataFrame.fillna() — What does it do?

* Used to fill missing values (NaN) in a DataFrame with a specified value or method.

* Helps replace missing data to avoid errors in analysis or modeling.

#### Basic syntax:

df.fillna(value=None, method=None, axis=None, inplace=False, limit=None)

#### Parameters explained:

| Parameter | Description                                                                                                            |
| --------- | ---------------------------------------------------------------------------------------------------------------------- |
| `value`   | Scalar, dict, Series, or DataFrame to fill `NaN` with. Example: `0` or `{'A': 1, 'B': 2}`                              |
| `method`  | `'ffill'` or `'pad'` to forward fill (use previous value), `'bfill'` or `'backfill'` to backward fill (use next value). Deprecated since pandas 2.1 — prefer `df.ffill()` / `df.bfill()` |
| `axis`    | 0 or `'index'` (default): fill along rows<br>1 or `'columns'`: fill along columns                                      |
| `inplace` | If True, modify DataFrame in place (no return). Default False (returns a new DataFrame)                                |
| `limit`   | Max number of consecutive NaNs to fill                                                                                 |


In [18]:
# Show the frame again (the dropna calls above did not modify it)
# so the fillna results can be compared against it.
df

Unnamed: 0,idno,name,salary
0,101,Ravi,125000.0
1,102,,
2,103,,150000.0
3,104,Krishna,105000.0
4,105,Prasad,


In [19]:
# Replace every NaN, in every column, with the same placeholder string.
# Note that the numeric `salary` column receives the string as well (see the
# output), so it ends up holding a mix of numbers and text.
df.fillna(value='Not Given By Employee')

Unnamed: 0,idno,name,salary
0,101,Ravi,125000.0
1,102,Not Given By Employee,Not Given By Employee
2,103,Not Given By Employee,150000.0
3,104,Krishna,105000.0
4,105,Prasad,Not Given By Employee


In [20]:
# Same scalar fill with the axis given by name. With a single scalar value
# the displayed result is the same as the call without `axis`.
df.fillna(value='Not Given By Employee', axis='columns')

Unnamed: 0,idno,name,salary
0,101,Ravi,125000.0
1,102,Not Given By Employee,Not Given By Employee
2,103,Not Given By Employee,150000.0
3,104,Krishna,105000.0
4,105,Prasad,Not Given By Employee


# Group By

### What is GroupBy in Pandas?
* GroupBy is a method in pandas used to split data into groups based on some criteria, apply a function to each group, and then combine the results.

* It follows the "Split-Apply-Combine" pattern:

    * **Split** the data into groups (based on column values).

    * **Apply** a function (like aggregation, transformation, or filtering) to each group.

    * **Combine** the results back into a DataFrame or Series.

### Why use GroupBy?

* To summarize, aggregate, or transform data by categories.

* Useful for analyzing data by groups — e.g., sales per company, average scores by class, total revenue by region.

##### `DataFrame.groupby(by=None, axis=<no_default>, level=None, as_index=True, sort=True, group_keys=True, observed=<no_default>, dropna=True)`


In [21]:
# Sales records for 25 employees spread over five companies.
# Each tuple below is (name, sales, company); employee_id is the record's
# 1-based position in the list.
five_company_employees_info = [
    {'employee_id': emp_id, 'name': emp_name, 'sales': emp_sales, 'company': emp_company}
    for emp_id, (emp_name, emp_sales, emp_company) in enumerate(
        [
            ('Fdlks Fqowte', 8383, 'DataWorks'),
            ('Mrmqsl Wfkrlxav', 2514, 'NextGen'),
            ('Umgoak Nkzvru', 7262, 'SalesPro'),
            ('Dqqoj Lxyhxc', 9114, 'TechCorp'),
            ('Fllrsb Ursyhlb', 7886, 'InnovateX'),
            ('Ueqre Rxxapw', 1022, 'InnovateX'),
            ('Lmbyn Xznve', 6806, 'NextGen'),
            ('Wxwsb Rvxpezf', 9369, 'SalesPro'),
            ('Wfboj Zthkwhs', 1931, 'NextGen'),
            ('Wvzpkd Tfpmifv', 3073, 'DataWorks'),
            ('Uvlipz Pkbwej', 1615, 'DataWorks'),
            ('Yozfbx Ywrkhka', 4135, 'NextGen'),
            ('Gqhmji Lgqpn', 2274, 'TechCorp'),
            ('Gzsrzt Hznxje', 5219, 'SalesPro'),
            ('Lodlnt Mvpvo', 6571, 'TechCorp'),
            ('Dvwklz Zplqol', 3906, 'InnovateX'),
            ('Jltvnx Kkvmex', 4207, 'InnovateX'),
            ('Qbhtxf Kyhdlf', 1543, 'TechCorp'),
            ('Gxiwyx Vqxvmjwl', 9648, 'NextGen'),
            ('Zddry Rwooqk', 5616, 'SalesPro'),
            ('Yvktzx Rnpazk', 5712, 'TechCorp'),
            ('Ompvpt Dcevk', 4049, 'TechCorp'),
            ('Cddztx Xghhn', 4404, 'SalesPro'),
            ('Xhxyo Whhfov', 4456, 'DataWorks'),
            ('Zsqnup Woixt', 3397, 'InnovateX'),
        ],
        start=1,
    )
]

In [22]:
# Convert the list of sales records into a DataFrame.

# Already imported at the top of the notebook; repeated here, presumably so
# the Group-By section can be started without running the earlier cells.
import pandas as pd
import numpy as np

# NOTE: from here on `df` refers to the sales data; it replaces the employee
# frame used in the missing-data section.
df = pd.DataFrame(five_company_employees_info)

# Preview the first 10 rows instead of rendering all 25.
df.head(10)

Unnamed: 0,employee_id,name,sales,company
0,1,Fdlks Fqowte,8383,DataWorks
1,2,Mrmqsl Wfkrlxav,2514,NextGen
2,3,Umgoak Nkzvru,7262,SalesPro
3,4,Dqqoj Lxyhxc,9114,TechCorp
4,5,Fllrsb Ursyhlb,7886,InnovateX
5,6,Ueqre Rxxapw,1022,InnovateX
6,7,Lmbyn Xznve,6806,NextGen
7,8,Wxwsb Rvxpezf,9369,SalesPro
8,9,Wfboj Zthkwhs,1931,NextGen
9,10,Wvzpkd Tfpmifv,3073,DataWorks


In [24]:
# Group the rows by the value in their `company` column.
company = df.groupby('company')

company   # displays the DataFrameGroupBy object itself, not a table of results

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x10d742090>

In [27]:
# Sum every non-key column within each company. For the text column `name`,
# "sum" means string concatenation (visible in the output), and the total of
# `employee_id` carries no real meaning; `sales` is the useful figure here.
company.sum()

Unnamed: 0_level_0,employee_id,name,sales
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DataWorks,46,Fdlks FqowteWvzpkd TfpmifvUvlipz PkbwejXhxyo W...,17527
InnovateX,69,Fllrsb UrsyhlbUeqre RxxapwDvwklz ZplqolJltvnx ...,20418
NextGen,49,Mrmqsl WfkrlxavLmbyn XznveWfboj ZthkwhsYozfbx ...,25034
SalesPro,68,Umgoak NkzvruWxwsb RvxpezfGzsrzt HznxjeZddry R...,31870
TechCorp,93,Dqqoj LxyhxcGqhmji LgqpnLodlnt MvpvoQbhtxf Kyh...,29263


In [29]:
# Largest value of each column within each company. The maximum is taken for
# every column independently (for `name`, the alphabetically last name), so
# the values in one output row need not belong to the same employee.
company.max()

Unnamed: 0_level_0,employee_id,name,sales
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DataWorks,24,Xhxyo Whhfov,8383
InnovateX,25,Zsqnup Woixt,7886
NextGen,19,Yozfbx Ywrkhka,9648
SalesPro,23,Zddry Rwooqk,9369
TechCorp,22,Yvktzx Rnpazk,9114


In [30]:
# Smallest value of each column within each company. The minimum is taken for
# every column independently (for `name`, the alphabetically first name), so
# the values in one output row need not belong to the same employee.
company.min()

Unnamed: 0_level_0,employee_id,name,sales
company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
DataWorks,1,Fdlks Fqowte,1615
InnovateX,5,Dvwklz Zplqol,1022
NextGen,2,Gxiwyx Vqxvmjwl,1931
SalesPro,3,Cddztx Xghhn,4404
TechCorp,4,Dqqoj Lxyhxc,1543


In [32]:
# Total sales per company. Selecting `sales` before aggregating means only
# that column is summed, instead of summing every column (including the
# string concatenation of `name`) and then discarding all but one.
company['sales'].sum()

company
DataWorks    17527
InnovateX    20418
NextGen      25034
SalesPro     31870
TechCorp     29263
Name: sales, dtype: int64

In [33]:
# Summary statistics (count, mean, std, min, quartiles, max) per company.
# As the output shows, only the numeric columns `employee_id` and `sales`
# are summarised; the text column `name` is left out.
company.describe()

Unnamed: 0_level_0,employee_id,employee_id,employee_id,employee_id,employee_id,employee_id,employee_id,employee_id,sales,sales,sales,sales,sales,sales,sales,sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
company,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
DataWorks,4.0,11.5,9.469248,1.0,7.75,10.5,14.25,24.0,4.0,4381.75,2908.793951,1615.0,2708.5,3764.5,5437.75,8383.0
InnovateX,5.0,13.8,8.348653,5.0,6.0,16.0,17.0,25.0,5.0,4083.6,2467.277305,1022.0,3397.0,3906.0,4207.0,7886.0
NextGen,5.0,9.8,6.300794,2.0,7.0,9.0,12.0,19.0,5.0,5006.8,3209.847769,1931.0,2514.0,4135.0,6806.0,9648.0
SalesPro,5.0,13.6,8.264381,3.0,8.0,14.0,20.0,23.0,5.0,6374.0,1971.551293,4404.0,5219.0,5616.0,7262.0,9369.0
TechCorp,6.0,15.5,6.595453,4.0,13.5,16.5,20.25,22.0,6.0,4877.166667,2831.811393,1543.0,2717.75,4880.5,6356.25,9114.0


In [34]:
# Flip the summary: companies become the columns and each
# (column, statistic) pair becomes a row.
company.describe().T

Unnamed: 0,company,DataWorks,InnovateX,NextGen,SalesPro,TechCorp
employee_id,count,4.0,5.0,5.0,5.0,6.0
employee_id,mean,11.5,13.8,9.8,13.6,15.5
employee_id,std,9.469248,8.348653,6.300794,8.264381,6.595453
employee_id,min,1.0,5.0,2.0,3.0,4.0
employee_id,25%,7.75,6.0,7.0,8.0,13.5
employee_id,50%,10.5,16.0,9.0,14.0,16.5
employee_id,75%,14.25,17.0,12.0,20.0,20.25
employee_id,max,24.0,25.0,19.0,23.0,22.0
sales,count,4.0,5.0,5.0,5.0,6.0
sales,mean,4381.75,4083.6,5006.8,6374.0,4877.166667
