In [1]:
import pandas as pd

In [109]:
# Small demo table; the None entries are deliberate gaps that a later
# cell detects and fills.
data = dict(
    Name=['Ali', 'Kamran', 'Javeria', 'Nadir', 'Qadeer'],
    Age=[19, 32, 24, 17, None],
    Sex=['male', 'male', None, 'female', 'male'],
    City=['Lahore', 'Karachi', 'Islamabad', 'Karachi', 'Karachi'],
)
df = pd.DataFrame(data)
print(df)

      Name   Age     Sex       City
0      Ali  19.0    male     Lahore
1   Kamran  32.0    male    Karachi
2  Javeria  24.0    None  Islamabad
3    Nadir  17.0  female    Karachi
4   Qadeer   NaN    male    Karachi


##### Column Selection
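###### You can select a single column by putting its name in single brackets (`df['Name']`, which returns a Series) or in double brackets (`df[['Name']]`, which returns a one-column DataFrame). Multiple columns can be selected with double brackets in the same way, creating a new DataFrame.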

In [81]:
# Single brackets: the column comes back as a Series
# (printed without a header, with "Name: ..., dtype: ..." at the bottom)
print(df['Name'])
# Double brackets (a list of column names): the result is a one-column
# DataFrame, so the column header is printed
print(df[['Name']])
# Get the values of all rows of a column as a NumPy array;
# to_numpy() is the accessor pandas recommends over .values
print(df['Name'].to_numpy())

0        Ali
1     Kamran
2    Javeria
3      Nadir
4     Qadeer
Name: Name, dtype: object
      Name
0      Ali
1   Kamran
2  Javeria
3    Nadir
4   Qadeer
['Ali' 'Kamran' 'Javeria' 'Nadir' 'Qadeer']


##### Accessing Rows:
###### You can access rows by their integer position using .iloc[] or by their index label using .loc[].

In [82]:
# Access the 3rd row by integer position (positions start at 0)
print(df.iloc[2])
# Access the 2nd row by index label; with the default RangeIndex the
# labels are 0..n-1, so label 1 happens to be the 2nd row
print(df.loc[1])

Name      Javeria
Age          24.0
Sex          None
City    Islamabad
Name: 2, dtype: object
Name     Kamran
Age        32.0
Sex        male
City    Karachi
Name: 1, dtype: object


In [35]:
# Column subset: keep every row but only the listed columns
wanted_columns = ['Name', 'Age']
print(df.loc[:, wanted_columns])
# Row subset: positions 2 and 3 (the end of the slice is exclusive)
print(df.iloc[2:4])

      Name  Age
0      Ali   19
1   Kamran   32
2  Javeria   24
3    Nadir   17
      Name  Age     Sex       City
2  Javeria   24  female  Islamabad
3    Nadir   17    male    Karachi


In [83]:
# Distinct values in the 'Sex' column, followed by how many there are
sex_column = df['Sex']
print(sex_column.unique())
print(sex_column.nunique())


['male' None 'female']
2


##### Conditional Filtering:
###### You can filter the rows of a DataFrame based on conditions using comparison operators (>, <, >=, <=, ==, !=). For instance, you can keep only the people who are older than a certain age.

In [86]:
# Build a boolean mask that is True where Age exceeds 20,
# then keep only the rows where the mask is True
is_over_20 = df['Age'] > 20
above_20 = df[is_over_20]
above_20

0    male
Name: Sex, dtype: object

##### DataFrame Attributes and Methods
###### DataFrames provide numerous attributes and methods for data manipulation and analysis, including:
    shape: Returns the dimensions (number of rows and columns) of the DataFrame.
    info(): Provides a summary of the DataFrame, including data types and non-null counts.
    describe(): Generates summary statistics for numerical columns.
    head(), tail(): Displays the first or last n rows of the DataFrame.
    mean(), sum(), min(), max(): Calculate summary statistics for columns.
    sort_values(): Sort the DataFrame by one or more columns.
    groupby(): Group data based on specific columns for aggregation.
    fillna(), drop(), rename(): Handle missing values, drop columns, or rename columns.
    apply(): Apply a function to each element, row, or column of the DataFrame.

In [110]:
# Overview of column dtypes and non-null counts
df.info()
# Count the missing entries in every column
print('missing Values\n', df.isnull().sum())
# Numeric gap: fill missing ages with the mean of the column
mean_age = df['Age'].mean()
df['Age'] = df['Age'].fillna(mean_age)
# Categorical gap: fill missing sex with the most frequent value (the mode)
most_common_sex = df['Sex'].mode()[0]
df['Sex'] = df['Sex'].fillna(most_common_sex)
print('missing Values after cleansing\n', df.isnull().sum())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Name    5 non-null      object 
 1   Age     4 non-null      float64
 2   Sex     4 non-null      object 
 3   City    5 non-null      object 
dtypes: float64(1), object(3)
memory usage: 288.0+ bytes
missing Values
 Name    0
Age     1
Sex     1
City    0
dtype: int64
missing Values after cleansing
 Name    0
Age     0
Sex     0
City    0
dtype: int64


In [43]:
# Size of the dataset as (rows, cols)
print('Dataset Dimensions (rows, cols):', df.shape)
# Mean, minimum and maximum of the 'Age' column
ages = df['Age']
print('Mean Age:', ages.mean())
print('Minimun Age:', ages.min())
print('Maximum Age:', ages.max())
# Sort the rows by 'Age' in descending order, printing the table
# before and after so the reordering is visible
print('Before:\n', df)
df = df.sort_values(by='Age', ascending=False)
print('After:\n', df)


Dataset Dimensions (rows, cols): (4, 4)
Mean Age: 23.0
Minimun Age: 17
Maximum Age: 32
Before:
       Name  Age     Sex       City
0      Ali   19    male     Lahore
1   Kamran   32    male    Karachi
2  Javeria   24  female  Islamabad
3    Nadir   17    male    Karachi
After:
       Name  Age     Sex       City
1   Kamran   32    male    Karachi
2  Javeria   24  female  Islamabad
0      Ali   19    male     Lahore
3    Nadir   17    male    Karachi


In [53]:
# Group the rows by 'Sex' once and reuse the grouping for both summaries
by_sex = df.groupby('Sex')
# Non-null count of every remaining column per group
print(by_sex.count())
# Same count restricted to the 'City' column (kept as a DataFrame)
print(by_sex[['City']].count())

        Name  Age  City
Sex                    
female     1    1     1
male       3    3     3
        City
Sex         
female     1
male       3


In [66]:
# Number of males overall, read from the per-value counts of 'Sex'
sex_counts = df['Sex'].value_counts()
print('Total males:', sex_counts['male'])
# Number of males living in Karachi: filter on both conditions, then count
karachi_males = df.query("Sex=='male' & City=='Karachi'")
print('Males in Karachi:', karachi_males["City"].count())


Total males: 3
Males in Karachi: 2
