# Sorting with sort_values

In [1]:
import pandas as pd

# Build a small employee table to use as the sorting example
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [24, 27, 22, 32],
    'Salary': [70000, 80000, 120000, 90000],
}
df = pd.DataFrame(data)
print("Original DataFrame:", df, sep="\n")

# Single-key sort: order the rows by Age, smallest first.
# sort_values returns a new frame; df itself is left untouched.
sorted_df_age = df.sort_values(by="Age", ascending=True)
print("\nDataFrame sorted by Age:", sorted_df_age, sep="\n")

# Two-key sort: Age (ascending) is the primary key, Salary (descending) the
# secondary key. Every Age in this sample is distinct, so the result matches
# the single-key sort above.
sorted_df_age_salary = df.sort_values(
    by=["Age", "Salary"],
    ascending=[True, False],
)
print(
    "\nDataFrame sorted by Age (ascending) and Salary (descending):",
    sorted_df_age_salary,
    sep="\n",
)


Original DataFrame:
      Name  Age  Salary
0    Alice   24   70000
1      Bob   27   80000
2  Charlie   22  120000
3    David   32   90000

DataFrame sorted by Age:
      Name  Age  Salary
2  Charlie   22  120000
0    Alice   24   70000
1      Bob   27   80000
3    David   32   90000

DataFrame sorted by Age (ascending) and Salary (descending):
      Name  Age  Salary
2  Charlie   22  120000
0    Alice   24   70000
1      Bob   27   80000
3    David   32   90000


# Astype

In [2]:
import pandas as pd

# Build the sample table; Age is deliberately stored as strings so there is
# something to convert
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': ['24', '27', '22', '32'],
    'Salary': [70000, 80000, 120000, 90000],
}
df = pd.DataFrame(data)
print("Original DataFrame:", df, sep="\n")
print("\nData types of columns:", df.dtypes, sep="\n")

# Convert Age from strings to integers. The dict form of astype changes only
# the listed column and leaves the others as they were.
df = df.astype({'Age': int})
print("\nDataFrame after changing Age to int:", df, sep="\n")
print("\nData types of columns after change:", df.dtypes, sep="\n")

# Convert Salary from integers to floating point
df = df.astype({'Salary': float})
print("\nDataFrame after changing Age to int and Salary to float:", df, sep="\n")
print("\nData types of columns after changes:", df.dtypes, sep="\n")


Original DataFrame:
      Name Age  Salary
0    Alice  24   70000
1      Bob  27   80000
2  Charlie  22  120000
3    David  32   90000

Data types of columns:
Name      object
Age       object
Salary     int64
dtype: object

DataFrame after changing Age to int:
      Name  Age  Salary
0    Alice   24   70000
1      Bob   27   80000
2  Charlie   22  120000
3    David   32   90000

Data types of columns after change:
Name      object
Age        int32
Salary     int64
dtype: object

DataFrame after changing Age to int and Salary to float:
      Name  Age    Salary
0    Alice   24   70000.0
1      Bob   27   80000.0
2  Charlie   22  120000.0
3    David   32   90000.0

Data types of columns after changes:
Name       object
Age         int32
Salary    float64
dtype: object


# isin and String Methods

In [3]:
import pandas as pd

# Build the sample table used for the filtering and string-method examples
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [24, 27, 22, 32],
    'City': ['New York', 'Los Angeles', 'Chicago', 'Houston'],
}
df = pd.DataFrame(data)
print("Original DataFrame:", df, sep="\n")

# isin() builds a boolean mask that is True where City equals one of the
# listed values; .loc keeps only those rows
filtered_df = df.loc[df['City'].isin(['New York', 'Chicago'])]
print("\nFiltered DataFrame (City is New York or Chicago):", filtered_df, sep="\n")

# The .str accessor applies a string method to every value of the column.
# Each step appends one derived column and prints the growing frame.
df = df.assign(Name_Lower=df['Name'].str.lower())
print("\nDataFrame with lowercased names:", df, sep="\n")

df = df.assign(Name_Upper=df['Name'].str.upper())
print("\nDataFrame with uppercased names:", df, sep="\n")

df = df.assign(Name_Capitalized=df['Name'].str.capitalize())
print("\nDataFrame with capitalized names:", df, sep="\n")

# strip() removes leading/trailing whitespace; the sample names have none,
# so this column equals Name
df = df.assign(Name_Stripped=df['Name'].str.strip())
print("\nDataFrame with stripped names:", df, sep="\n")


Original DataFrame:
      Name  Age         City
0    Alice   24     New York
1      Bob   27  Los Angeles
2  Charlie   22      Chicago
3    David   32      Houston

Filtered DataFrame (City is New York or Chicago):
      Name  Age      City
0    Alice   24  New York
2  Charlie   22   Chicago

DataFrame with lowercased names:
      Name  Age         City Name_Lower
0    Alice   24     New York      alice
1      Bob   27  Los Angeles        bob
2  Charlie   22      Chicago    charlie
3    David   32      Houston      david

DataFrame with uppercased names:
      Name  Age         City Name_Lower Name_Upper
0    Alice   24     New York      alice      ALICE
1      Bob   27  Los Angeles        bob        BOB
2  Charlie   22      Chicago    charlie    CHARLIE
3    David   32      Houston      david      DAVID

DataFrame with capitalized names:
      Name  Age         City Name_Lower Name_Upper Name_Capitalized
0    Alice   24     New York      alice      ALICE            Alice
1      Bob  

# nsmallest, nlargest, unique, nunique

In [4]:
import pandas as pd

# Build a five-row employee table for the ranking and uniqueness examples
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [24, 27, 22, 32, 29],
    'Salary': [70000, 80000, 120000, 90000, 100000],
}
df = pd.DataFrame(data)
print("Original DataFrame:", df, sep="\n")

# nlargest(): the three rows with the highest Salary, highest first
largest_3_salaries = df.nlargest(n=3, columns='Salary')
print("\n3 rows with the largest Salary:", largest_3_salaries, sep="\n")

# nsmallest(): the three rows with the lowest Age, lowest first
smallest_3_ages = df.nsmallest(n=3, columns='Age')
print("\n3 rows with the smallest Age:", smallest_3_ages, sep="\n")

# unique(): the distinct values of the Name column, in order of appearance
name_column = df['Name']
unique_names = name_column.unique()
print("\nUnique values in the Name column:", unique_names, sep="\n")

# nunique(): how many distinct values the Name column holds
nunique_names = name_column.nunique()
print("\nNumber of unique values in the Name column:", nunique_names, sep="\n")


Original DataFrame:
      Name  Age  Salary
0    Alice   24   70000
1      Bob   27   80000
2  Charlie   22  120000
3    David   32   90000
4      Eve   29  100000

3 rows with the largest Salary:
      Name  Age  Salary
2  Charlie   22  120000
4      Eve   29  100000
3    David   32   90000

3 rows with the smallest Age:
      Name  Age  Salary
2  Charlie   22  120000
0    Alice   24   70000
1      Bob   27   80000

Unique values in the Name column:
['Alice' 'Bob' 'Charlie' 'David' 'Eve']

Number of unique values in the Name column:
5


# Apply vs Applymap vs Map

In [5]:
import pandas as pd

# Create the DataFrame
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [24, 27, 22, 32],
    'Salary': [70000, 80000, 120000, 90000]
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)


# Using apply() to categorize ages
def categorize_age(age):
    """Return the age-bracket label for a single age value.

    Parameters
    ----------
    age : int or float
        The age to classify.

    Returns
    -------
    str
        'Young' for ages below 25, 'Mid' for ages from 25 up to (but not
        including) 30, and 'Old' for everything else.
    """
    if age < 25:
        return 'Young'
    if age < 30:
        return 'Mid'
    return 'Old'


# Series.apply() calls the function once per value of the Age column
df['Age_Category'] = df['Age'].apply(categorize_age)
print("\nDataFrame with Age Category:")
print(df)

# Using the element-wise DataFrame mapper to convert numeric values to strings.
# DataFrame.applymap() was deprecated in pandas 2.1 and renamed to
# DataFrame.map(); use the new name when the installed pandas has it and fall
# back to applymap() on older versions so the cell runs on both.
elementwise = df.map if hasattr(pd.DataFrame, "map") else df.applymap
df_str = elementwise(lambda x: str(x) if isinstance(x, (int, float)) else x)
print("\nDataFrame with numeric values converted to strings:")
print(df_str)

# Using Series.map() with a dict to get abbreviated names: each Name is
# looked up as a key and replaced by the corresponding value
name_mapping = {
    'Alice': 'A',
    'Bob': 'B',
    'Charlie': 'C',
    'David': 'D'
}

df['Name_Abbrev'] = df['Name'].map(name_mapping)
print("\nDataFrame with Abbreviated Names:")
print(df)


Original DataFrame:
      Name  Age  Salary
0    Alice   24   70000
1      Bob   27   80000
2  Charlie   22  120000
3    David   32   90000

DataFrame with Age Category:
      Name  Age  Salary Age_Category
0    Alice   24   70000        Young
1      Bob   27   80000          Mid
2  Charlie   22  120000        Young
3    David   32   90000          Old

DataFrame with numeric values converted to strings:
      Name Age  Salary Age_Category
0    Alice  24   70000        Young
1      Bob  27   80000          Mid
2  Charlie  22  120000        Young
3    David  32   90000          Old

DataFrame with Abbreviated Names:
      Name  Age  Salary Age_Category Name_Abbrev
0    Alice   24   70000        Young           A
1      Bob   27   80000          Mid           B
2  Charlie   22  120000        Young           C
3    David   32   90000          Old           D


# Duplicates

In [7]:
import pandas as pd

# Build a table whose last row repeats the first one exactly
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Alice'],
    'Age': [24, 27, 22, 32, 24],
    'Salary': [70000, 80000, 120000, 90000, 70000],
}
df = pd.DataFrame(data)
print("Original DataFrame:", df, sep="\n")

# duplicated() flags every row that repeats an earlier row; any() reduces
# the flags to a single yes/no answer
duplicate_flags = df.duplicated()
has_duplicates = duplicate_flags.any()
print("\nAre there any duplicates in the DataFrame?", has_duplicates, sep="\n")

# drop_duplicates() compares all columns and keeps the first occurrence
df_no_duplicates = df.drop_duplicates(keep='first')
print("\nDataFrame after removing duplicates:", df_no_duplicates, sep="\n")

# Restrict the comparison to Name and Age only
df_no_duplicates_name_age = df.drop_duplicates(subset=['Name', 'Age'], keep='first')
print(
    "\nDataFrame after removing duplicates based on Name and Age:",
    df_no_duplicates_name_age,
    sep="\n",
)

# keep='last' retains the final occurrence of each duplicated row instead
df_no_duplicates_keep_last = df.drop_duplicates(keep='last')
print(
    "\nDataFrame after removing duplicates, keeping the last occurrence:",
    df_no_duplicates_keep_last,
    sep="\n",
)


Original DataFrame:
      Name  Age  Salary
0    Alice   24   70000
1      Bob   27   80000
2  Charlie   22  120000
3    David   32   90000
4    Alice   24   70000

Are there any duplicates in the DataFrame?
True

DataFrame after removing duplicates:
      Name  Age  Salary
0    Alice   24   70000
1      Bob   27   80000
2  Charlie   22  120000
3    David   32   90000

DataFrame after removing duplicates based on Name and Age:
      Name  Age  Salary
0    Alice   24   70000
1      Bob   27   80000
2  Charlie   22  120000
3    David   32   90000

DataFrame after removing duplicates, keeping the last occurrence:
      Name  Age  Salary
1      Bob   27   80000
2  Charlie   22  120000
3    David   32   90000
4    Alice   24   70000
