In [11]:
import pandas as pd
import numpy as np

In [13]:
# Generating basic DataFrame
data = np.arange(1, 21)
print("1d array:", data)

reshaped_data = data.reshape(5, 4)
print("\n 2d array:", reshaped_data)

df = pd.DataFrame(reshaped_data, columns=['A', 'B', 'C', 'D'])
print("\n DataFrame:", df)


1d array: [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]

 2d array: [[ 1  2  3  4]
 [ 5  6  7  8]
 [ 9 10 11 12]
 [13 14 15 16]
 [17 18 19 20]]

 DataFrame:     A   B   C   D
0   1   2   3   4
1   5   6   7   8
2   9  10  11  12
3  13  14  15  16
4  17  18  19  20


In [14]:
# Generating random DataFrame
df = pd.DataFrame({
    'A': np.random.rand(5),
    'B': np.random.rand(5)
})

print("DataFrame:", df)

DataFrame:           A         B
0  0.729288  0.350330
1  0.198970  0.181211
2  0.158525  0.108594
3  0.853634  0.283908
4  0.436089  0.668207


In [15]:
# with rows and columns names
data = np.arange(1, 21)
reshaped_data = data.reshape(5, 4)
df = pd.DataFrame(reshaped_data, index=['Row1', 'Row2', 'Row3', 'Row4', 'Row5'], columns=['Column1', 'Column2', 'Column3', 'Column4'])
print(df)


      Column1  Column2  Column3  Column4
Row1        1        2        3        4
Row2        5        6        7        8
Row3        9       10       11       12
Row4       13       14       15       16
Row5       17       18       19       20


In [16]:
# Get top 5 rows
# if n is empty it will return the first 5 rows
n = 3
top_rows = df.head(n)
print("\nTop n rows:\n", top_rows)


Top n rows:
       Column1  Column2  Column3  Column4
Row1        1        2        3        4
Row2        5        6        7        8
Row3        9       10       11       12


In [17]:
# Get last 5 rows
# if n is empty it will return the last 5 rows
n = 3
bottom_rows = df.tail(n)
print("\nBottom n rows:\n", bottom_rows)


Bottom n rows:
       Column1  Column2  Column3  Column4
Row3        9       10       11       12
Row4       13       14       15       16
Row5       17       18       19       20


In [None]:
# Getting columns datatype and non-null counts
# This provides a quick overview of the DataFrame's structure
df_info = df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5 entries, Row1 to Row5
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   Column1  5 non-null      int64
 1   Column2  5 non-null      int64
 2   Column3  5 non-null      int64
 3   Column4  5 non-null      int64
dtypes: int64(4)
memory usage: 200.0+ bytes


In [None]:
# Describe DataFrame
# Get a statistical summary of the DataFrame
# This includes count, mean, std, min, 25%, 50%, 75%, max for each numeric column
df_description = df.describe()
print("\nDataFrame Description:\n", df_description)



DataFrame Description:
          Column1    Column2    Column3    Column4
count   5.000000   5.000000   5.000000   5.000000
mean    9.000000  10.000000  11.000000  12.000000
std     6.324555   6.324555   6.324555   6.324555
min     1.000000   2.000000   3.000000   4.000000
25%     5.000000   6.000000   7.000000   8.000000
50%     9.000000  10.000000  11.000000  12.000000
75%    13.000000  14.000000  15.000000  16.000000
max    17.000000  18.000000  19.000000  20.000000


In [67]:
## Indexing and selecting data in a DataFrame
# Display the original DataFrame
print("\nOriginal DataFrame:\n", df)

# Select a single column
# Single Column or Single row called a Series
column_1 = df['Column1']
print("Column1:\n", column_1)

# Select multiple columns
# Multiple columns are still a DataFrame
columns_1_2 = df[['Column1', 'Column2']]
print("\nColumns 1 and 2:\n", columns_1_2)

# Select rows by label using .loc
row_0 = df.loc['Row1']
print("\nRow 0:\n", row_0)

# Select multiple rows by label using .loc
rows_0_1 = df.loc[['Row1', 'Row2']]
print("\nRows 0 and 1:\n", rows_0_1)

# Select multiple rows by integer location using .iloc
rows_0_1 = df.iloc[0:3]
print("\nRows 0 and 1 (using iloc):\n", rows_0_1)

# Select multiple columns by integer location using .iloc
columns_1_2 = df.iloc[:, 1:3]
print("\nColumns 1 and 2 (using iloc):\n", columns_1_2)

# Select specific rows and columns using .iloc
# Bounded by the rows and columns specified (0 to 2 for rows and 1 to 3 for columns)
# This will select rows 0 and 1, and columns 1 and 2
specific_rows_columns = df.iloc[0:2, 1:3]
print("\nSpecific Rows and Columns (using iloc):\n", specific_rows_columns)



Original DataFrame:
       Column1  Column2  Column3  Column4
Row1        1        2        3        4
Row2        5        6        7        8
Row3        9       10       11       12
Row4       13       14       15       16
Row5       17       18       19       20
Column1:
 Row1     1
Row2     5
Row3     9
Row4    13
Row5    17
Name: Column1, dtype: int64

Columns 1 and 2:
       Column1  Column2
Row1        1        2
Row2        5        6
Row3        9       10
Row4       13       14
Row5       17       18

Row 0:
 Column1    1
Column2    2
Column3    3
Column4    4
Name: Row1, dtype: int64

Rows 0 and 1:
       Column1  Column2  Column3  Column4
Row1        1        2        3        4
Row2        5        6        7        8

Rows 0 and 1 (using iloc):
       Column1  Column2  Column3  Column4
Row1        1        2        3        4
Row2        5        6        7        8
Row3        9       10       11       12

Columns 1 and 2 (using iloc):
       Column2  Column3
Row1     