In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [11]:
# Report the installed pandas version so the outputs below can be reproduced
print("Pandas version: " + pd.__version__)

Pandas version: 2.3.3


#### Pandas Series

In [12]:
# Five ML course marks stored in a plain Python list
ml_marks = [50, 70, 82, 63, 38]
# Confirm the container type before converting it to a pandas object
type(ml_marks)

list

In [13]:
# Creating a pandas Series from the list.
# No index is passed, so the values are labelled with the default integers 0..4.
ml_series = pd.Series(ml_marks)
ml_series

0    50
1    70
2    82
3    63
4    38
dtype: int64

In [14]:
# The list has been wrapped in a pandas Series object
type(ml_series)

pandas.core.series.Series

In [15]:
# Same marks as before, but each value is labelled with a student name
# instead of the default 0..4 integers; the Series itself is also given a name.
ml_series_idx = pd.Series(
    data=ml_marks,
    name="ML Marks",
    index=["Vanessa", "Christine", "Peter", "Diana", "Racheal"],
)
ml_series_idx

Vanessa      50
Christine    70
Peter        82
Diana        63
Racheal      38
Name: ML Marks, dtype: int64

In [16]:
# Accessing elements
# ml_series carries the default integer labels, so label 0 and position 0 are the same entry:
#   ml_series[0] and ml_series.loc[0] look the value up by index label,
#   ml_series.iloc[0] looks it up by integer position.
print(f'The first element in "ML series" is {ml_series[0]}')
print(f'The first element in "ML series" is {ml_series.loc[0]}')
print(f'The first element in "ML series" is {ml_series.iloc[0]}')

The first element in "ML series" is 50
The first element in "ML series" is 50
The first element in "ML series" is 50


In [17]:
# Label-based access: plain [] and .loc both look up the index label "Peter"
print(f'Peter got {ml_series_idx["Peter"]} in the ML course')
print(f'Peter got {ml_series_idx.loc["Peter"]} in the ML course')
# Position-based access: Peter is the third entry, so his integer position is 2
print(f'Peter got {ml_series_idx.iloc[2]} in the ML course') # works; .iloc["Peter"] would give an error. Why?
# .iloc only accepts integer positions, never index labels

Peter got 82 in the ML course
Peter got 82 in the ML course
Peter got 82 in the ML course


#### Pandas DataFrame

In [18]:
# Build a DataFrame from a column-oriented dictionary:
# every key becomes a column name and its list supplies that column's values.
example_df = pd.DataFrame(
    data={
        "Name": ["Vanessa", "Christine", "Peter", "Diana", "Racheal"],
        "Age": [18, 20, 25, 30, 35],
        "ML mark": [50, 70, 82, 63, 38],
    }
)
example_df

Unnamed: 0,Name,Age,ML mark
0,Vanessa,18,50
1,Christine,20,70
2,Peter,25,82
3,Diana,30,63
4,Racheal,35,38


In [19]:
# Build a DataFrame from row-oriented records:
# each dictionary is one row, and its keys name the columns.
data = [
    {"Name": "Vanessa", "Age": 18, "ML mark": 50},
    {"Name": "Christine", "Age": 20, "ML mark": 70},
    {"Name": "Peter", "Age": 25, "ML mark": 82},
    {"Name": "Diana", "Age": 30, "ML mark": 63},
    {"Name": "Racheal", "Age": 35, "ML mark": 38},
]

example_df2 = pd.DataFrame(data=data)
example_df2

Unnamed: 0,Name,Age,ML mark
0,Vanessa,18,50
1,Christine,20,70
2,Peter,25,82
3,Diana,30,63
4,Racheal,35,38


In [20]:
# Build a DataFrame from two parallel lists by pairing them up row by row.
# Note: zip() stops at the shorter list, so lists of unequal length would
# silently lose rows rather than raise an error.
ml_age = [18, 20, 25, 30, 35]
example_df3 = pd.DataFrame(
    list(zip(ml_age, ml_marks)),
    columns=["Age", "ML mark"],
)
example_df3

Unnamed: 0,Age,ML mark
0,18,50
1,20,70
2,25,82
3,30,63
4,35,38


In [21]:
# dataframe properties
# shape is a (number of rows, number of columns) tuple
example_df.shape

(5, 3)

In [22]:
# column labels, converted from a pandas Index to a plain Python list
example_df.columns.tolist()

['Name', 'Age', 'ML mark']

In [30]:
# data type of each column
example_df.dtypes

Name       object
Age         int64
ML mark     int64
dtype: object

In [24]:
# summary of the frame: index range, per-column non-null counts and dtypes, memory usage
example_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Name     5 non-null      object
 1   Age      5 non-null      int64 
 2   ML mark  5 non-null      int64 
dtypes: int64(2), object(1)
memory usage: 252.0+ bytes


In [25]:
# first 2 rows (the argument to head() is the number of rows to show)
example_df.head(2)

Unnamed: 0,Name,Age,ML mark
0,Vanessa,18,50
1,Christine,20,70


In [26]:
# last 2 rows (tail() mirrors head(), counting from the end of the frame)
example_df.tail(2)

Unnamed: 0,Name,Age,ML mark
3,Diana,30,63
4,Racheal,35,38


In [27]:
# first 2 columns, selected by name; passing a list of labels inside [] returns a DataFrame
example_df[["Name", "Age"]]

Unnamed: 0,Name,Age
0,Vanessa,18
1,Christine,20
2,Peter,25
3,Diana,30
4,Racheal,35


In [37]:
# the third and fourth rows and the last 2 columns
# .iloc slices by integer position: 2:4 covers positions 2 and 3 (the end is excluded),
# and -2: counts back from the end to take the final two columns
example_df.iloc[2:4, -2:]



Unnamed: 0,Age,ML mark
2,25,82
3,30,63


In [None]:
# single column: a single label inside [] returns a Series rather than a DataFrame
example_df['Age'] # or example_df.Age

0    18
1    20
2    25
3    30
4    35
Name: Age, dtype: int64

In [None]:
# multiple columns: a list of labels inside [] returns a DataFrame with just those columns
example_df[['Name', 'Age']]

Unnamed: 0,Name,Age
0,Vanessa,18
1,Christine,20
2,Peter,25
3,Diana,30
4,Racheal,35


In [None]:
# single row by integer position: the fourth row comes back as a Series
# whose index is the frame's column names
example_df.iloc[3]

Name       Diana
Age           30
ML mark       63
Name: 3, dtype: object

In [None]:
# example_df.iloc[:3, :2]  - what does this give you?

In [None]:
# Row filtering with a boolean condition: the comparison yields one True/False
# per row, and only the rows marked True are kept.
example_df.loc[example_df['Age'] > 20]

Unnamed: 0,Name,Age,ML mark
2,Peter,25,82
3,Diana,30,63
4,Racheal,35,38


In [None]:
# Combine two conditions with & (element-wise AND). Each condition needs its own
# parentheses because & binds more tightly than the comparison operators.
# The frame is left as the cell's last expression (not wrapped in print) so it
# renders as a table like every other cell in this notebook.
example_df[(example_df['Age'] > 20) & (example_df['ML mark'] > 80)]

    Name  Age  ML mark
2  Peter   25       82


In [None]:
# Create a new column with vectorised arithmetic: 3 is added to every
# value of 'ML mark' in a single operation, with no explicit loop.
example_df['New ML mark'] = example_df['ML mark'].add(3)
example_df.head(1)

Unnamed: 0,Name,Age,ML mark,New ML mark
0,Vanessa,18,50,53


In [None]:
# using a lambda function
# .apply() calls the lambda once per value of 'Age' and collects the results into a new column
example_df['New Age'] = example_df['Age'].apply(lambda x: x+3)
example_df.head(1)

Unnamed: 0,Name,Age,ML mark,New ML mark,New Age
0,Vanessa,18,50,53,21


In [None]:
# A lambda can also hold a conditional expression:
# ages up to and including 18 are labelled 'Teenager', every other age 'Adult'
example_df['Age group'] = example_df['Age'].apply(lambda x: 'Teenager' if x<=18 else 'Adult')
example_df

Unnamed: 0,Name,Age,ML mark,New ML mark,New Age,Age group
0,Vanessa,18,50,53,21,Teenager
1,Christine,20,70,73,23,Adult
2,Peter,25,82,85,28,Adult
3,Diana,30,63,66,33,Adult
4,Racheal,35,38,41,38,Adult


### **Exercise**
a. Create a dataframe with 30 rows containing fictional house data (price, size, bedrooms, location). The dataframe must include the following columns: price (in UGX), size_sqm, bedrooms, bathrooms, location, year_built, and property_type (e.g. apartment, bungalow, mansion). Use numpy to randomly generate the numerical columns with realistic constraints; for example, larger houses should have higher prices. Set a random seed for reproducibility. Finally, display the shape, dtypes, and first 8 rows of the dataframe. <br/><br/> 
b. From the dataframe above, extract all houses that are located in either of two locations of your choice, have a price above a specified threshold, and have at least 3 bedrooms. Reset the index of the result. What is the average price of the filtered results? Which property type appears most frequently in the filtered results? <br/><br/>
c. Extract rows 5 to 14 (inclusive) and columns 2 and 3 only. With the result, rename the columns to something meaningful. Calculate the mean and standard deviation of each column in the slice. Display the row from the slice that has the highest value in the first of the two selected columns. <br/><br/>
d. Create a new column *deposit* that is 10% of the price using four different approaches and verify that they all give the same result.