## 1. Clean a dataset by converting strings to datetime objects and extracting specific date components.

In [2]:
import pandas as pd

# Sample data: order dates arrive as ISO-formatted strings
data = {
    'order_id': [1, 2, 3, 4, 5],
    'order_date': ['2024-06-01', '2024-07-15', '2024-08-10', '2024-09-05', '2024-10-20'],
}

# Build the frame and parse the date strings into datetime values in one step
df = pd.DataFrame(data).assign(
    order_date=lambda frame: pd.to_datetime(frame['order_date'])
)

# Copy each calendar component out of the parsed dates via the .dt accessor
for part in ('year', 'month', 'day'):
    df[part] = getattr(df['order_date'].dt, part)

# Print a caption; the bare expression on the last line renders the table
print("Transformed DataFrame:")
df

Transformed DataFrame:


Unnamed: 0,order_id,order_date,year,month,day
0,1,2024-06-01,2024,6,1
1,2,2024-07-15,2024,7,15
2,3,2024-08-10,2024,8,10
3,4,2024-09-05,2024,9,5
4,5,2024-10-20,2024,10,20


##  2. Calculate the difference in days between two date columns.

In [4]:
import pandas as pd

# Sample data: every event carries a start and an end date stored as strings
data = {
    'event_id': [1, 2, 3, 4, 5],
    'start_date': ['2024-06-01', '2024-07-15', '2024-08-10', '2024-09-05', '2024-10-20'],
    'end_date': ['2024-06-10', '2024-07-20', '2024-08-15', '2024-09-10', '2024-10-25'],
}

df = pd.DataFrame(data)

# Parse both date columns so they support datetime arithmetic
for column in ('start_date', 'end_date'):
    df[column] = pd.to_datetime(df[column])

# Subtracting two datetime columns yields timedeltas; keep only the whole-day count
elapsed = df['end_date'] - df['start_date']
df['duration_days'] = elapsed.dt.days

# Print a caption; the bare expression on the last line renders the table
print("Transformed DataFrame:")
df


Transformed DataFrame:


Unnamed: 0,event_id,start_date,end_date,duration_days
0,1,2024-06-01,2024-06-10,9
1,2,2024-07-15,2024-07-20,5
2,3,2024-08-10,2024-08-15,5
3,4,2024-09-05,2024-09-10,5
4,5,2024-10-20,2024-10-25,5


## 3. Filter a DataFrame to include only rows where the date falls within a specified range.

In [6]:
import pandas as pd

# Sample data: five transactions spread over June-August 2024
data = {
    'transaction_id': [101, 102, 103, 104, 105],
    'transaction_date': ['2024-06-01', '2024-06-15', '2024-07-01', '2024-07-15', '2024-08-01'],
    'amount': [100, 200, 150, 300, 250],
}

df = pd.DataFrame(data)

# Parse the date strings so they can be compared chronologically
df['transaction_date'] = pd.to_datetime(df['transaction_date'])

# Bounds of the window of interest (both ends are included in the result)
start_date = '2024-06-01'
end_date = '2024-07-01'

# Boolean mask: True where start_date <= transaction_date <= end_date
in_range = df['transaction_date'].between(start_date, end_date)
filtered_df = df[in_range]

# Print a caption; the bare expression on the last line renders the table
print("Filtered DataFrame:")
filtered_df


Filtered DataFrame:


Unnamed: 0,transaction_id,transaction_date,amount
0,101,2024-06-01,100
1,102,2024-06-15,200
2,103,2024-07-01,150


## 4. Group a DataFrame by month and summarize the total amount for each month.

In [7]:
import pandas as pd

# Sample data: seven transactions spread over June-September 2024
data = {
    'transaction_id': [101, 102, 103, 104, 105, 106, 107],
    'transaction_date': ['2024-06-01', '2024-06-15', '2024-07-01', '2024-07-15', '2024-08-01', '2024-08-15', '2024-09-01'],
    'amount': [100, 200, 150, 300, 250, 350, 400],
}

# Build the frame and parse the date strings in one step
df = pd.DataFrame(data).assign(
    transaction_date=lambda frame: pd.to_datetime(frame['transaction_date'])
)

# Label every row with its calendar month as a Period (e.g. 2024-06)
df['month'] = df['transaction_date'].dt.to_period('M')

# Total the amounts per month, then turn the month index back into a column
amount_by_month = df.groupby('month')['amount'].sum()
monthly_summary = amount_by_month.reset_index()

# Print a caption; the bare expression on the last line renders the table
print("Monthly Summary:")
monthly_summary


Monthly Summary:


Unnamed: 0,month,amount
0,2024-06,300
1,2024-07,450
2,2024-08,600
3,2024-09,400


## 5. Calculate a rolling average of transaction amounts over a fixed window of consecutive transactions.

In [8]:
import pandas as pd

# Sample data: seven transactions roughly two weeks apart
data = {
    'transaction_id': [101, 102, 103, 104, 105, 106, 107],
    'transaction_date': ['2024-06-01', '2024-06-15', '2024-07-01', '2024-07-15', '2024-08-01', '2024-08-15', '2024-09-01'],
    'amount': [100, 200, 150, 300, 250, 350, 400]
}

# Creating DataFrame
df = pd.DataFrame(data)

# Convert 'transaction_date' to datetime
df['transaction_date'] = pd.to_datetime(df['transaction_date'])

# Use 'transaction_date' as the index (plain assignment instead of inplace=True,
# so the statement reads as an ordinary transformation)
df = df.set_index('transaction_date')

# Rolling mean over the current row and the two rows before it.
# NOTE: an integer window counts observations, not calendar days, so this is a
# 3-transaction average. The first two rows have fewer than three observations
# and are therefore NaN. For a calendar-based window, pass an offset string
# such as rolling('3D') instead.
df['rolling_avg'] = df['amount'].rolling(window=3).mean()

# Display the DataFrame with the rolling average
print("DataFrame with Rolling Average:")
df


DataFrame with Rolling Average:


Unnamed: 0_level_0,transaction_id,amount,rolling_avg
transaction_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-06-01,101,100,
2024-06-15,102,200,
2024-07-01,103,150,150.0
2024-07-15,104,300,216.666667
2024-08-01,105,250,233.333333
2024-08-15,106,350,300.0
2024-09-01,107,400,333.333333


## 6. Resample a DataFrame to aggregate data by month.

In [10]:
import pandas as pd

# Sample data
data = {
    'transaction_id': [101, 102, 103, 104, 105, 106, 107],
    'transaction_date': ['2024-06-01', '2024-06-15', '2024-07-01', '2024-07-15', '2024-08-01', '2024-08-15', '2024-09-01'],
    'amount': [100, 200, 150, 300, 250, 350, 400]
}

# Creating DataFrame
df = pd.DataFrame(data)

# Convert 'transaction_date' to datetime
df['transaction_date'] = pd.to_datetime(df['transaction_date'])

# Set 'transaction_date' as the index
df.set_index('transaction_date', inplace=True)

# Resample data to aggregate the total amount by month
monthly_aggregation = df.resample('ME').sum()

# Display the resampled DataFrame
print("Monthly Aggregated DataFrame:")
monthly_aggregation


Monthly Aggregated DataFrame:


Unnamed: 0_level_0,transaction_id,amount
transaction_date,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-06-30,203,300
2024-07-31,207,450
2024-08-31,211,600
2024-09-30,107,400


## 7. Create lagged features for time series data to use in predictive modeling.



In [11]:
import pandas as pd

# Sample data: seven transactions in chronological order
data = {
    'transaction_id': [101, 102, 103, 104, 105, 106, 107],
    'transaction_date': ['2024-06-01', '2024-06-15', '2024-07-01', '2024-07-15', '2024-08-01', '2024-08-15', '2024-09-01'],
    'amount': [100, 200, 150, 300, 250, 350, 400],
}

df = pd.DataFrame(data)

# Parse the date strings, then promote the dates to the index
df['transaction_date'] = pd.to_datetime(df['transaction_date'])
df = df.set_index('transaction_date')

# Lag-1 feature: each row receives the amount of the row directly before it.
# The first row has no predecessor, so it becomes NaN (and the column float).
previous_amount = df['amount'].shift(periods=1)
df['amount_lagged'] = previous_amount

# Print a caption; the bare expression on the last line renders the table
print("DataFrame with Lagged Feature:")
df


DataFrame with Lagged Feature:


Unnamed: 0_level_0,transaction_id,amount,amount_lagged
transaction_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2024-06-01,101,100,
2024-06-15,102,200,100.0
2024-07-01,103,150,200.0
2024-07-15,104,300,150.0
2024-08-01,105,250,300.0
2024-08-15,106,350,250.0
2024-09-01,107,400,350.0


## 8. Handle missing dates in a time series and impute missing values.

In [13]:
import pandas as pd

# Sample data: only five dates are present between 2024-06-01 and 2024-08-01
data = {
    'transaction_id': [101, 102, 103, 104, 105],
    'transaction_date': ['2024-06-01', '2024-06-15', '2024-07-01', '2024-07-15', '2024-08-01'],
    'amount': [100, 200, 150, 300, 250],
}

df = pd.DataFrame(data)

# Parse the date strings, then promote the dates to the index
df['transaction_date'] = pd.to_datetime(df['transaction_date'])
df = df.set_index('transaction_date')

# One entry per calendar day from the first to the last observed date
first_day = df.index.min()
last_day = df.index.max()
full_range = pd.date_range(first_day, last_day, freq='D')

# Reindexing inserts a row (all NaN) for every day that had no transaction
df = df.reindex(full_range)

# Carry the last known amount forward into the gaps; 'transaction_id' is left
# as NaN on the inserted rows
df['amount'] = df['amount'].ffill()

# Print a caption; the bare expression on the last line renders the table
print("DataFrame with Missing Dates Handled:")
df


DataFrame with Missing Dates Handled:


Unnamed: 0,transaction_id,amount
2024-06-01,101.0,100.0
2024-06-02,,100.0
2024-06-03,,100.0
2024-06-04,,100.0
2024-06-05,,100.0
...,...,...
2024-07-28,,300.0
2024-07-29,,300.0
2024-07-30,,300.0
2024-07-31,,300.0


## 9. Convert a DataFrame's date column to UTC and then to a specific time zone.

In [14]:
import pandas as pd

# Sample data: timestamps are plain strings with no time-zone information
data = {
    'transaction_id': [101, 102, 103, 104, 105],
    'transaction_date': ['2024-06-01 10:00:00', '2024-06-15 12:00:00', '2024-07-01 15:00:00', '2024-07-15 09:00:00', '2024-08-01 17:00:00'],
}

df = pd.DataFrame(data)

# Parse the strings into (time-zone naive) datetime values
df['transaction_date'] = pd.to_datetime(df['transaction_date'])

# tz_localize attaches a zone without shifting the clock time: the naive
# values are declared to be UTC
as_utc = df['transaction_date'].dt.tz_localize('UTC')
df['transaction_date_utc'] = as_utc

# tz_convert keeps the instant and re-expresses it in the US/Eastern zone
df['transaction_date_est'] = as_utc.dt.tz_convert('US/Eastern')

# Print a caption; the bare expression on the last line renders the table
print("DataFrame with Time Zone Changes:")
df


DataFrame with Time Zone Changes:


Unnamed: 0,transaction_id,transaction_date,transaction_date_utc,transaction_date_est
0,101,2024-06-01 10:00:00,2024-06-01 10:00:00+00:00,2024-06-01 06:00:00-04:00
1,102,2024-06-15 12:00:00,2024-06-15 12:00:00+00:00,2024-06-15 08:00:00-04:00
2,103,2024-07-01 15:00:00,2024-07-01 15:00:00+00:00,2024-07-01 11:00:00-04:00
3,104,2024-07-15 09:00:00,2024-07-15 09:00:00+00:00,2024-07-15 05:00:00-04:00
4,105,2024-08-01 17:00:00,2024-08-01 17:00:00+00:00,2024-08-01 13:00:00-04:00


## 10. Convert Unix Epoch time to datetime and perform operations using time deltas.

In [15]:
import pandas as pd

# Sample data: event times stored as seconds since the Unix epoch
data = {
    'event_id': [1, 2, 3, 4, 5],
    'event_timestamp': [1716115200, 1717228800, 1718342400, 1719456000, 1720569600],
}

df = pd.DataFrame(data)

# unit='s' tells pandas the integers are seconds (not the default nanoseconds)
epoch_seconds = df['event_timestamp']
df['event_date'] = pd.to_datetime(epoch_seconds, unit='s')

# Reference point that every event is measured against
specific_date = pd.to_datetime('2024-06-01')

# Signed offset of each event from the reference date (negative = before it)
df['time_delta'] = df['event_date'].sub(specific_date)

# Print a caption; the bare expression on the last line renders the table
print("DataFrame with DateTime and Time Delta:")
df

DataFrame with DateTime and Time Delta:


Unnamed: 0,event_id,event_timestamp,event_date,time_delta
0,1,1716115200,2024-05-19 10:40:00,-13 days +10:40:00
1,2,1717228800,2024-06-01 08:00:00,0 days 08:00:00
2,3,1718342400,2024-06-14 05:20:00,13 days 05:20:00
3,4,1719456000,2024-06-27 02:40:00,26 days 02:40:00
4,5,1720569600,2024-07-10 00:00:00,39 days 00:00:00


## 11. Use date offsets to manipulate and analyze time series data.

In [16]:
import pandas as pd

# Sample data: five transactions spread over June-August 2024
data = {
    'transaction_id': [101, 102, 103, 104, 105],
    'transaction_date': ['2024-06-01', '2024-06-15', '2024-07-01', '2024-07-15', '2024-08-01'],
    'amount': [100, 200, 150, 300, 250],
}

df = pd.DataFrame(data)

# Parse the date strings so offsets can be applied to them
df['transaction_date'] = pd.to_datetime(df['transaction_date'])

# Calendar-aware offsets: a month offset moves to the same day of the next month
one_month = pd.DateOffset(months=1)
fifteen_days = pd.DateOffset(days=15)

# Shift every date forward by one month and, separately, back by fifteen days
df['transaction_date_plus_1m'] = df['transaction_date'] + one_month
df['transaction_date_minus_15d'] = df['transaction_date'] - fifteen_days

# Print a caption; the bare expression on the last line renders the table
print("DataFrame with Date Offsets:")
df

DataFrame with Date Offsets:


Unnamed: 0,transaction_id,transaction_date,amount,transaction_date_plus_1m,transaction_date_minus_15d
0,101,2024-06-01,100,2024-07-01,2024-05-17
1,102,2024-06-15,200,2024-07-15,2024-05-31
2,103,2024-07-01,150,2024-08-01,2024-06-16
3,104,2024-07-15,300,2024-08-15,2024-06-30
4,105,2024-08-01,250,2024-09-01,2024-07-17


## 12. Generate a range of dates and create a time series dataset.

In [17]:
import pandas as pd
import numpy as np

# Ten consecutive calendar days: 2024-06-01 through 2024-06-10 inclusive
date_range = pd.date_range(start='2024-06-01', end='2024-06-10', freq='D')

# One row per generated date
df = pd.DataFrame({'date': date_range})

# Fix the global NumPy seed so the "random" column is identical on every run,
# then draw one integer in [1, 100) per row
np.random.seed(0)
row_count = len(df)
df['random_data'] = np.random.randint(1, 100, size=row_count)

# Print a caption; the bare expression on the last line renders the table
print("DataFrame with Date Range and Random Data:")
df


DataFrame with Date Range and Random Data:


Unnamed: 0,date,random_data
0,2024-06-01,45
1,2024-06-02,48
2,2024-06-03,65
3,2024-06-04,68
4,2024-06-05,68
5,2024-06-06,10
6,2024-06-07,84
7,2024-06-08,22
8,2024-06-09,37
9,2024-06-10,88


# CHALLENGE: Analyzing Sales Data with Date and Time Functions

Analyze a sales dataset by applying multiple date and time functions, including converting date strings, calculating time differences, filtering by date range, resampling data, and generating date ranges for forecasting.

**Instructions:**

* Convert Date Strings to Datetime Objects: You have a dataset with sales transactions. First, convert the sale_date column from string format to datetime objects.
* Calculate Days Since First Sale: Calculate the number of days since the first sale for each transaction.
* Filter Sales Data: Filter the dataset to include only transactions from the 30 days leading up to (and including) the most recent sale in the dataset.
* Resample Data by Week: Aggregate the total sales amount by week.
* Generate Future Dates for Forecasting: Create a future date range for the next 4 weeks and simulate future sales data based on historical trends.

In [19]:
import pandas as pd
import numpy as np

# Sample sales data: ten sales between 2024-06-01 and 2024-07-15
data = {
    'sale_id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'sale_date': ['2024-06-01', '2024-06-05', '2024-06-10', '2024-06-15', '2024-06-20',
                  '2024-06-25', '2024-07-01', '2024-07-05', '2024-07-10', '2024-07-15'],
    'amount': [100, 150, 200, 250, 300, 350, 400, 450, 500, 550],
}

df = pd.DataFrame(data)

# Task 1: parse the sale dates
df['sale_date'] = pd.to_datetime(df['sale_date'])

# Task 2: whole days elapsed between the earliest sale and each sale
first_sale = df['sale_date'].min()
df['days_since_first_sale'] = (df['sale_date'] - first_sale).dt.days

# Task 3: keep sales in the 30 days up to the most recent sale (both ends included)
end_date = df['sale_date'].max()
start_date = end_date - pd.DateOffset(days=30)
recent_mask = df['sale_date'].between(start_date, end_date)
filtered_df = df[recent_mask]

# Task 4: weekly totals; 'W-Mon' buckets are weeks that end on a Monday
weekly_totals = filtered_df.resample('W-Mon', on='sale_date')['amount'].sum()
weekly_sales = weekly_totals.reset_index().sort_values('sale_date')

# Task 5: four more Monday-ending weeks, starting one week after the last bucket
first_future_week = weekly_sales['sale_date'].max() + pd.DateOffset(weeks=1)
future_dates = pd.date_range(start=first_future_week, periods=4, freq='W-Mon')
future_sales = pd.DataFrame({'sale_date': future_dates})
# Flat projection: every future week gets the mean of the historical weekly totals
future_sales['amount'] = weekly_sales['amount'].mean()

# Stack history on top of the projection with a fresh 0..n-1 index
combined_sales = pd.concat([weekly_sales, future_sales], ignore_index=True)

# Print a caption; the bare expression on the last line renders the table
print("Combined Sales Data (Historical + Future):")
combined_sales


Combined Sales Data (Historical + Future):


Unnamed: 0,sale_date,amount
0,2024-06-17,250.0
1,2024-06-24,300.0
2,2024-07-01,750.0
3,2024-07-08,450.0
4,2024-07-15,1050.0
5,2024-07-22,560.0
6,2024-07-29,560.0
7,2024-08-05,560.0
8,2024-08-12,560.0


# Challenge: Comprehensive Sales Data Analysis and Feature Engineering
Perform a comprehensive analysis on a sales dataset, combining multiple Pandas techniques learned throughout the course. This includes date and time manipulation, merging and concatenating DataFrames, filtering, grouping, handling missing data, and feature engineering.

**Instructions:**

1. **Load Sales and Customer Data**: You have two datasets, one with sales transactions and another with customer information. Load both datasets into separate DataFrames.
2. **Merge DataFrames**: Merge the sales and customer DataFrames on a common key (e.g., `customer_id`).
3. **Convert Date Strings to Datetime Objects**: Convert the `sale_date` column from string format to datetime objects.
4. **Calculate Sales Metrics**: Calculate the total sales amount for each customer and each product.
5. **Handle Missing Data**: Identify missing values in the `amount` column and use interpolation to fill them.
6. **Feature Engineering**: Create new features such as the year, month, and quarter from the `sale_date` column. Also, split the `customer_name` column into `first_name` and `last_name`.
7. **Group and Filter Data**: Group the data by customer and keep only the customers whose total sales exceed a certain threshold.
8. **Resample and Concatenate Data**: Resample the data to get weekly sales totals and concatenate it with future sales projections.
9. **Final Output**: Provide a summary DataFrame with all the calculations and features.

In [27]:
import pandas as pd
import numpy as np

# Sample sales data
sales_data = {
    'sale_id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'customer_id': [1, 2, 1, 3, 2, 1, 3, 2, 1, 3],
    'sale_date': ['2024-06-01', '2024-06-05', '2024-06-10', '2024-06-15', '2024-06-20',
                  '2024-06-25', '2024-07-01', '2024-07-05', '2024-07-10', '2024-07-15'],
    'product': ['A', 'B', 'A', 'B', 'A', 'B', 'A', 'B', 'A', 'B'],
    'amount': [100, np.nan, 200, 250, 300, np.nan, 400, 450, 500, 550]
}

# Sample customer data
customer_data = {
    'customer_id': [1, 2, 3],
    'customer_name': ['John Doe', 'Jane Smith', 'Bob Johnson']
}

# Creating DataFrames
sales_df = pd.DataFrame(sales_data)
customer_df = pd.DataFrame(customer_data)

# Task 1: Merge sales_df and customer_df on 'customer_id'
merged_df = pd.merge(sales_df, customer_df, on='customer_id')

# Task 2: Convert 'sale_date' to datetime
merged_df['sale_date'] = pd.to_datetime(merged_df['sale_date'])

# Task 3: Calculate total sales amount for each customer and each product
total_sales_by_customer = merged_df.groupby('customer_name')['amount'].sum().reset_index()
total_sales_by_product = merged_df.groupby('product')['amount'].sum().reset_index()

# Task 4: Handle missing data in 'amount' using interpolation
merged_df['amount'] = merged_df['amount'].interpolate()

# Task 5: Create new features like year, month, and quarter from 'sale_date'
merged_df['year'] = merged_df['sale_date'].dt.year
merged_df['month'] = merged_df['sale_date'].dt.month
merged_df['quarter'] = merged_df['sale_date'].dt.quarter

# Task 6: Split 'customer_name' into 'first_name' and 'last_name'
merged_df[['first_name', 'last_name']] = merged_df['customer_name'].str.split(' ', expand=True)

# Task 7: Group data by customer and filter customers with total sales above a threshold
threshold = 500
customer_sales = merged_df.groupby('customer_name')['amount'].sum().reset_index()
filtered_customers = customer_sales[customer_sales['amount'] > threshold]

# Task 8: Resample data to get weekly sales totals and concatenate with future sales projections
weekly_sales = merged_df.resample('W-Mon', on='sale_date')['amount'].sum().reset_index().sort_values('sale_date')

# Generate future dates for the next 4 weeks and simulate future sales
future_dates = pd.date_range(start=weekly_sales['sale_date'].max() + pd.DateOffset(weeks=1), periods=4, freq='W-Mon')
future_sales = pd.DataFrame(future_dates, columns=['sale_date'])
future_sales['amount'] = weekly_sales['amount'].mean()

# Combine historical and future sales
combined_sales = pd.concat([weekly_sales, future_sales]).reset_index(drop=True)

# Task 9: Provide a summary DataFrame with all calculations and features
summary_df = merged_df[['customer_id', 'first_name', 'last_name', 'sale_date', 'product', 'amount', 'year', 'month', 'quarter']]

# Display the summary DataFrame
print("Summary DataFrame:")
display(summary_df)

# Display the filtered customers
print("\nFiltered Customers (Total Sales > 500):")
display(filtered_customers)

# Display the combined sales data (historical + future)
print("\nCombined Sales Data (Historical + Future):")
display(combined_sales)


Summary DataFrame:


Unnamed: 0,customer_id,first_name,last_name,sale_date,product,amount,year,month,quarter
0,1,John,Doe,2024-06-01,A,100.0,2024,6,2
1,2,Jane,Smith,2024-06-05,B,150.0,2024,6,2
2,1,John,Doe,2024-06-10,A,200.0,2024,6,2
3,3,Bob,Johnson,2024-06-15,B,250.0,2024,6,2
4,2,Jane,Smith,2024-06-20,A,300.0,2024,6,2
5,1,John,Doe,2024-06-25,B,350.0,2024,6,2
6,3,Bob,Johnson,2024-07-01,A,400.0,2024,7,3
7,2,Jane,Smith,2024-07-05,B,450.0,2024,7,3
8,1,John,Doe,2024-07-10,A,500.0,2024,7,3
9,3,Bob,Johnson,2024-07-15,B,550.0,2024,7,3



Filtered Customers (Total Sales > 500):


Unnamed: 0,customer_name,amount
0,Bob Johnson,1200.0
1,Jane Smith,900.0
2,John Doe,1150.0



Combined Sales Data (Historical + Future):


Unnamed: 0,sale_date,amount
0,2024-06-03,100.0
1,2024-06-10,350.0
2,2024-06-17,250.0
3,2024-06-24,300.0
4,2024-07-01,750.0
5,2024-07-08,450.0
6,2024-07-15,1050.0
7,2024-07-22,464.285714
8,2024-07-29,464.285714
9,2024-08-05,464.285714
