## 1. Clean a dataset by converting strings to datetime objects and extracting specific date components.

In [12]:
import pandas as pd

# Sample data
data = {
    'order_id': [1, 2, 3, 4, 5],
    'order_date': ['2024-06-01', '2024-07-15', '2024-08-10', '2024-09-05', '2024-10-20']
}

# Creating DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print("Original DataFrame:")

# Task: Convert 'order_date' to datetime and extract year, month, and day into separate columns
df['order_date'] = pd.to_datetime(df['order_date'])
df['year'] = df['order_date'].dt.year
df['month'] = df['order_date'].dt.month
df['day'] = df['order_date'].dt.day
df

Original DataFrame:


Unnamed: 0,order_id,order_date,year,month,day
0,1,2024-06-01,2024,6,1
1,2,2024-07-15,2024,7,15
2,3,2024-08-10,2024,8,10
3,4,2024-09-05,2024,9,5
4,5,2024-10-20,2024,10,20


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   order_id    5 non-null      int64         
 1   order_date  5 non-null      datetime64[ns]
 2   year        5 non-null      int32         
 3   month       5 non-null      int32         
 4   day         5 non-null      int32         
dtypes: datetime64[ns](1), int32(3), int64(1)
memory usage: 272.0 bytes


##  2. Calculate the difference in days between two date columns.

In [17]:
import pandas as pd

# Sample data
data = {
    'event_id': [1, 2, 3, 4, 5],
    'start_date': ['2024-06-01', '2024-07-15', '2024-08-10', '2024-09-05', '2024-10-20'],
    'end_date': ['2024-06-10', '2024-07-20', '2024-08-15', '2024-09-10', '2024-10-25']
}

# Creating DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print("Original DataFrame:")

# Task: Convert 'start_date' and 'end_date' to datetime and calculate the difference in days
df['start_date'] = pd.to_datetime(df['start_date'])
df['end_date'] = pd.to_datetime(df['end_date'])
df['duration_days'] = (df['end_date'] - df['start_date']).dt.days
df

Original DataFrame:


Unnamed: 0,event_id,start_date,end_date,duration_days
0,1,2024-06-01,2024-06-10,9
1,2,2024-07-15,2024-07-20,5
2,3,2024-08-10,2024-08-15,5
3,4,2024-09-05,2024-09-10,5
4,5,2024-10-20,2024-10-25,5


## 3. Filter a DataFrame to include only rows where the date falls within a specified range.

In [21]:
import pandas as pd

# Sample data
data = {
    'transaction_id': [101, 102, 103, 104, 105],
    'transaction_date': ['2024-06-01', '2024-06-15', '2024-07-01', '2024-07-15', '2024-08-01'],
    'amount': [100, 200, 150, 300, 250]
}

# Creating DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print("Original DataFrame:")
print(df)

# Task: Convert 'transaction_date' to datetime and filter transactions between '2024-06-01' and '2024-07-01'
df['transaction_date'] = pd.to_datetime(df['transaction_date'])
mask = (df['transaction_date'] >= '2024-06-01') & (df['transaction_date'] <= '2024-07-01')
df.loc[mask]

Original DataFrame:
   transaction_id transaction_date  amount
0             101       2024-06-01     100
1             102       2024-06-15     200
2             103       2024-07-01     150
3             104       2024-07-15     300
4             105       2024-08-01     250


Unnamed: 0,transaction_id,transaction_date,amount
0,101,2024-06-01,100
1,102,2024-06-15,200
2,103,2024-07-01,150


## 4. Group a DataFrame by month and summarize the total amount for each month.

In [25]:
import pandas as pd

# Sample data
data = {
    'transaction_id': [101, 102, 103, 104, 105, 106, 107],
    'transaction_date': ['2024-06-01', '2024-06-15', '2024-07-01', '2024-07-15', '2024-08-01', '2024-08-15', '2024-09-01'],
    'amount': [100, 200, 150, 300, 250, 350, 400]
}

# Creating DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print("Original DataFrame:")
print(df)

# Task: Convert 'transaction_date' to datetime, extract the month, and summarize the total amount for each month
df['transaction_date'] = pd.to_datetime(df['transaction_date'])
df['month'] = df['transaction_date'].dt.strftime('%Y-%m')
pd.DataFrame(df.groupby('month')['amount'].sum().reset_index())

Original DataFrame:
   transaction_id transaction_date  amount
0             101       2024-06-01     100
1             102       2024-06-15     200
2             103       2024-07-01     150
3             104       2024-07-15     300
4             105       2024-08-01     250
5             106       2024-08-15     350
6             107       2024-09-01     400


Unnamed: 0,month,amount
0,2024-06,300
1,2024-07,450
2,2024-08,600
3,2024-09,400


## 5. Calculate a rolling average of transaction amounts over a specified window of days.

In [29]:
import pandas as pd

# Sample data
data = {
    'transaction_id': [101, 102, 103, 104, 105, 106, 107],
    'transaction_date': ['2024-06-01', '2024-06-15', '2024-07-01', '2024-07-15', '2024-08-01', '2024-08-15', '2024-09-01'],
    'amount': [100, 200, 150, 300, 250, 350, 400]
}

# Creating DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print("Original DataFrame:")
print(df)

# Task: Convert 'transaction_date' to datetime and calculate a 3-day rolling average of the 'amount' column
df['transaction_date'] = pd.to_datetime(df['transaction_date'])
df['rolling_avg'] = df['amount'].rolling(3).mean()
df

Original DataFrame:
   transaction_id transaction_date  amount
0             101       2024-06-01     100
1             102       2024-06-15     200
2             103       2024-07-01     150
3             104       2024-07-15     300
4             105       2024-08-01     250
5             106       2024-08-15     350
6             107       2024-09-01     400


Unnamed: 0,transaction_id,transaction_date,amount,rolling_avg
0,101,2024-06-01,100,
1,102,2024-06-15,200,
2,103,2024-07-01,150,150.0
3,104,2024-07-15,300,216.666667
4,105,2024-08-01,250,233.333333
5,106,2024-08-15,350,300.0
6,107,2024-09-01,400,333.333333


## 6. Resample a DataFrame to aggregate data by month.

In [34]:
import pandas as pd

# Sample data
data = {
    'transaction_id': [101, 102, 103, 104, 105, 106, 107],
    'transaction_date': ['2024-06-01', '2024-06-15', '2024-07-01', '2024-07-15', '2024-08-01', '2024-08-15', '2024-09-01'],
    'amount': [100, 200, 150, 300, 250, 350, 400]
}

# Creating DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print("Original DataFrame:")
print(df)

# Task: Convert 'transaction_date' to datetime, set it as index, and resample data to aggregate the total amount by month
df['transaction_date'] = pd.to_datetime(df['transaction_date'])
grouper = pd.Grouper(key='transaction_date', freq='ME')
df = df.groupby(grouper).agg({'transaction_id':'sum', 'amount':'sum'})
df

Original DataFrame:
   transaction_id transaction_date  amount
0             101       2024-06-01     100
1             102       2024-06-15     200
2             103       2024-07-01     150
3             104       2024-07-15     300
4             105       2024-08-01     250
5             106       2024-08-15     350
6             107       2024-09-01     400


Unnamed: 0_level_0,transaction_id,amount
transaction_date,Unnamed: 1_level_1,Unnamed: 2_level_1
2024-06-30,203,300
2024-07-31,207,450
2024-08-31,211,600
2024-09-30,107,400


## 7. Create lagged features for time series data to use in predictive modeling.



In [35]:
import pandas as pd

# Sample data
data = {
    'transaction_id': [101, 102, 103, 104, 105, 106, 107],
    'transaction_date': ['2024-06-01', '2024-06-15', '2024-07-01', '2024-07-15', '2024-08-01', '2024-08-15', '2024-09-01'],
    'amount': [100, 200, 150, 300, 250, 350, 400]
}

# Creating DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print("Original DataFrame:")
print(df)

# Task: Convert 'transaction_date' to datetime, set it as index, and create a lagged feature for 'amount'
df['transaction_date'] = pd.to_datetime(df['transaction_date'])
df['amount_lagged'] = df['amount'].shift(1)
df

Original DataFrame:
   transaction_id transaction_date  amount
0             101       2024-06-01     100
1             102       2024-06-15     200
2             103       2024-07-01     150
3             104       2024-07-15     300
4             105       2024-08-01     250
5             106       2024-08-15     350
6             107       2024-09-01     400


Unnamed: 0,transaction_id,transaction_date,amount,amount_lagged
0,101,2024-06-01,100,
1,102,2024-06-15,200,100.0
2,103,2024-07-01,150,200.0
3,104,2024-07-15,300,150.0
4,105,2024-08-01,250,300.0
5,106,2024-08-15,350,250.0
6,107,2024-09-01,400,350.0


## 8. Handle missing dates in a time series and impute missing values.

In [77]:
import pandas as pd

# Sample data with missing dates
data = {
    'transaction_id': [101, 102, 103, 104, 105],
    'transaction_date': ['2024-06-01', '2024-06-15', '2024-07-01', '2024-07-15', '2024-08-01'],
    'amount': [100, 200, 150, 300, 250]
}

# Creating DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print("Original DataFrame:")
print(df)

# Task: Convert 'transaction_date' to datetime, set it as index, reindex to fill missing dates, and impute missing values
df['transaction_date'] = pd.to_datetime(df['transaction_date'], format = "%Y-%m-%d")
dates = pd.date_range(df['transaction_date'].min(), df['transaction_date'].max())
df.set_index('transaction_date', inplace=True)
df = df.reindex(dates)
df['amount'] = df['amount'].ffill()
df

Original DataFrame:
   transaction_id transaction_date  amount
0             101       2024-06-01     100
1             102       2024-06-15     200
2             103       2024-07-01     150
3             104       2024-07-15     300
4             105       2024-08-01     250


Unnamed: 0,transaction_id,amount
2024-06-01,101.0,100.0
2024-06-02,,100.0
2024-06-03,,100.0
2024-06-04,,100.0
2024-06-05,,100.0
...,...,...
2024-07-28,,300.0
2024-07-29,,300.0
2024-07-30,,300.0
2024-07-31,,300.0


## 9. Convert a DataFrame's date column to UTC and then to a specific time zone.

In [44]:
import pandas as pd

# Sample data with time zones
data = {
    'transaction_id': [101, 102, 103, 104, 105],
    'transaction_date': ['2024-06-01 10:00:00', '2024-06-15 12:00:00', '2024-07-01 15:00:00', '2024-07-15 09:00:00', '2024-08-01 17:00:00']
}

# Creating DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print("Original DataFrame:")
print(df)

# Task: Convert 'transaction_date' to datetime, set to UTC, and convert to a specific time zone (e.g., US/Eastern)
df['transaction_date'] = pd.to_datetime(df['transaction_date'])
df['transaction_date_utc'] = df['transaction_date'].dt.tz_localize('UTC')
df['transaction_date_est'] = df['transaction_date'].dt.tz_localize('UTC').dt.tz_convert('US/Eastern')
df

Original DataFrame:
   transaction_id     transaction_date
0             101  2024-06-01 10:00:00
1             102  2024-06-15 12:00:00
2             103  2024-07-01 15:00:00
3             104  2024-07-15 09:00:00
4             105  2024-08-01 17:00:00


Unnamed: 0,transaction_id,transaction_date,transaction_date_utc,transaction_date_est
0,101,2024-06-01 10:00:00,2024-06-01 10:00:00+00:00,2024-06-01 06:00:00-04:00
1,102,2024-06-15 12:00:00,2024-06-15 12:00:00+00:00,2024-06-15 08:00:00-04:00
2,103,2024-07-01 15:00:00,2024-07-01 15:00:00+00:00,2024-07-01 11:00:00-04:00
3,104,2024-07-15 09:00:00,2024-07-15 09:00:00+00:00,2024-07-15 05:00:00-04:00
4,105,2024-08-01 17:00:00,2024-08-01 17:00:00+00:00,2024-08-01 13:00:00-04:00


## 10. Convert Unix Epoch time to datetime and perform operations using time deltas.

In [60]:
import pandas as pd
import datetime as dt
# Sample data with Unix Epoch time
data = {
    'event_id': [1, 2, 3, 4, 5],
    'event_timestamp': [1716115200, 1717228800, 1718342400, 1719456000, 1720569600]  # Unix Epoch time
}

# Creating DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print("Original DataFrame:")
print(df)

# Task: Convert 'event_timestamp' to datetime and calculate time delta to a specific date (e.g., '2024-06-01')
df['event_date'] = pd.to_datetime(df['event_timestamp'], unit='s')
df['time_delta'] = (df['event_date'] -  pd.Timestamp("2024-06-01"))
df

Original DataFrame:
   event_id  event_timestamp
0         1       1716115200
1         2       1717228800
2         3       1718342400
3         4       1719456000
4         5       1720569600


Unnamed: 0,event_id,event_timestamp,event_date,time_delta
0,1,1716115200,2024-05-19 10:40:00,-13 days +10:40:00
1,2,1717228800,2024-06-01 08:00:00,0 days 08:00:00
2,3,1718342400,2024-06-14 05:20:00,13 days 05:20:00
3,4,1719456000,2024-06-27 02:40:00,26 days 02:40:00
4,5,1720569600,2024-07-10 00:00:00,39 days 00:00:00


## 11. Use date offsets to manipulate and analyze time series data.

In [65]:
import pandas as pd

# Sample data
data = {
    'transaction_id': [101, 102, 103, 104, 105],
    'transaction_date': ['2024-06-01', '2024-06-15', '2024-07-01', '2024-07-15', '2024-08-01'],
    'amount': [100, 200, 150, 300, 250]
}

# Creating DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print("Original DataFrame:")
print(df)

# Task: Convert 'transaction_date' to datetime and use date offsets to add and subtract time periods
df['transaction_date'] = pd.to_datetime(df['transaction_date'])
df['transaction_date_plus_1m'] = df['transaction_date'] + pd.DateOffset(months=1)
df['transaction_date_minus_15d'] = df['transaction_date'] - pd.Timedelta(days=15)
df

Original DataFrame:
   transaction_id transaction_date  amount
0             101       2024-06-01     100
1             102       2024-06-15     200
2             103       2024-07-01     150
3             104       2024-07-15     300
4             105       2024-08-01     250


Unnamed: 0,transaction_id,transaction_date,amount,transaction_date_plus_1m,transaction_date_minus_15d
0,101,2024-06-01,100,2024-07-01,2024-05-17
1,102,2024-06-15,200,2024-07-15,2024-05-31
2,103,2024-07-01,150,2024-08-01,2024-06-16
3,104,2024-07-15,300,2024-08-15,2024-06-30
4,105,2024-08-01,250,2024-09-01,2024-07-17


## 12. Generate a range of dates and create a time series dataset.

In [68]:
import pandas as pd
import numpy as np
# Task: Generate a date range from '2024-06-01' to '2024-06-10' with a daily frequency
date_range = pd.date_range(start='2024-06-01', end='2024-06-10', freq='D')

# Display the generated date range
print("Generated Date Range:")
print(date_range)

# Task: Create a DataFrame using the generated date range and add a column with random data
#df['random_data'] = np.random.randint(10, 100, size=df.shape[0])
df = pd.DataFrame(date_range, columns = ["date"])
df['random_data'] = pd.Series([pd.Series(range(10, 100)).sample(1).values[0] for x in range(len(df))])
df

Generated Date Range:
DatetimeIndex(['2024-06-01', '2024-06-02', '2024-06-03', '2024-06-04',
               '2024-06-05', '2024-06-06', '2024-06-07', '2024-06-08',
               '2024-06-09', '2024-06-10'],
              dtype='datetime64[ns]', freq='D')


Unnamed: 0,date,random_data
0,2024-06-01,46
1,2024-06-02,46
2,2024-06-03,16
3,2024-06-04,13
4,2024-06-05,28
5,2024-06-06,25
6,2024-06-07,16
7,2024-06-08,68
8,2024-06-09,39
9,2024-06-10,23


# CHALLENGE: Analyzing Sales Data with Date and Time Functions

Analyze a sales dataset by applying multiple date and time functions, including converting date strings, calculating time differences, filtering by date range, resampling data, and generating date ranges for forecasting.

**Instructions:**

* Convert Date Strings to Datetime Objects: You have a dataset with sales transactions. First, convert the sale_date column from string format to datetime objects.
* Calculate Days Since First Sale: Calculate the number of days since the first sale for each transaction.
* Filter Sales Data: Filter the dataset to include only transactions from the last 30 days.
* Resample Data by Week: Aggregate the total sales amount by week.
* Generate Future Dates for Forecasting: Create a future date range for the next 4 weeks and simulate future sales data based on historical trends.

In [115]:
import pandas as pd

# Sample sales data
data = {
    'sale_id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'sale_date': ['2024-06-01', '2024-06-05', '2024-06-10', '2024-06-15', '2024-06-20',
                  '2024-06-25', '2024-07-01', '2024-07-05', '2024-07-10', '2024-07-15'],
    'amount': [100, 150, 200, 250, 300, 350, 400, 450, 500, 550]
}

# Creating DataFrame
df = pd.DataFrame(data)

# Display the DataFrame
print("Original DataFrame:")
print(df)

# Task 1: Convert 'sale_date' to datetime
# Hint: Use pd.to_datetime() to convert the 'sale_date' column to datetime objects
df['sale_date'] = pd.to_datetime(df['sale_date'])
# Task 2: Calculate days since first sale
# Hint: Subtract the minimum sale date from each sale date and use .dt.days to get the difference in days
df['days_since_first'] = (df['sale_date'] -  df['sale_date'].min()).dt.days
df
# Task 3: Filter sales data from the last 30 days
# Hint: Find the maximum date in the dataset and subtract 30 days to get the start date, then filter the DataFrame
mask = (df['sale_date'] >= (df['sale_date'].max() - pd.Timedelta(days=30)))
df2 = df.loc[mask]
df2
# Task 4: Resample data by week and aggregate total sales
# Hint: Use .resample() with 'W-Mon' frequency and aggregate the 'amount' column using .sum()
# Task: Convert 'transaction_date' to datetime, set it as index, and resample data to aggregate the total amount by month

grouper = pd.Grouper(key='sale_date', freq='W-Mon')
df2 = df2.groupby(grouper).agg({'amount':'sum'})
df2.reset_index(inplace=True)
df2
# Task 5: Generate future dates for the next 4 weeks and simulate future sales
# Hint: Use pd.date_range() to create future dates, then simulate future sales by averaging past weekly sales
future_four = pd.date_range(start= df2['sale_date'].max(), freq='W-Mon', periods=5)[1:]
avg = int(df2['amount'].mean())
data2 = {'sale_date': future_four, 'amount': [avg for i in range(len(future_four))]}
df3 = pd.DataFrame(data2)

final = pd.concat([df2, df3], axis=0)
final
#df.resample('ME').sum()
#df.resample('D').ffill()

Original DataFrame:
   sale_id   sale_date  amount
0        1  2024-06-01     100
1        2  2024-06-05     150
2        3  2024-06-10     200
3        4  2024-06-15     250
4        5  2024-06-20     300
5        6  2024-06-25     350
6        7  2024-07-01     400
7        8  2024-07-05     450
8        9  2024-07-10     500
9       10  2024-07-15     550


Unnamed: 0,sale_date,amount
0,2024-06-17,250
1,2024-06-24,300
2,2024-07-01,750
3,2024-07-08,450
4,2024-07-15,1050
0,2024-07-22,560
1,2024-07-29,560
2,2024-08-05,560
3,2024-08-12,560


# Challenge: Comprehensive Sales Data Analysis and Feature Engineering
Perform a comprehensive analysis on a sales dataset, combining multiple Pandas techniques learned throughout the course. This includes date and time manipulation, merging and concatenating DataFrames, filtering, grouping, handling missing data, and feature 

**Instructions:**

1. **Load Sales and Customer Data**: You have two datasets, one with sales transactions and another with customer information. Load both datasets into separate DataFrames.
2. **Merge DataFrames**: Merge the sales and customer DataFrames on a common key (e.g., `customer_id`).
3. **Convert Date Strings to Datetime Objects**: Convert the `sale_date` column from string format to datetime objects.
4. **Calculate Sales Metrics**: Calculate the total sales amount for each customer and each product.
5. **Handle Missing Data**: Identify missing values in the `amount` column and use interpolation to fill them.
6. **Feature Engineering**: Create new features such as the year, month, and quarter from the `sale_date` column. Also, split the `customer_name` column into `first_name` and `last_name`.
7. **Group and Filter Data**: Group the data by customer and filter out customers with total sales above a certain threshold.
8. **Resample and Concatenate Data**: Resample the data to get weekly sales totals and concatenate it with future sales projections.
9. **Final Output**: Provide a summary DataFrame with all the calculations and features.

In [36]:
import pandas as pd
import numpy as np

# Sample sales data
sales_data = {
    'sale_id': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'customer_id': [1, 2, 1, 3, 2, 1, 3, 2, 1, 3],
    'sale_date': ['2024-06-01', '2024-06-05', '2024-06-10', '2024-06-15', '2024-06-20',
                  '2024-06-25', '2024-07-01', '2024-07-05', '2024-07-10', '2024-07-15'],
    'product': ['A', 'B', 'A', 'B', 'A', 'B', 'A', 'B', 'A', 'B'],
    'amount': [100, np.nan, 200, 250, 300, np.nan, 400, 450, 500, 550]
}

# Sample customer data
customer_data = {
    'customer_id': [1, 2, 3],
    'customer_name': ['John Doe', 'Jane Smith', 'Bob Johnson']
}

# Creating DataFrames
sales_df = pd.DataFrame(sales_data)
customer_df = pd.DataFrame(customer_data)

# Display the DataFrames
print("Sales DataFrame:")
print(sales_df)
print("\nCustomer DataFrame:")
print(customer_df)

# Task 1: Merge sales_df and customer_df on 'customer_id'
# Hint: Use pd.merge()

# Task 2: Convert 'sale_date' to datetime
# Hint: Use pd.to_datetime()

# Task 3: Calculate total sales amount for each customer and each product
# Hint: Use groupby() and sum()

# Task 4: Handle missing data in 'amount' using interpolation
# Hint: Use df['amount'].interpolate()

# Task 5: Create new features like year, month, and quarter from 'sale_date'
# Hint: Use dt.year, dt.month, dt.quarter

# Task 6: Split 'customer_name' into 'first_name' and 'last_name'
# Hint: Use str.split()

# Task 7: Group data by customer and filter customers with total sales above a threshold
# Hint: Use groupby() and filter()

# Task 8: Resample data to get weekly sales totals and concatenate with future sales projections
# Hint: Use resample() and pd.concat()

# Task 9: Provide a summary DataFrame with all calculations and features


Summary DataFrame:

Filtered Customers (Total Sales > 500):
  customer_name  amount
0   Bob Johnson  1200.0
1    Jane Smith   900.0
2      John Doe  1150.0

Combined Sales Data (Historical + Future):


Unnamed: 0,sale_date,amount
0,2024-06-03,100.0
1,2024-06-10,350.0
2,2024-06-17,250.0
3,2024-06-24,300.0
4,2024-07-01,750.0
5,2024-07-08,450.0
6,2024-07-15,1050.0
7,2024-07-22,464.285714
8,2024-07-29,464.285714
9,2024-08-05,464.285714
