# Using Pandas to work with and manipulate data


In [None]:
import pandas as pd

In [None]:
# Load the customer table; the first CSV column is used as the row index.
df_customers = pd.read_csv(
    'files/customers.csv',
    index_col=0,
)

In [None]:
# Display the customers DataFrame loaded above.
df_customers

## Can we create a mask to search for customers living in Virginia?

Hint: we can use the fancy mask (boolean indexing) feature discussed in lecture...

In [None]:
# Boolean mask: True for each row whose 'state' value equals "VA".
query_mask = df_customers['state'].eq("VA")

In [None]:
# Keep only the rows where the state mask is True.
df_customers.loc[query_mask]

## Can we search for customers that live on a "parkway"?

In [None]:
# One mask per spelling: the abbreviated "Pkwy" and the full word "Parkway".
query_mask_pkwy, query_mask_parkway = (
    df_customers['address'].str.contains(spelling)
    for spelling in ("Pkwy", "Parkway")
)

In [None]:
# A row qualifies if either spelling matched (element-wise OR of the masks).
df_customers.loc[query_mask_pkwy | query_mask_parkway]

## Can we search for a customer that lives on a Parkway and has the first name Thomas?

In [None]:
# Address masks for both spellings of "parkway".
query_mask_pkwy, query_mask_parkway = (
    df_customers['address'].str.contains(spelling)
    for spelling in ("Pkwy", "Parkway")
)
# Exact-match mask on the first name.
query_mask_thomas = df_customers['first_name'].eq("Thomas")

In [None]:
# (either parkway spelling) AND (first name is Thomas); the parentheses are
# required because & binds tighter than |.
df_customers.loc[(query_mask_pkwy | query_mask_parkway) & query_mask_thomas]

## Let's bring in another data source to work with....

In [None]:
# Load the orders table; the first CSV column is used as the row index.
df_orders = pd.read_csv(
    'files/orders.csv',
    index_col=0,
)

In [None]:
# Display the orders DataFrame.
df_orders

In [None]:
# Display the customers DataFrame again, directly after the orders table.
df_customers

How do we combine these together to get a table of orders with the customer information?

In [None]:

# Join customers to orders on the shared 'customer_id' key
# (merge defaults to an inner join).
df_customers.merge(df_orders, on='customer_id')

In [None]:
# Same join, but naming the key separately for each side; this form is
# needed when the key is called something different in the two frames.
df_customers.merge(
    df_orders,
    left_on='customer_id',
    right_on='customer_id',
)

## Let's bring in a third dataframe for some fun...

In [None]:
# Load the returns table (first CSV column as the row index) and display it.
df_returns = pd.read_csv(
    'files/returns.csv',
    index_col=0,
)
df_returns

## Let's start simple... how do we get a list of returns with customer information?

In [None]:

# Match the customers' row index against the 'return_customer_id' column of
# the returns table (inner join by default).
df_customers.merge(
    df_returns,
    left_index=True,
    right_on='return_customer_id',
)

## Now what if we want to see a "full merge" where we get all the orders and returns even when there is no overlap?

In [None]:
# Outer join: keep every order and every return, matched on 'order_id'
# where possible; unmatched rows get missing values on the other side.
df_orders.merge(df_returns, how='outer', on='order_id')

In [None]:
# Same outer join, stored for later cleaning. Column names that exist in both
# frames are disambiguated with '_orders' / '_returns' suffixes.
df_combined = df_orders.merge(
    df_returns,
    how='outer',
    on='order_id',
    suffixes=('_orders', '_returns'),
)
df_combined

## But wait a minute! What happened to the values? We have a lot of garbage and weird data types now!

### How can we change back the customer ID to an integer?

In [None]:
# Convert customer_id back to whole numbers.
# After the outer merge the column can contain missing values (rows with no
# matching order), which a plain NumPy integer dtype cannot represent.
# pandas' nullable 'Int64' dtype stores the IDs as integers and keeps the
# gaps as <NA>.
# NOTE: the earlier `(customer_id > 0).astype('int32')` cast the *comparison
# result*, yielding a 0/1 flag instead of the actual customer ID.
df_combined['customer_id_clean'] = df_combined['customer_id'].astype('Int64')
df_combined

### We can also use regular expressions to clean up dataframes! Here we are going to change the string values for the dollar amounts to floating-point numbers!

In [None]:
# Strip the dollar sign and thousands separators from both amount columns,
# then parse the remaining text as float.
# The patterns are raw strings so that the regex escape `\$` (a literal dollar
# sign) is not interpreted as a Python string escape; the non-raw form '\$' is
# an invalid escape sequence that newer Python versions warn about.
df_combined['amount_returns'] = df_combined['amount_returns'].replace({r'\$': '', r',': ''}, regex=True).astype(float)
df_combined['amount_orders'] = df_combined['amount_orders'].replace({r'\$': '', r',': ''}, regex=True).astype(float)
df_combined

### We can also change the string version of the date time to actual datetime values!

In [None]:
# Cast both date columns (order side first, then return side) to
# nanosecond-precision datetimes.
for date_col in ('order_date_orders', 'order_date_returns'):
    df_combined[date_col] = df_combined[date_col].astype('datetime64[ns]')
df_combined

In [None]:
# Display the combined frame ordered by its index. The sorted copy is only
# shown, not assigned, so df_combined itself keeps its current row order.
df_combined.sort_index()

## Now that we have our column types cleaned up, can we create a new column that gives us the days between the order date and return date?

In [None]:

# Elapsed time from the order-side date to the return-side date, stored as a
# timedelta column.
df_combined['days_between'] = df_combined['order_date_returns'].sub(
    df_combined['order_date_orders']
)
df_combined

### Let's say that, for the days between order and return, the store is earning interest on the money from the sale (before it is returned). Let's say the store earns a daily periodic rate of 0.0043%. How much money would they earn holding on to customers' money?

Hint: just like with the `.str` string methods, we need to use the [datetime (`.dt`) methods](https://pandas.pydata.org/pandas-docs/version/1.0.1/reference/api/pandas.Series.dt.html?highlight=dt#pandas.Series.dt).

In [None]:
# Daily periodic rate of 0.0043%, written as a fraction.
periodic_rate_daily = 0.000043

# Per-row interest: order amount x whole days held x daily rate.
# `.dt.days` extracts the day count from the timedelta column.
s_interest = (
    df_combined['amount_orders']
    .mul(df_combined['days_between'].dt.days)
    .mul(periodic_rate_daily)
)
s_interest

In [None]:
# Total interest across all rows (Series.sum skips missing values by default).
s_interest.sum()