In [148]:
# Extend a list by unpacking it inside a new list literal; old_lst is left untouched.
old_lst = list(range(1, 8))
new_lst = [*old_lst, *(10, 15, 20)]
new_lst

[1, 2, 3, 4, 5, 6, 7, 10, 15, 20]

In [149]:
# Same result via concatenation: `+` builds a brand-new list from both operands.
old_lst = list(range(1, 8))
new_lst = old_lst + [10, 15, 20]
new_lst

[1, 2, 3, 4, 5, 6, 7, 10, 15, 20]

----
#### MySQL: "Sales" table
    Tidy Data practice

In [150]:
import pandas as pd
from env import user, host, password, get_connection


In [151]:
# SQL text: select every column of every row in the `sales` table.
query = ''' 
SELECT * 
FROM sales
'''
# Connection target for the "tidy_data" database, built by the project-local
# `env.get_connection` helper (presumably a connection URL string -- confirm in env.py).
url = get_connection(user, password, host, "tidy_data")


In [152]:
# Run the query against the database connection and load the result into a DataFrame.
sales_df = pd.read_sql(sql=query, con=url)


In [153]:
# Preview the first rows to confirm the table was imported correctly.
sales_df.head()

Unnamed: 0,Product,2016 Sales,2016 PPU,2017 Sales,2017 PPU,2018 Sales,2018 PPU
0,A,673,5,231,7,173,9
1,B,259,3,748,5,186,8
2,C,644,3,863,5,632,5
3,D,508,9,356,11,347,14


In [154]:
# Initial observations from the "sales" dataset:
# - Each measurement column name packs two variables together
#   (e.g., "2016 Sales" could be split into the year "2016" and the measure "Sales").
# - Year, Sales, and PPU can each be made into its own column (three distinct columns).

In [155]:
# Check the shape of the dataset here and, subsequently, its .info() summary.

sales_df.shape 
# 4 rows by
# 7 columns

(4, 7)

In [156]:
# Summarize each column's dtype and non-null count.
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Product     4 non-null      object
 1   2016 Sales  4 non-null      int64 
 2   2016 PPU    4 non-null      int64 
 3   2017 Sales  4 non-null      int64 
 4   2017 PPU    4 non-null      int64 
 5   2018 Sales  4 non-null      int64 
 6   2018 PPU    4 non-null      int64 
dtypes: int64(6), object(1)
memory usage: 352.0+ bytes


In [157]:
# Re-display the wide table for reference before it is reshaped.
sales_df.head()

Unnamed: 0,Product,2016 Sales,2016 PPU,2017 Sales,2017 PPU,2018 Sales,2018 PPU
0,A,673,5,231,7,173,9
1,B,259,3,748,5,186,8
2,C,644,3,863,5,632,5
3,D,508,9,356,11,347,14


-----

In [158]:
# Reshape from wide to long. "Product" is kept as the identifier column; every
# other column header is moved into "year_and_feature" and its cell value into "count".
sales_long = pd.melt(
    sales_df,
    id_vars="Product",
    var_name="year_and_feature",
    value_name="count",
)

In [159]:
# Inspect the first 15 rows of the new long table.

sales_long.head(15) # checks out!

Unnamed: 0,Product,year_and_feature,count
0,A,2016 Sales,673
1,B,2016 Sales,259
2,C,2016 Sales,644
3,D,2016 Sales,508
4,A,2016 PPU,5
5,B,2016 PPU,3
6,C,2016 PPU,3
7,D,2016 PPU,9
8,A,2017 Sales,231
9,B,2017 Sales,748


In [160]:
# Split "year_and_feature" into two distinct columns, "year" and "feature".
# Splitting on the first space only (n=1) guarantees exactly two resulting columns,
# so a multi-word feature name (e.g. "2016 Unit Price") cannot break the assignment.
# Note: "year" is still a string after the split.

sales_long[["year", "feature"]] = sales_long["year_and_feature"].str.split(" ", n=1, expand=True)

In [161]:
# Confirm the new "year" and "feature" columns are present.
sales_long.head() # checks out

Unnamed: 0,Product,year_and_feature,count,year,feature
0,A,2016 Sales,673,2016,Sales
1,B,2016 Sales,259,2016,Sales
2,C,2016 Sales,644,2016,Sales
3,D,2016 Sales,508,2016,Sales
4,A,2016 PPU,5,2016,PPU


In [162]:
# "year" and "feature" now carry this information separately, so the combined
# column is redundant and can be removed.
sales_long = sales_long.drop("year_and_feature", axis="columns")

In [163]:
# Confirm "year_and_feature" is no longer in the table.
sales_long.head()

Unnamed: 0,Product,count,year,feature
0,A,673,2016,Sales
1,B,259,2016,Sales
2,C,644,2016,Sales
3,D,508,2016,Sales
4,A,5,2016,PPU


In [164]:
# List the current column order before rearranging it.
sales_long.columns

Index(['Product', 'count', 'year', 'feature'], dtype='object')

In [165]:
# Reorder so the descriptive columns come first and the measured value comes last.
sales_long = sales_long.loc[:, ["Product", "year", "feature", "count"]]

In [166]:
# Confirm the new column order.
sales_long.head()

Unnamed: 0,Product,year,feature,count
0,A,2016,Sales,673
1,B,2016,Sales,259
2,C,2016,Sales,644
3,D,2016,Sales,508
4,A,2016,PPU,5


In [167]:
# Shape of the long table as (rows, columns).
sales_long.shape

(24, 4)

In [168]:
# Rename the "feature" column to "measurement" (more descriptive of what it represents).
# The result is reassigned rather than using inplace=True: this keeps the step
# chainable and avoids mutating a frame that earlier cells already displayed.

sales_long = sales_long.rename(columns={"feature": "measurement"})

In [169]:
# Confirm "feature" now appears as "measurement".
sales_long.head() # checks out!

Unnamed: 0,Product,year,measurement,count
0,A,2016,Sales,673
1,B,2016,Sales,259
2,C,2016,Sales,644
3,D,2016,Sales,508
4,A,2016,PPU,5


In [170]:
# Capitalize the first letter of every column header (the rest of each name is lower-cased).
sales_long.columns = [header.capitalize() for header in sales_long.columns]

In [171]:
# View each product by year, with one column per measurement ("PPU" and "Sales").
# NOTE: pivot_table aggregates with its default aggfunc (mean). Each
# (Product, Year, Measurement) combination occurs once in this data, so the
# values pass through unchanged.

sales_pivot = sales_long.pivot_table(
    values="Count",
    index=["Product", "Year"],
    columns="Measurement",
)

In [172]:
# Preview the pivoted table.
sales_pivot.head() # checks out! 

# The table is now indexed by product and year, with the PPU and Sales values for that year in separate columns.

Unnamed: 0_level_0,Measurement,PPU,Sales
Product,Year,Unnamed: 2_level_1,Unnamed: 3_level_1
A,2016,5,673
A,2017,7,231
A,2018,9,173
B,2016,3,259
B,2017,5,748
