In [16]:
import pandas as pd
# Widen the console display so wide frames print on one line instead of wrapping.
pd.set_option("display.width", 1000)

Data analysis is the process of extracting insights, discovering useful information, and drawing conclusions from the data at hand. It’s mainly done to support decision-making or to explore the data before building a machine learning model.

**GroupBy**

In [17]:
# Load the sales data and take a first look at its columns and leading rows.
df = pd.read_csv('sales.csv')
print(df.columns)
print(df.head(5))

# Group the rows by product group; the aggregation below is computed per group.
gr = df.groupby("product_group")
print("\n****************")
# numeric_only=True states explicitly that only numeric columns are summed, so
# the result does not depend on the pandas default or on text columns appearing
# in the CSV later.
# NOTE(review): product_code is numeric and is summed too; it looks like an
# identifier, so its total is presumably not meaningful - confirm before use.
result = gr.sum(numeric_only=True)
print(type(result))
print(result)
# Access one cell by label: result.loc[<product_group>, <column_name>]
print(result.loc["PG1", "cost"])
# Access the same cell by position. The positions are looked up from the labels
# rather than hard-coded, so this keeps working if the column order changes.
row_pos = result.index.get_loc("PG1")
col_pos = result.columns.get_loc("cost")
print(result.iloc[row_pos, col_pos])

Index(['product_code', 'product_group', 'stock_qty', 'cost', 'price', 'last_week_sales', 'last_month_sales'], dtype='object')
   product_code product_group  stock_qty    cost    price  last_week_sales  last_month_sales
0          4187           PG2        498  420.76   569.91               13                58
1          4195           PG2        473  545.64   712.41               16                58
2          4204           PG2        968  640.42   854.91               22                88
3          4219           PG2        241  869.69  1034.55               14                45
4          4718           PG2       1401   12.54    26.59               50               285

****************
<class 'pandas.core.frame.DataFrame'>
               product_code  stock_qty      cost     price  last_week_sales  last_month_sales
product_group                                                                                
PG1                  166563      25766   8016.10  11840.90             1

Named aggregation

In [27]:
# Dict form: maps each source column to a function; the output columns keep
# the source column names.
by_group = df.groupby("product_group")
print(by_group.agg({"cost": "sum", "price": "sum"}))

# Named aggregation: every keyword becomes an output column, defined by a
# (source column, function) pair. The totals are then ordered by cost, largest first.
totals = by_group.agg(
    cost_sum=("cost", "sum"),
    price_sum=("price", "sum"),
)
result = totals.sort_values("cost_sum", ascending=False)

print("\n****************")
print(type(result))
print(result.head(5))
# Plain Python list of the summed costs, in the sorted order.
print(list(result["cost_sum"]))


                   cost     price
product_group                    
PG1             8016.10  11840.90
PG2             6150.50   9373.51
PG3             2233.62   3259.29
PG4             6244.23  10830.65
PG5            17451.49  22209.22
PG6             5856.13   9549.94

****************
<class 'pandas.core.frame.DataFrame'>
               cost_sum  price_sum
product_group                     
PG5            17451.49   22209.22
PG1             8016.10   11840.90
PG4             6244.23   10830.65
PG2             6150.50    9373.51
PG6             5856.13    9549.94
[17451.49000000001, 8016.099999999999, 6244.23, 6150.5, 5856.13, 2233.6199999999994]


Pivot Table

In [49]:
# Add a constant `week` column for the pivot to spread over. assign() returns a
# new frame, so the shared `df` is left untouched and re-running this cell
# always prints the same thing.
sales_weekly = df.assign(week=1)
print(sales_weekly.head(3))

print("\n****************")
# One row per product group, one column per distinct `week` value, each cell
# holding the summed cost.
pivot = pd.pivot_table(
    data=sales_weekly,
    index="product_group",
    values="cost",
    aggfunc="sum",
    columns="week",
)
print(type(pivot))
print(pivot.head(5))
# The pivot's columns are the `week` values, so a cell is addressed by week,
# e.g. pivot.loc["PG1", 1], not by the name of the aggregated column.

   product_code product_group  stock_qty    cost   price  last_week_sales  last_month_sales  week
0          4187           PG2        498  420.76  569.91               13                58     1
1          4195           PG2        473  545.64  712.41               16                58     1
2          4204           PG2        968  640.42  854.91               22                88     1

****************

****************
<class 'pandas.core.frame.DataFrame'>
week                  1
product_group          
PG1             8016.10
PG2             6150.50
PG3             2233.62
PG4             6244.23
PG5            17451.49


In [71]:
# A larger real-world alternative (not loaded here): the 2020 JetBrains Python
# survey CSV from the mattharrison/datasets repository on GitHub.

# Small hand-made frame with columns A, B, C and year, built row by row.
rows = [
    (1, 5, 9, 2018),
    (2, 6, 10, 2018),
    (3, 7, 11, 2019),
    (4, 8, 12, 2019),
    (4, 9, 13, 2018),
    (4, 10, 14, 2019),
]
df = pd.DataFrame(rows, columns=['A', 'B', 'C', 'year'])

print(df.head(10))

# Several aggregations over several value columns produce three column levels:
# (aggregation, value column, year).
pivot = pd.pivot_table(
    df,
    index=['A'],
    columns=['year'],
    values=['B', 'C'],
    aggfunc=["sum", "mean"],
)
print("\n\n****************")
print(pivot.head(5))
print(type(pivot))

# A single cell is addressed by row label plus the full column tuple.
cell_key = ('mean', 'B', 2018)
print(pivot.loc[1, cell_key])

   A   B   C  year
0  1   5   9  2018
1  2   6  10  2018
2  3   7  11  2019
3  4   8  12  2019
4  4   9  13  2018
5  4  10  14  2019


****************
      sum                   mean                 
        B           C          B          C      
year 2018  2019  2018  2019 2018 2019  2018  2019
A                                                
1     5.0   NaN   9.0   NaN  5.0  NaN   9.0   NaN
2     6.0   NaN  10.0   NaN  6.0  NaN  10.0   NaN
3     NaN   7.0   NaN  11.0  NaN  7.0   NaN  11.0
4     9.0  18.0  13.0  26.0  9.0  9.0  13.0  13.0
<class 'pandas.core.frame.DataFrame'>
5.0


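Both the cut and qcut functions convert columns with continuous values to categorical columns, but they apply different techniques: cut splits the value range into bins defined by their edges (equal-width bins when only a bin count is given), whereas qcut picks the edges from quantiles so that each bin holds roughly the same number of values.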

In [72]:
A = pd.Series([5, 0, 2, 8, 4, 10, 7])

# cut with an integer `bins` splits the range of A (0 to 10) into 4 equal-width
# bins with edges at 0, 2.5, 5.0, 7.5 and 10, then maps each value to its bin.
# With right=True (the default, spelled out here) every bin is open on the left
# and closed on the right, i.e. (lower, upper]: a value equal to an edge, such
# as 5, lands in the bin that ends there, (2.5, 5.0].
# The lowest edge is moved slightly below the minimum (shown as -0.01) so that
# the minimum value 0 still falls inside the first bin.
A_binned = pd.cut(A, bins=4, right=True)

print(A_binned)


0      (2.5, 5.0]
1    (-0.01, 2.5]
2    (-0.01, 2.5]
3     (7.5, 10.0]
4      (2.5, 5.0]
5     (7.5, 10.0]
6      (5.0, 7.5]
dtype: category
Categories (4, interval[float64]): [(-0.01, 2.5] < (2.5, 5.0] < (5.0, 7.5] < (7.5, 10.0]]
