# Data Wrangling or Data Munging

Here we will concentrate on the following sub-sections of this methodology:

1> Data collection: To understand different data retrieval mechanisms for 
                    different data types. -> in very brief

2> Data description: To understand various attributes and properties of the
                     data collected. -> in very brief

3> Data wrangling: To prepare data for consumption in the modeling steps.

4> Data visualization: To visualize different attributes for sharing results, better understanding, and so on.  -> "Covered through matplotlib"

In [1]:
# import required libraries
import random
import datetime 
import numpy as np
import pandas as pd
from random import randrange
from sklearn import preprocessing

from IPython.display import display

pd.options.mode.chained_assignment = None
import warnings; warnings.simplefilter('ignore')  # to suppress warnings

## Utilities

In [2]:
def _random_date(start,date_count):
    """This function generates a random date based on params
    Args:
        start (date object): the base date
        date_count (int): number of dates to be generated
    Returns:
        list of random dates

    """
    current = start
    while date_count > 0:
        curr = current + datetime.timedelta(days=randrange(42))
        yield curr
        date_count-=1


def generate_sample_data(row_count=100):
    """This function generates a random transaction dataset
    Args:
        row_count (int): number of rows for the dataframe
    Returns:
        a pandas dataframe

    """
    
    # sentinels
    startDate = datetime.datetime(2016, 1, 1,13)
    serial_number_sentinel = 1000
    user_id_sentinel = 5001
    product_id_sentinel = 101
    price_sentinel = 2000
    
    
    # base list of attributes
    data_dict = {
    'Serial No': np.arange(row_count)+serial_number_sentinel,
    'Date': np.random.permutation(pd.to_datetime([x.strftime("%d-%m-%Y") 
                                                    for x in _random_date(startDate,
                                                                          row_count)]).date
                                  ),
    'User ID': np.random.permutation(np.random.randint(0,
                                                       row_count,
                                                       size=int(row_count/10)) + user_id_sentinel).tolist()*10,
    'Product ID': np.random.permutation(np.random.randint(0,
                                                          row_count,
                                                          size=int(row_count/10))+ product_id_sentinel).tolist()*10 ,
    'Quantity Purchased': np.random.permutation(np.random.randint(1,
                                                                  42,
                                                                  size=row_count)),
    'Price': np.round(np.abs(np.random.randn(row_count)+1)*price_sentinel,
                      decimals=2),
    'User Type':np.random.permutation([chr(random.randrange(97, 97 + 3 + 1)) 
                                            for i in range(row_count)])
    }
    
    # introduce missing values
    for index in range(int(np.sqrt(row_count))): 
        data_dict['Price'][np.argmax(data_dict['Price'] == random.choice(data_dict['Price']))] = np.nan
        data_dict['User Type'][np.argmax(data_dict['User Type'] == random.choice(data_dict['User Type']))] = np.nan
        data_dict['Date'][np.argmax(data_dict['Date'] == random.choice(data_dict['Date']))] = np.nan
        data_dict['Product ID'][np.argmax(data_dict['Product ID'] == random.choice(data_dict['Product ID']))] = 0
        data_dict['Serial No'][np.argmax(data_dict['Serial No'] == random.choice(data_dict['Serial No']))] = -1
        data_dict['User ID'][np.argmax(data_dict['User ID'] == random.choice(data_dict['User ID']))] = -101
        
    
    # create data frame
    df = pd.DataFrame(data_dict)
    
    return df
    

def describe_dataframe(df=pd.DataFrame()):
    """This function generates descriptive stats of a dataframe
    Args:
        df (dataframe): the dataframe to be analyzed
    Returns:
        None

    """
    print("\n\n")
    print("*"*30)
    print("About the Data")
    print("*"*30)
    
    print("Number of rows::",df.shape[0])
    print("Number of columns::",df.shape[1])
    print("\n")
    
    print("Column Names::",df.columns.values.tolist())
    print("\n")
    
    print("Column Data Types::\n",df.dtypes)
    print("\n")
    
    print("Columns with Missing Values::",df.columns[df.isnull().any()].tolist())
    print("\n")
    
    print("Number of rows with Missing Values::",len(pd.isnull(df).any(1).nonzero()[0].tolist()))
    print("\n")
    
    print("Sample Indices with missing data::",pd.isnull(df).any(1).nonzero()[0].tolist()[0:5])
    print("\n")
    
    print("General Stats::")
    print(df.info())
    print("\n")
    
    print("Summary Stats::")
    print(df.describe())
    print("\n")
    
    print("Dataframe Sample Rows::")
    display(df.head(5))
    
def cleanup_column_names(df,rename_dict={},do_inplace=True):
    """This function renames columns of a pandas dataframe
       It converts column names to snake case if rename_dict is not passed. 
    Args:
        rename_dict (dict): keys represent old column names and values point to 
                            newer ones
        do_inplace (bool): flag to update existing dataframe or return a new one
    Returns:
        pandas dataframe if do_inplace is set to False, None otherwise

    """
    if not rename_dict:
        return df.rename(columns={col: col.lower().replace(' ','_') 
                    for col in df.columns.values.tolist()}, 
                  inplace=do_inplace)
    else:
        return df.rename(columns=rename_dict,inplace=do_inplace)

def expand_user_type(u_type):
    """This function maps user types to user classes
    Args:
        u_type (str): user type value
    Returns:
        (str) user_class value

    """
    if u_type in ['a','b']:
        return 'new'
    elif u_type == 'c':
        return 'existing'
    elif u_type == 'd':
        return 'loyal_existing'
    else:
        return 'error'

## Generate a Sample Dataset

In [3]:
df = generate_sample_data(row_count=1000)
# type your code here

Unnamed: 0,Serial No,Date,User ID,Product ID,Quantity Purchased,Price,User Type
0,-1,,-101,0,30,,n
1,1001,,5801,650,32,218.24,n
2,1002,2016-07-02,5220,318,17,135.59,n
3,-1,2016-01-02,5609,658,27,,n
4,1004,,5030,705,9,2545.91,n
5,1005,2016-02-02,5283,1003,3,1289.71,n
6,1006,,5173,440,8,36.37,n
7,1007,,5077,395,2,125.17,n
8,1008,2016-01-31,5533,968,36,2768.64,n
9,1009,,5279,1081,25,2372.78,n


In [76]:
# type your code here

(1000, 7)

### Describe the Dataset

In [4]:
describe_dataframe(df)




******************************
About the Data
******************************
Number of rows:: 1000
Number of columns:: 7


Column Names:: ['Serial No', 'Date', 'User ID', 'Product ID', 'Quantity Purchased', 'Price', 'User Type']


Column Data Types::
 Serial No               int32
Date                   object
User ID                 int64
Product ID              int64
Quantity Purchased      int32
Price                 float64
User Type              object
dtype: object


Columns with Missing Values:: ['Date', 'Price']


Number of rows with Missing Values:: 60


Sample Indices with missing data:: [0, 1, 3, 4, 6]


General Stats::
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
Serial No             1000 non-null int32
Date                  969 non-null object
User ID               1000 non-null int64
Product ID            1000 non-null int64
Quantity Purchased    1000 non-null int32
Price                 969 non-null float64


Unnamed: 0,Serial No,Date,User ID,Product ID,Quantity Purchased,Price,User Type
0,-1,,-101,0,30,,n
1,1001,,5801,650,32,218.24,n
2,1002,2016-07-02,5220,318,17,135.59,n
3,-1,2016-01-02,5609,658,27,,n
4,1004,,5030,705,9,2545.91,n


### Rename Columns

In [5]:
print("Dataframe columns:\n{}".format(df.columns.tolist()))

Dataframe columns:
['Serial No', 'Date', 'User ID', 'Product ID', 'Quantity Purchased', 'Price', 'User Type']


In [6]:
# type your code here

In [7]:
print("Dataframe columns:\n{}".format(df.columns.tolist()))

Dataframe columns:
['serial_no', 'date', 'user_id', 'product_id', 'quantity_purchased', 'price', 'user_type']


In [8]:
df.head()

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type
0,-1,,-101,0,30,,n
1,1001,,5801,650,32,218.24,n
2,1002,2016-07-02,5220,318,17,135.59,n
3,-1,2016-01-02,5609,658,27,,n
4,1004,,5030,705,9,2545.91,n


### Sort Rows on defined attributes

In [9]:
display(df.sort_values(['serial_no','price'],ascending=[True,False]).head())
#display(df.sort_values(['serial_no','price'],ascending=[True,False]).tail())

# first sorted on serial_no, all products having same serial_no sorted on price

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type
309,-1,2016-01-25,5279,1081,17,7420.11,c
34,-1,2016-01-24,5307,1063,32,5750.89,b
788,-1,2016-01-19,5831,326,30,3946.99,c
708,-1,2016-07-01,5533,968,28,3775.75,d
700,-1,2016-01-25,5584,405,26,3259.58,d


### Rearrange Columns in a Dataframe

In [10]:
display(df[['serial_no','date','user_id','user_type',
           'product_id','quantity_purchased','price']].head())

Unnamed: 0,serial_no,date,user_id,user_type,product_id,quantity_purchased,price
0,-1,,-101,n,0,30,
1,1001,,5801,n,650,32,218.24
2,1002,2016-07-02,5220,n,318,17,135.59
3,-1,2016-01-02,5609,n,658,27,
4,1004,,5030,n,705,9,2545.91


### Filtering Columns

Using Column Index

In [11]:
df.head()

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type
0,-1,,-101,0,30,,n
1,1001,,5801,650,32,218.24,n
2,1002,2016-07-02,5220,318,17,135.59,n
3,-1,2016-01-02,5609,658,27,,n
4,1004,,5030,705,9,2545.91,n


In [12]:
# Exercise Cell, Not a part of NB
# importing pandas as pd 
import pandas as pd 
  
# Creating the DataFrame 
temp_df = pd.DataFrame({'Weight':[45, 88, 56, 15, 71], 
                   'Name':['Sam', 'Andrea', 'Alex', 'Robin', 'Kia'], 
                   'Age':[14, 25, 55, 8, 21]}) 
  
# Print the DataFrame 
# type your code here


# return the numpy representation of  
# this dataframe 
print("-------------------------")
# Print the result 
# type your code here


print("-------------------------")
# Print the result 
print(temp_df['Weight'].values)

print("-------------------------")
# Print the result 
# type your code here


print("-------------------------")
# Print the result 
print(temp_df['Age'].values)

print("-------------------------")
# Print the result 
print(temp_df['Name'].values[0:3])

print("-------------------------")
# Print the result 
# type your code here



   Weight    Name  Age
0      45     Sam   14
1      88  Andrea   25
2      56    Alex   55
3      15   Robin    8
4      71     Kia   21
-------------------------
[[45 'Sam' 14]
 [88 'Andrea' 25]
 [56 'Alex' 55]
 [15 'Robin' 8]
 [71 'Kia' 21]]
-------------------------
[45 88 56 15 71]
-------------------------
['Sam' 'Andrea' 'Alex' 'Robin' 'Kia']
-------------------------
[14 25 55  8 21]
-------------------------
['Sam' 'Andrea' 'Alex']
-------------------------
['Sam' 'Andrea' 'Alex']


In [13]:
# type your code here


Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type
0,-1,,-101,0,30,,n
1,1001,,5801,650,32,218.24,n
2,1002,2016-07-02,5220,318,17,135.59,n
3,-1,2016-01-02,5609,658,27,,n
4,1004,,5030,705,9,2545.91,n
5,1005,2016-02-02,5283,1003,3,1289.71,n
6,1006,,5173,440,8,36.37,n
7,1007,,5077,395,2,125.17,n
8,1008,2016-01-31,5533,968,36,2768.64,n
9,1009,,5279,1081,25,2372.78,n


In [14]:
# print 10 values from column at index 3
print(df.iloc[:,3].values[0:10])
# type your comment here



[   0  650  318  658  705 1003  440  395  968 1081]


In [15]:
# print 5 values from column at index 3
print(df.iloc[:,3].values[0:5])

[  0 650 318 658 705]


Using Column Name

In [16]:
# print 10 values of quantity purchased
# type your code here


[30 32 17 27  9  3  8  2 36 25]


In [17]:
df.head()

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type
0,-1,,-101,0,30,,n
1,1001,,5801,650,32,218.24,n
2,1002,2016-07-02,5220,318,17,135.59,n
3,-1,2016-01-02,5609,658,27,,n
4,1004,,5030,705,9,2545.91,n


Using Column Datatype

In [18]:
# print 10 values of columns with data type float
print(df.select_dtypes(include=['float64']).values[:10,0])
# we are printing only 0th column, i.e price

[    nan  218.24  135.59     nan 2545.91 1289.71   36.37  125.17 2768.64
 2372.78]


In [19]:
# type your code here



[-101 5801 5220 5609 5030 5283 5173 5077 5533 5279]


In [20]:
print(df.select_dtypes(include=['int64']).values[:10,1]) #product_id

[   0  650  318  658  705 1003  440  395  968 1081]


### Filtering Rows

Select specific rows

In [21]:
display(df.iloc[[10,501,20]])

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type
10,1010,2016-01-29,5693,665,6,1061.08,n
501,1501,2016-01-31,5801,650,39,1127.37,a
20,1020,2016-01-27,5577,419,30,1631.13,n


Exclude Specific Row indices

In [22]:
display(df.drop([0,2,5], axis=0).head()) 
# type your comments here





Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type
1,1001,,5801,650,32,218.24,n
3,-1,2016-01-02,5609,658,27,,n
4,1004,,5030,705,9,2545.91,n
6,1006,,5173,440,8,36.37,n
7,1007,,5077,395,2,125.17,n


Conditional Filtering

In [23]:
display(df[df.quantity_purchased>25].head())

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type
0,-1,,-101,0,30,,n
1,1001,,5801,650,32,218.24,n
3,-1,2016-01-02,5609,658,27,,n
8,1008,2016-01-31,5533,968,36,2768.64,n
13,1013,2016-01-29,5184,583,27,3343.74,n


In [24]:
# type your code here



Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type
1,1001,,5801,650,32,218.24,n
2,1002,2016-07-02,5220,318,17,135.59,n
4,1004,,5030,705,9,2545.91,n
5,1005,2016-02-02,5283,1003,3,1289.71,n
6,1006,,5173,440,8,36.37,n


Offset from top of the dataframe

In [25]:
display(df[100:].head())

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type
100,1100,2016-01-28,5584,405,20,2570.49,c
101,1101,2016-01-13,5801,650,7,5606.11,c
102,1102,2016-01-27,5220,318,15,3935.87,c
103,1103,2016-01-28,5609,658,31,3909.88,b
104,1104,2016-07-01,5030,705,12,2360.86,d


Offset from bottom of the dataframe

In [26]:
display(df[-10:].head()) # type your comment here


Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type
990,1990,2016-01-26,5644,798,27,876.57,c
991,1991,2016-07-02,5800,962,37,3017.64,d
992,1992,2016-01-24,5653,218,36,3829.08,b
993,1993,2016-06-02,5905,1083,41,2187.42,d
994,1994,2016-01-23,5478,116,11,2222.88,d


### TypeCasting/Data Type Conversion

In [27]:
# Exercise Cell, not a part of NB
# importing pandas as pd 
import pandas as pd 

# Creating the dataframe 
d_df = pd.DataFrame({'Date':['11/8/2011', '04/23/2008', '10/2/2019'], 
                    'Event':['Music', 'Poetry', 'Theatre'], 
                    'Cost':[10000, 5000, 15000]}) 

# Print the dataframe 
print(d_df) 

print("--------------------")
# Now we will check the data type 
# of the 'Date' column 
# type your code here


         Date    Event   Cost
0   11/8/2011    Music  10000
1  04/23/2008   Poetry   5000
2   10/2/2019  Theatre  15000
--------------------
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
Date     3 non-null object
Event    3 non-null object
Cost     3 non-null int64
dtypes: int64(1), object(2)
memory usage: 112.0+ bytes


In [28]:
# Exercise Cell, not a part of NB
# convert the 'Date' column to datetime format 
d_df['Date']= pd.to_datetime(d_df['Date']) 

# Check the format of 'Date' column 
d_df.info() 


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
Date     3 non-null datetime64[ns]
Event    3 non-null object
Cost     3 non-null int64
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 124.0+ bytes


In [29]:
df['date'] = pd.to_datetime(df.date)
# compare dtypes of the original df with this one
# type your code here


serial_no                      int32
date                  datetime64[ns]
user_id                        int64
product_id                     int64
quantity_purchased             int32
price                        float64
user_type                     object
dtype: object


### Apply/Map Usage

Map : Create a derived attribute using map. map() works element wise.

In [30]:
df.tail()

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type
995,1995,2016-08-01,5078,195,13,4018.09,c
996,1996,2016-04-02,5860,702,13,2529.63,a
997,1997,2016-06-02,5228,115,20,2487.43,b
998,1998,2016-01-02,5587,433,14,4009.84,c
999,1999,2016-07-01,5257,387,11,2599.67,b


In [31]:
df['user_class'] = df['user_type'].map(expand_user_type)
# map function applies the user defn method expand_user_type to each value of 
# user_type
display(df.tail())

# -- expand_user_type function is defn in the Utilities. 
# -- Its body is shown here just for reference.
#  def expand_user_type(u_type):
#   if u_type in ['a','b']:
#   return 'new'
#   elif u_type == 'c':
#   return 'existing'
#   elif u_type == 'd':
#   return 'loyal_existing'
#   else:
#   return 'error'

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type,user_class
995,1995,2016-08-01,5078,195,13,4018.09,c,existing
996,1996,2016-04-02,5860,702,13,2529.63,a,new
997,1997,2016-06-02,5228,115,20,2487.43,b,new
998,1998,2016-01-02,5587,433,14,4009.84,c,existing
999,1999,2016-07-01,5257,387,11,2599.67,b,new


Apply: Using apply to get attribute ranges

In [32]:
# The apply() function is used to perform actions on the whole object, 
# depending upon the axis (default is on all rows).
display(df.select_dtypes(include=[np.number]).apply(lambda x: 
                                                        x.max()- x.min()))

# type your code here



serial_no             2000.0
user_id               6089.0
product_id            1098.0
quantity_purchased      40.0
price                 9529.6
dtype: float64

Applymap: Extract week from date

In [33]:
df['purchase_week'] = df[['date']].applymap(lambda dt:dt.week 
                                                if not pd.isnull(dt.week) 
                                                else 0)

# lambda fn gets the week of the transaction from the date attribute

In [34]:
df.head()

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type,user_class,purchase_week
0,-1,NaT,-101,0,30,,n,error,0
1,1001,NaT,5801,650,32,218.24,n,error,0
2,1002,2016-07-02,5220,318,17,135.59,n,error,26
3,-1,2016-01-02,5609,658,27,,n,error,53
4,1004,NaT,5030,705,9,2545.91,n,error,0


In [35]:
#to print week from date
df['date'].dt.week
#df.date.dt.week

0       NaN
1       NaN
2      26.0
3      53.0
4       NaN
       ... 
995    31.0
996    13.0
997    22.0
998    53.0
999    26.0
Name: date, Length: 1000, dtype: float64

In [36]:
 display(df.head()) # type your comment here


Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type,user_class,purchase_week
0,-1,NaT,-101,0,30,,n,error,0
1,1001,NaT,5801,650,32,218.24,n,error,0
2,1002,2016-07-02,5220,318,17,135.59,n,error,26
3,-1,2016-01-02,5609,658,27,,n,error,53
4,1004,NaT,5030,705,9,2545.91,n,error,0


### Missing Values

Imputing Missing Values : Missing values can lead to all sorts of problems when dealing with Machine Learning and Data Science related use cases. Not only can they cause problems for algorithms, they can mess up calculations and even final outcomes. 

Missing values also pose risk of being interpreted in non-standard ways as well leading to confusion and more errors. Hence, imputing missing values carries a lot of weight in the overall data wrangling process.

One of the easiest ways of handling missing values is to ignore or remove them altogether from the dataset. When the dataset is fairly large and we have enough samples of various types required, this option can be safely exercised. We use the dropna() function from pandas in the following snippet to remove rows of data where the date of transaction is missing.

In [37]:
print("Drop Rows with missing dates::" )
df_dropped = df.dropna(subset=['date'])
display(df_dropped.head())

# # dropna -> drops not available values.i.e naT in this case

Drop Rows with missing dates::


Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type,user_class,purchase_week
2,1002,2016-07-02,5220,318,17,135.59,n,error,26
3,-1,2016-01-02,5609,658,27,,n,error,53
5,1005,2016-02-02,5283,1003,3,1289.71,n,error,5
8,1008,2016-01-31,5533,968,36,2768.64,n,error,4
10,1010,2016-01-29,5693,665,6,1061.08,n,error,4


Fill Missing Price values with mean price

In [38]:
# Often dropping rows is a very expensive and unfeasible option. 
# In many scenarios, missing values are imputed using the help of other 
# values in the dataframe. One commonly used trick is to replace missing
# values with a central tendency measure like mean or median.
# fillna -> fills 'not available' values
df_dropped['price'].fillna(value=np.round(df.price.mean(),decimals=2),
                                inplace=True)
# type your code here


2      135.59
3     2273.53
5     1289.71
8     2768.64
10    1061.08
Name: price, dtype: float64

In [39]:
# type your code here


0

Fill Missing user_type values with value from previous row (forward fill) 

In [40]:
print("Fill Missing user_type values with value from previous row (forward fill) ::" )
df_dropped['user_type'].fillna(method='ffill',inplace=True)
df_dropped.head()

Fill Missing user_type values with value from previous row (forward fill) ::


Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type,user_class,purchase_week
2,1002,2016-07-02,5220,318,17,135.59,n,error,26
3,-1,2016-01-02,5609,658,27,2273.53,n,error,53
5,1005,2016-02-02,5283,1003,3,1289.71,n,error,5
8,1008,2016-01-31,5533,968,36,2768.64,n,error,4
10,1010,2016-01-29,5693,665,6,1061.08,n,error,4


Fill Missing user_type values with value from next row (backward fill)

In [41]:
df_dropped['user_type'].fillna(method='bfill',inplace=True)
# keeping inplace true writes the new data in the data set itself.

inplace=True   i.e like a++
df-> changes copies to original
thereby new data space not consumed.

------

inplace=False i.e like a=a+1
df-> changes not copied to original
thereby ne data space created.

//here do this kind of coding
dfNew = df.fillna(...,inplace=False)

### Duplicates

Drop Duplicate serial_no rows

In [42]:
df_dropped.shape

(969, 9)

In [43]:
df_dropped.head()

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type,user_class,purchase_week
2,1002,2016-07-02,5220,318,17,135.59,n,error,26
3,-1,2016-01-02,5609,658,27,2273.53,n,error,53
5,1005,2016-02-02,5283,1003,3,1289.71,n,error,5
8,1008,2016-01-31,5533,968,36,2768.64,n,error,4
10,1010,2016-01-29,5693,665,6,1061.08,n,error,4


In [44]:
# sample duplicates
# duplicated is a build-in fn
# Lets display the duplicated serial_no
display(df_dropped[df_dropped.duplicated(subset=['serial_no'])].head())
# type your code here


Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type,user_class,purchase_week
21,-1,2016-02-02,5421,551,34,2049.11,n,error,5
34,-1,2016-01-24,5307,1063,32,5750.89,b,new,3
66,-1,2016-01-15,5019,312,21,1367.79,d,loyal_existing,2
73,-1,2016-01-28,5682,677,30,3011.89,a,new,4
108,-1,2016-07-02,5533,968,39,443.23,b,new,26


Shape of df=(969, 9)


In [45]:
# drop_duplicates is a build-in function 
df_dropped.drop_duplicates(subset=['serial_no'],inplace=True)

In [46]:
# updated dataframe
display(df_dropped.head())
print("Shape of df={}".format(df_dropped.shape))

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type,user_class,purchase_week
2,1002,2016-07-02,5220,318,17,135.59,n,error,26
3,-1,2016-01-02,5609,658,27,2273.53,n,error,53
5,1005,2016-02-02,5283,1003,3,1289.71,n,error,5
8,1008,2016-01-31,5533,968,36,2768.64,n,error,4
10,1010,2016-01-29,5693,665,6,1061.08,n,error,4


Shape of df=(941, 9)


Remove rows which have less than 3 attributes with non-missing data

In [47]:
# there are certain conditions where a record is not much of use 
# if it has more than a certain threshold of attribute values missing. 
# For instance, if in our dataset a transaction has less than three
# attributes as non-null, the transaction might almost be unusable. 
# In such a scenario, it might be advisable to drop that data point itself. 
# We can filter out such data points using the function dropna() 
# with the parameter thresh set to the threshold of non-null attributes

# In short, it removes each row which has more than 3 non-null values
display(df.dropna(thresh=3).head())
print("Shape of df={}".format(df.dropna(thresh=3).shape))

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type,user_class,purchase_week
0,-1,NaT,-101,0,30,,n,error,0
1,1001,NaT,5801,650,32,218.24,n,error,0
2,1002,2016-07-02,5220,318,17,135.59,n,error,26
3,-1,2016-01-02,5609,658,27,,n,error,53
4,1004,NaT,5030,705,9,2545.91,n,error,0


Shape of df=(1000, 9)


### Encode Categoricals

One Hot Encoding using get_dummies()

In [48]:
# method to convert the categorical variable into indicator variables 
# use the get_dummies() function.
# type your code here


Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_class,purchase_week,user_type_a,user_type_b,user_type_c,user_type_d,user_type_n
0,-1,NaT,-101,0,30,,error,0,0,0,0,0,1
1,1001,NaT,5801,650,32,218.24,error,0,0,0,0,0,1
2,1002,2016-07-02,5220,318,17,135.59,error,26,0,0,0,0,1
3,-1,2016-01-02,5609,658,27,,error,53,0,0,0,0,1
4,1004,NaT,5030,705,9,2545.91,error,0,0,0,0,0,1


Label Mapping

In [49]:
# using the map() function, where we simply map each value 
# from the allowed set to a numeric value
type_map={'a':0,'b':1,'c':2,'d':3,np.NAN:-1}
df['encoded_user_type'] = df.user_type.map(type_map)
# type your code here


Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type,user_class,purchase_week,encoded_user_type
995,1995,2016-08-01,5078,195,13,4018.09,c,existing,31,2.0
996,1996,2016-04-02,5860,702,13,2529.63,a,new,13,0.0
997,1997,2016-06-02,5228,115,20,2487.43,b,new,22,1.0
998,1998,2016-01-02,5587,433,14,4009.84,c,existing,53,2.0
999,1999,2016-07-01,5257,387,11,2599.67,b,new,26,1.0


### Random Sampling data from DataFrame

In [50]:
display(df.sample(frac=0.2, replace=True, random_state=42).head())
# explaination for replace parameter
# when sampling, the records are removed from the orginal dataset, so that
# their is no repetition of samples in the o/p.
# but if the size of the sample is greater than the dataset itself then keep
# replace = true
"""replace=True is like CBV
replace=False is like CBR"""

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type,user_class,purchase_week,encoded_user_type
102,1102,2016-01-27,5220,318,15,3935.87,c,existing,4,2.0
435,1435,2016-01-20,5882,1057,36,4950.15,d,loyal_existing,3,3.0
860,1860,2016-10-02,5463,1085,30,701.31,a,new,39,0.0
270,1270,NaT,5510,1091,7,513.53,d,loyal_existing,0,3.0
106,1106,2016-01-29,5173,440,19,1887.81,d,loyal_existing,4,3.0


'replace=True is like CBV\nreplace=False is like CBR'

### Normalizing Numeric Values

Attribute normalization is the process of standardizing the range of values of attributes. Machine learning algorithms in many cases utilize distance metrics, attributes or features of different scales/ranges which might adversely affect the calculations or bias the outcomes. Normalization is also called feature scaling.

Normalize price values using  **Min-Max Scaler**

In [51]:
# Visit this link before you proceed
# https://www.geeksforgeeks.org/python-difference-between-pandas-copy-and-copying-through-variables/
df_normalized = df.dropna().copy() 
min_max_scaler = preprocessing.MinMaxScaler()
np_scaled = min_max_scaler.fit_transform(df_normalized['price'].values.reshape(-1,1))#all scaling functions works on series
df_normalized['price'] = np_scaled.reshape(-1,1)

# reshape(-1,1) works like this :
# -1 means we dont know the no. of rows. hence it would take len(df).
# the 2nd parameter being 1 -> means I want only 1 column
# ex: assume z is 2D numpy array and z.shape is (3,4) 
# z.reshape(-1) would give you a 1D np.array
# now z.shape would be (12,)  like 
# array([1,2,3,4,5,6,7,8,9,10,11,12])
# and reshaping it as z.reshape(-1,1) would give us (12,1). 
# i.e 1 column with all row values. 

In [124]:
display(df_normalized.head())

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type,user_class,purchase_week,encoded_user_type
17,1017,2016-04-01,5627,183,15,0.016721,b,new,13,1.0
24,1024,2016-06-01,5459,884,24,0.295898,c,existing,22,2.0
25,1025,2016-05-01,5943,1087,8,0.09108,c,existing,17,2.0
27,1027,2016-01-17,5728,951,7,0.02096,b,new,2,1.0
28,1028,2016-01-18,5640,230,17,0.491741,c,existing,3,2.0


Normalize quantity purchased values using  **Robust Scaler**

In [52]:
df_normalized = df.dropna().copy()
robust_scaler = preprocessing.RobustScaler()
rs_scaled = robust_scaler.fit_transform(df_normalized['quantity_purchased'].values.reshape(-1,1))
# type your code here


# interested people may research the maths behind min-max scaler and
# robust_scaler. 

In [53]:
display(df_normalized.head())

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type,user_class,purchase_week,encoded_user_type
16,1016,2016-01-18,5790,984,-0.428571,3210.53,c,existing,3,2.0
26,1026,2016-01-19,5221,685,-0.380952,2372.25,b,new,3,1.0
28,1028,2016-01-22,5475,734,-0.761905,1436.85,c,existing,3,2.0
29,1029,2016-05-01,5669,800,0.714286,5418.46,d,loyal_existing,17,3.0
32,1032,2016-03-01,5164,582,0.238095,3630.39,c,existing,9,2.0


### Data Summarization

Data summarization refers to the process of preparing a compact representation of raw data at hand. This process involves aggregation of data using different statistical, mathematical, and other methods. Summarization is helpful for visualization, compressing raw data, and better understanding of its attributes.

Condition based aggregation

In [54]:
print("Mean price of items purchased by user_type=a :: {}".format(df['price'][df['user_type']=='a'].mean()))

Mean price of items purchased by user_type=a :: 2275.8404910714285


Condtion based counts

In [55]:
print(df['purchase_week'].value_counts())
# counts the number of transactions per week

4     171
3     155
2     128
26     56
9      53
35     52
53     48
44     46
39     45
31     43
5      41
13     40
22     38
0      31
17     21
18     17
48     15
Name: purchase_week, dtype: int64


### Group By

Group By certain attributes

In [56]:
print(df.groupby(['user_class'])['quantity_purchased'].sum())
# This statement generates a tabular output representing 
# sum of quantities purchased by each user_class.

user_class
error               569
existing           5100
loyal_existing     5077
new               10351
Name: quantity_purchased, dtype: int32


Group By with different aggregate functions

In [57]:
# The groupby() function is a powerful interface that allows us 
# to perform complex groupings and aggregations.
# With groupby() we can perform multi-attribute groupings 
# and apply multiple aggregations across attributes.

# variant-1: multiple aggregations on single attribute
display(df.groupby(['user_class'])['quantity_purchased'].agg([np.sum,
                                                                np.mean,
                                                                np.count_nonzero]))

Unnamed: 0_level_0,sum,mean,count_nonzero
user_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
error,569,18.354839,31
existing,5100,21.161826,241
loyal_existing,5077,20.722449,245
new,10351,21.430642,483


Group by specific aggregate functions for each attribute

In [58]:
# variant-2: different aggregation functions for each attribute
display(df.groupby(['user_class','user_type']).agg({'price':np.mean,
                                                        'quantity_purchased':np.max}))

Unnamed: 0_level_0,Unnamed: 1_level_0,price,quantity_purchased
user_class,user_type,Unnamed: 2_level_1,Unnamed: 3_level_1
error,n,2194.714286,39
existing,c,2241.8897,41
loyal_existing,d,2312.115443,41
new,a,2275.840491,41
new,b,2273.187814,41


Group by with multiple agg for each attribute

In [59]:
# Variant 3: Here, we do a combination of variants 1 and 2, 
# i.e., we apply multiple aggregations on the price field while 
# applying only a single one on quantity_purchased. 
# Note : a dictionary is passed, as shown in the snippet.
display(df.groupby(['user_class','user_type']).agg({'price':{  'total_price':np.sum,
                                                                'mean_price':np.mean,
                                                                'variance_price':np.std,
                                                                'count':np.count_nonzero},
                                                   'quantity_purchased':np.sum}))  

Unnamed: 0_level_0,Unnamed: 1_level_0,price,price,price,price,quantity_purchased
Unnamed: 0_level_1,Unnamed: 1_level_1,total_price,mean_price,variance_price,count,sum
user_class,user_type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
error,n,61452.0,2194.714286,1590.653715,31.0,569
existing,c,522360.3,2241.8897,1658.993006,241.0,5100
loyal_existing,d,547971.36,2312.115443,1571.826738,245.0,5077
new,a,509788.27,2275.840491,1491.190211,230.0,5024
new,b,561477.39,2273.187814,1563.262156,253.0,5327


### Pivot Tables

In [60]:
#HW: Remember its same as stack() and unstack()
display(df.groupby(['user_class','user_type']).agg({'price':{  'total_price':np.sum,
                                                                'mean_price':np.mean,
                                                                'variance_price':np.std,
                                                                'count':np.count_nonzero},
                                                   'quantity_purchased':np.sum}))  

# pivot table shows us comprehensive information of mean price 
# date-wise , user_type wise

Unnamed: 0_level_0,Unnamed: 1_level_0,price,price,price,price,quantity_purchased
Unnamed: 0_level_1,Unnamed: 1_level_1,total_price,mean_price,variance_price,count,sum
user_class,user_type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
error,n,61452.0,2194.714286,1590.653715,31.0,569
existing,c,522360.3,2241.8897,1658.993006,241.0,5100
loyal_existing,d,547971.36,2312.115443,1571.826738,245.0,5077
new,a,509788.27,2275.840491,1491.190211,230.0,5024
new,b,561477.39,2273.187814,1563.262156,253.0,5327
