In [1]:
# Import required modules
import pandas as pd
import glob
import numpy as np
from agefromname import AgeFromName
from concurrent.futures import ProcessPoolExecutor
# from ethnicolr import census_ln, pred_census_ln

In [2]:
# All the files to read. Unzip "allProviders.zip"
filesToRead = glob.glob("/home/faysal/Desktop/order-data-analysis/allProviders/*.csv")

# Read in the data
df = pd.concat([pd.read_csv(file, dtype="unicode") for file in filesToRead])

# Remove unnamed  and unnecescolumns
df = df.loc[:, ~df.columns.str.contains("Unnamed")]

# Preview the data
df.head()

Unnamed: 0,provider,Order Status,Order Date,Customer Note,First Name (Billing),Last Name (Billing),Company (Billing),Address 1&2 (Billing),City (Billing),State Code (Billing),...,Order Total Amount,Order Total Tax Amount,Item #,Item Name,Quantity,Item Cost,Discount Amount,Discount Amount Tax,Currency,Order Total Amount.1
0,Course Gate,Completed,2020-12-31 22:10,,Terence,Martin,,,,,...,27.96,0,1,Copywriting Course Level 4,1,6.99,0.0,0.0,,
1,Course Gate,Completed,2020-12-31 22:10,,Terence,Martin,,,,,...,27.96,0,2,Advanced Web Development Course,1,6.99,,,,
2,Course Gate,Completed,2020-12-31 22:10,,Terence,Martin,,,,,...,27.96,0,3,Persuasion and Influence Psychology,1,6.99,,,,
3,Course Gate,Completed,2020-12-31 22:10,,Terence,Martin,,,,,...,27.96,0,4,Bitcoin and Cryptocurrency Course,1,6.99,,,,
4,Course Gate,Completed,2020-12-31 19:58,Course selected: Paralegal Training Diploma,Kira,Barcelo,,,,,...,10.99,0,1,10.99 course deal,1,10.99,,,,


In [3]:
# Rename columns and drop unnecessary columns
columnsDict = {"provider":"courseProvider",
               "Order Number":"orderNumber",
               "Order Status":"orderStatus",
               "Order Date":"orderDate",
               "Customer Note":"customerNote",
               "First Name (Billing)":"firstNameBilling",
               "Last Name (Billing)":"lastNameBilling",
               "Company (Billing)":"companyBilling",
               "Address 1&2 (Billing)":"addressBilling",
               "City (Billing)":"cityBilling",
               "State Code (Billing)":"stateCodeBilling",
               "Statecode (Billing)":"stateCodeBilling",
               "Post Code (Shipping)":"postCodeShipping",
               "Country Code (Billing)":"countryCodeBilling",
               "Email (Billing)":"emailBilling",
               "Phone (Billing)":"phoneBilling",
               "First Name (Shipping)":"firstNameShipping",
               "Last Name (Shipping)":"lastNameShipping",
               "Address 1&2 (Shipping)":"addressShipping",
               "City (Shipping)":"cityShipping",
               "State Code (Shipping)":"stateCodeShipping",
               "Postcode (Shipping)":"postCodeShipping",
               "Country Code (Shipping)":"countryCodeShipping",
               "Payment Method Title":"paymentMethodTitle",
               "Cart Discount Amount":"cartDiscountAmount",
               "Order Subtotal Amount":"orderSubtotalAmount",
               "Shipping Method Title":"shippingMethodTitle",
               "Order Shipping Amount":"orderShippingAmount",
               "Order Refund Amount":"orderRefundAmount",
               "Order Total Amount":"orderTotalAmount",
               "Order Total Tax Amount":"orderTotalTaxAmount",
               "SKU":"sku",
               "Item #":"itemNum",
               "Item Name":"itemName",
               "Quantity":"quantity",
               "Item Cost":"itemCost",
               "Coupon Code":"couponCode",
               "Discount Amount":"discountAmount",
               "Discount Amount Tax":"discountAmountTax",
               "Transaction ID":"transactionId",
               "Transaction ID.1":"transactionId1",
               "Currency":"currency",
               "Order Total Amount.1":"orderTotalAmount1",
               "Postcode (Billing)":"postCodeBilling"               
               }

dfColRenamed = df.rename(columns=columnsDict)

In [4]:
# Convert columns into required dtypes
dfColRenamed = dfColRenamed.convert_dtypes()
dfColRenamed.dtypes

courseProvider         string
orderStatus            string
orderDate              string
customerNote           string
firstNameBilling       string
lastNameBilling        string
companyBilling         string
addressBilling         string
cityBilling            string
stateCodeBilling       string
postCodeBilling        string
countryCodeBilling     string
firstNameShipping      string
lastNameShipping       string
addressShipping        string
cityShipping           string
stateCodeShipping       Int64
postCodeShipping        Int64
countryCodeShipping     Int64
paymentMethodTitle     string
cartDiscountAmount     string
orderSubtotalAmount    string
shippingMethodTitle    string
orderShippingAmount    string
orderRefundAmount      string
orderTotalAmount       string
orderTotalTaxAmount    string
itemNum                string
itemName               string
quantity               string
itemCost               string
discountAmount         string
discountAmountTax      string
currency  

In [5]:
# Convert columns into required dtypes
dfColRenamed[[
             "cartDiscountAmount",
             "orderSubtotalAmount",
             "orderShippingAmount",
             "orderRefundAmount",
             "orderTotalAmount",
             "orderTotalTaxAmount",
             "itemNum",
             "quantity",
             "itemCost",
             "discountAmount",
             "discountAmountTax",
             "orderTotalAmount1"]] \
= dfColRenamed[["cartDiscountAmount",
             "orderSubtotalAmount",
             "orderShippingAmount",
             "orderRefundAmount",
             "orderTotalAmount",
             "orderTotalTaxAmount",
             "itemNum",
             "quantity",
             "itemCost",
             "discountAmount",
             "discountAmountTax",
             "orderTotalAmount1"
             ]].apply(lambda x: pd.to_numeric(x, errors="coerce"))

In [6]:
# Check out the column names now
dfColRenamed.columns

Index(['courseProvider', 'orderStatus', 'orderDate', 'customerNote',
       'firstNameBilling', 'lastNameBilling', 'companyBilling',
       'addressBilling', 'cityBilling', 'stateCodeBilling', 'postCodeBilling',
       'countryCodeBilling', 'firstNameShipping', 'lastNameShipping',
       'addressShipping', 'cityShipping', 'stateCodeShipping',
       'postCodeShipping', 'countryCodeShipping', 'paymentMethodTitle',
       'cartDiscountAmount', 'orderSubtotalAmount', 'shippingMethodTitle',
       'orderShippingAmount', 'orderRefundAmount', 'orderTotalAmount',
       'orderTotalTaxAmount', 'itemNum', 'itemName', 'quantity', 'itemCost',
       'discountAmount', 'discountAmountTax', 'currency', 'orderTotalAmount1'],
      dtype='object')

In [7]:
# Lower case rquired columns
dfColRenamed[dfColRenamed.select_dtypes("string").columns] = dfColRenamed.select_dtypes("string").apply(lambda x: x.str.lower().str.strip())

In [8]:
# Create gender from first name
age_from_name = AgeFromName()

# Create a function to extract gender from first name using multiprocessing
def extractGender(name):
    """name = first name"""
    
    # Apply the function from the module
    isMale = age_from_name.prob_male(name)
    
    df = pd.DataFrame({
        "name":name,
        "isMale":isMale
    }, index=[0])
    
    return df

In [9]:
%%time
# Extract gender from first name
# Apply the function using multiprocessing to speed up the conversion
"""This is an 8 core, 16 thread cpu. This time will vary depending on cpu config"""
with ProcessPoolExecutor() as ex:
    genderDf = pd.concat(list(ex.map(extractGender, dfColRenamed.firstNameBilling.unique())))

CPU times: user 11.6 s, sys: 913 ms, total: 12.5 s
Wall time: 5min 40s


In [10]:
# If probablity is greater than 50%, its male, exactly 50% its neutral and less than 50% its female 
genderDf["gender"] = np.where(genderDf.isMale>0.5, "M",
                             (np.where(genderDf.isMale==0.5, "N", "F")))

In [11]:
# The frequency of gender now
genderDf.gender.value_counts(normalize=True)

N    0.442715
F    0.350442
M    0.206843
Name: gender, dtype: float64

In [12]:
# We have to conver those neutral gender into male and female based on the frequency we found above
# We will convert 62% of neutral into "female" and rest 38% into "male"
genderNeutral = genderDf[genderDf.gender=="N"]
toFemaleLen = round(genderNeutral.shape[0]*0.62)
toMaleLen = genderNeutral.shape[0]-toFemaleLen

# Convert
genderNeutral.iloc[toMaleLen:].gender = "F"
genderNeutral.iloc[:toMaleLen].gender = "M"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [13]:
# Replace neutral of gender with mutated gender and count the male-female ratio
finalGender = pd.concat([genderDf[genderDf.gender!="N"], genderNeutral]).reset_index(drop=True)
finalGender.gender.value_counts(normalize=True)

F    0.624945
M    0.375055
Name: gender, dtype: float64

In [14]:
# Let's see what we've transformed
finalGender.head()

Unnamed: 0,name,isMale,gender
0,terence,0.994269,M
1,kira,0.000809,F
2,stephanie,0.003533,F
3,gerald,0.993997,M
4,lucy,0.001617,F


In [15]:
# Create another function to calculate age from gender and first name
def calculateAge(name, gender):
    """name = first name,
    gender = M or F"""
    
    # Apply the function from the module
    try:
        dateOfBirth = age_from_name.argmax(name, gender)
    except:
        pass
    
    # Create a dataframe
    try:
        df = pd.DataFrame({
            "name":name,
            "gender":gender,
            "age":2021-dateOfBirth
        }, index=[0])
        
    except:
        df = pd.DataFrame({
            "name":name,
            "gender":gender,
            "age":"na"
        }, index=[0])
        
    return df

In [16]:
%%time
# Extract age using name and gender
# Apply the function using multiprocessing to speed up the conversion
with ProcessPoolExecutor() as ex:
    ageDf = pd.concat(list(ex.map(calculateAge, genderDf.name, genderDf.gender)))

CPU times: user 8.67 s, sys: 617 ms, total: 9.29 s
Wall time: 1min 37s


In [17]:
# Merge genderDf and ageDf on name
nameAgeGen = pd.merge(finalGender, ageDf, on="name")
nameAgeGen = nameAgeGen[["name", "gender_x", "age"]].rename(columns={"gender_x":"gender", "name":"firstNameBilling"})

In [18]:
# Insert age and gender into main order data
finalDf = pd.merge(dfColRenamed, nameAgeGen, on="firstNameBilling")
finalDf.head(10)

Unnamed: 0,courseProvider,orderStatus,orderDate,customerNote,firstNameBilling,lastNameBilling,companyBilling,addressBilling,cityBilling,stateCodeBilling,...,itemNum,itemName,quantity,itemCost,discountAmount,discountAmountTax,currency,orderTotalAmount1,gender,age
0,course gate,completed,2020-12-31 22:10,,terence,martin,,,,,...,1.0,copywriting course level 4,1.0,6.99,0.0,0.0,,,M,57
1,course gate,completed,2020-12-31 22:10,,terence,martin,,,,,...,2.0,advanced web development course,1.0,6.99,,,,,M,57
2,course gate,completed,2020-12-31 22:10,,terence,martin,,,,,...,3.0,persuasion and influence psychology,1.0,6.99,,,,,M,57
3,course gate,completed,2020-12-31 22:10,,terence,martin,,,,,...,4.0,bitcoin and cryptocurrency course,1.0,6.99,,,,,M,57
4,course gate,completed,2020-12-31 18:45,,terence,martin,,,,,...,1.0,mastering sales techniques,1.0,6.99,0.0,0.0,,,M,57
5,course gate,completed,2020-12-31 18:45,,terence,martin,,,,,...,2.0,diploma in copywriting,1.0,6.99,,,,,M,57
6,course gate,completed,2020-12-31 18:45,,terence,martin,,,,,...,3.0,diploma in financial management,1.0,6.99,,,,,M,57
7,course gate,completed,2020-12-31 18:45,,terence,martin,,,,,...,4.0,complete wordpress web design diploma,1.0,6.99,,,,,M,57
8,course gate,completed,2020-12-31 18:45,,terence,martin,,,,,...,5.0,5 personal development courses (special bundle...,1.0,6.99,,,,,M,57
9,course gate,completed,2020-12-31 18:45,,terence,martin,,,,,...,6.0,advanced diploma in blogging,1.0,6.99,,,,,M,57


In [19]:
# Create a pickle file to save the data
finalDf.to_pickle("combinedOrdersData.pickle")