In [None]:
# Import dependencies
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt


In [None]:
from datetime import date
import plotly.express as px
import seaborn as sns
# Import linear regression from the SciPy stats module.
from scipy.stats import linregress

In [None]:
# Load the marketing-campaign customer data and preview the first ten rows
clean_customerdata_df = pd.read_csv(filepath_or_buffer="Resources/marketing_campaign.csv")
clean_customerdata_df.head(n=10)

In [None]:
# Summary statistics for the freshly loaded data (before any cleaning)
clean_customerdata_df.describe()

In [None]:
# Report the (rows, columns) size, then the per-column dtypes and non-null counts
n_rows, n_cols = clean_customerdata_df.shape
print((n_rows, n_cols))
clean_customerdata_df.info()

In [None]:
# Number of missing values in each column
clean_customerdata_df.isna().sum()

In [None]:
# Drop rows where Income is NA.
# `subset` restricts the check to the Income column; without it, a missing value
# in ANY column would silently remove the row as well.
clean_customerdata_df.dropna(axis=0, subset=["Income"], inplace=True)

# Income

In [None]:
# Compare the full income distribution (top) with incomes below $600,000 (bottom).
# Both histograms are drawn on explicitly created axes: DataFrame.hist() without an
# `ax` opens a figure of its own, which would leave the first panel empty.
fig, (ax_all, ax_capped) = plt.subplots(nrows=2, ncols=1)

clean_customerdata_df['Income'].plot.hist(ax=ax_all)
ax_all.set_xlabel("Income")
ax_all.set_ylabel("No of Customers")
ax_all.set_title('Histogram of Income')

clean_customerdata_df.query("Income < 600000")['Income'].plot.hist(ax=ax_capped)
ax_capped.set_xlabel("Income")
ax_capped.set_ylabel("No of Customers")
ax_capped.set_title('Histogram of Income (<$600000)')

# Keep the titles and axis labels of the stacked panels from overlapping
fig.tight_layout()
plt.show()

In [None]:
# Interactive 30-bin histogram of customer income, with both axes labelled
fig = px.histogram(
    clean_customerdata_df['Income'], nbins=30, title="Customer's Income Distribution"
).update_layout(xaxis_title="Customer's Income", yaxis_title="Frequency")

fig.show()

In [None]:
# Remove income outliers so that only customers with Income < 600000 remain.
# The cut-off uses >= so the kept rows match the "Income < 600000" filters and the
# "(<$600K)" plot titles used in this notebook.
clean_customerdata_df.drop(
    clean_customerdata_df[clean_customerdata_df["Income"] >= 600000].index,
    inplace=True,
)

In [None]:
# Interactive 40-bin histogram of income after the outlier rows were removed
fig = px.histogram(
    clean_customerdata_df['Income'], nbins=40, title="Customer's Income Distribution (<$600K)"
).update_layout(xaxis_title="Customer's Income", yaxis_title="Frequency")

fig.show()

# NOTE(review): everything from here to plt.show() looks like leftover scratch work;
# it plots three hard-coded values and does not use the customer data.
# Consider removing it.
#clean_customerdata_df.query("Income < 600000")['Income']

# Hard-coded category names and values for the demo figure
names = ['group_a', 'group_b', 'group_c']
values = [1, 10, 100]

plt.figure(figsize=(9, 3))

# One row of three panels: bar, scatter and line plot of the same values
plt.subplot(131)
plt.bar(names, values)
plt.subplot(132)
plt.scatter(names, values)
plt.subplot(133)
plt.plot(names, values)
plt.suptitle('Categorical Plotting')
plt.show()

# Age

In [None]:
# Year of Birth
def get_age(year_birth, reference_year=None):
    """Return the age in whole calendar years for a given year of birth.

    Parameters
    ----------
    year_birth : int
        Year in which the person was born.
    reference_year : int, optional
        Year against which the age is measured. Defaults to the current
        calendar year. Pass a fixed year to make the result reproducible
        regardless of when the notebook is run.

    Returns
    -------
    int
        ``reference_year - year_birth``. Only years are compared, so month and
        day of birth are not taken into account.
    """
    if reference_year is None:
        reference_year = date.today().year
    return reference_year - year_birth

In [None]:
# Derive each customer's age from the year of birth, then summarise the new column
clean_customerdata_df["Age"] = clean_customerdata_df["Year_Birth"].map(get_age)
clean_customerdata_df["Age"].describe()

In [None]:
# Interactive 25-bin histogram of customer age (before removing implausible ages)
fig = px.histogram(
    clean_customerdata_df["Age"],
    nbins=25,
    title="Customer's Age Distribution",
    color_discrete_sequence=['indianred'],
).update_layout(xaxis_title="Customer's Age", yaxis_title="Frequency")

fig.show()

In [None]:
# Remove every customer whose computed age is above 90
clean_customerdata_df.drop(index=clean_customerdata_df.query("Age > 90").index, inplace=True)

In [None]:
# Same age histogram again, now that ages above 90 have been dropped
fig = px.histogram(
    clean_customerdata_df["Age"],
    nbins=25,
    title="Customer's Age Distribution",
    color_discrete_sequence=['indianred'],
).update_layout(xaxis_title="Customer's Age", yaxis_title="Frequency")

fig.show()

### Statistics Canada Age Categories:
Children (00-14 years)  
Youth (15-24 years)  
Adults (25-64 years)  
Seniors (65 years and over)  

In [None]:
# Bucket ages into the four categories listed above. pd.cut uses right-closed
# intervals by default, so the bins are (0, 14], (14, 24], (24, 64] and (64, 90].
clean_customerdata_df['Age_Group'] = pd.cut(
    clean_customerdata_df['Age'],
    bins=[0, 14, 24, 64, 90],
    labels=['Child', 'Youth', 'Adult', 'Senior'],
)

In [None]:
# How many customers fall into each age group
age_group_sizes = clean_customerdata_df["Age_Group"].value_counts()
print("Count per category in the feature Age_Group:\n", age_group_sizes, "\n")

In [None]:
# Count customers per age group first and plot one bar per group.
# Passing the raw frame to px.bar with only `x` draws a separate bar segment for
# every row and leaves the y axis without a meaningful label.
# sort=False keeps the groups in their category order (Child, Youth, Adult, Senior).
age_group_counts = (
    clean_customerdata_df["Age_Group"]
    .value_counts(sort=False)
    .rename_axis("Age_Group")
    .reset_index(name="Customers")
)
fig = px.bar(
    age_group_counts,
    x="Age_Group",
    y="Customers",
    title="Customers per Age Group",
)
fig.update_layout(xaxis_title="Age Group", yaxis_title="No of Customers")
fig.show()

# Date Joined

In [None]:
# Parse the enrollment column into datetimes.
# NOTE(review): no explicit format/dayfirst is given, so ambiguous day/month strings
# are parsed with pandas' defaults — confirm this matches the date format in the file.
clean_customerdata_df["Date_Enrolled"] = pd.to_datetime(clean_customerdata_df["Date_Enrolled"])

# Calendar date (time of day dropped) of every customer's enrollment, in row order
dates = [enrolled_at.date() for enrolled_at in clean_customerdata_df["Date_Enrolled"]]

# Enrollment dates of the newest and oldest recorded customer
newest_enrollment, oldest_enrollment = max(dates), min(dates)
print("The newest customer enrolled on:", newest_enrollment)
print("The oldest customer enrolled on:", oldest_enrollment)

In [None]:
# Cust_Days: number of days each customer has been enrolled, measured against the
# most recent enrollment date in the data (i.e. the newest customer has 0 days).
# refs: https://www.codegrepper.com/code-examples/python/extract+number+of+days+from+datatime+python
#       https://stackoverflow.com/questions/25646200/python-convert-timedelta-to-int-in-a-dataframe
max_date = max(dates)
days = [max_date - enrolled_on for enrolled_on in dates]

# Store the timedeltas, then reduce them to whole days held in the smallest
# integer dtype that fits the values
clean_customerdata_df["Cust_Days"] = days
clean_customerdata_df["Cust_Days"] = pd.to_numeric(
    clean_customerdata_df["Cust_Days"].dt.days,
    downcast='integer',
)

In [None]:
# Split the enrollment date into the year, the month name and the weekday name
enrolled = clean_customerdata_df['Date_Enrolled'].dt
clean_customerdata_df['Year_Joined'] = enrolled.year
clean_customerdata_df['Month_Joined'] = enrolled.strftime("%B")
clean_customerdata_df['Day_Joined'] = enrolled.day_name()


In [None]:
# Number of customers enrolled per month, shown in calendar order.
# Without category_orders the months appear in the order they first occur in the
# data. The month names are generated with the same strftime("%B") call that built
# Month_Joined, so the labels match exactly.
month_order = pd.date_range("2000-01-01", periods=12, freq="MS").strftime("%B").tolist()

fig = px.histogram(
    clean_customerdata_df,
    x="Month_Joined",
    category_orders={"Month_Joined": month_order},
    title="Customers Enrolled per Month",
)
fig.update_layout(
    bargap=0.2,
    xaxis_title="Month Joined",
    yaxis_title="No of Customers",
)
fig.show()

In [None]:
# Scatter of customer tenure (days) against income; the y axis is capped at 175,000
ax = clean_customerdata_df.plot.scatter(x='Cust_Days', y='Income')
ax.set_title("No of Days as Customer vs. Income (<$600K)")
ax.set_ylim(0, 175000)
plt.show(block=True)

# Marital Status, Family & Education

In [None]:
# Inspect the raw categories of the two demographic columns before remapping them
education = clean_customerdata_df["Education"]
marital_status = clean_customerdata_df["Marital_Status"]

# Distinct labels present in each column
print("Education Values: ", education.unique())
print("Marital_Status Values:", marital_status.unique())

# Frequency of each label
print("Count per category in the feature Marital_Status:\n", marital_status.value_counts(), "\n")
print("Count per category in the feature Education:\n", education.value_counts())

In [None]:
# Collapse marital status into two household types: "Partner" and "Single"
map_status = {
    **dict.fromkeys(["Together", "Married"], "Partner"),
    **dict.fromkeys(["YOLO", "Absurd", "Alone", "Widow", "Divorced"], "Single"),
}
clean_customerdata_df["Marital_Status_map"] = clean_customerdata_df["Marital_Status"].replace(
    to_replace=map_status
)

# Collapse education into three levels: "UnderGrad", "Graduate" and "PostGrad"
map_edu = {
    "Graduation": "Graduate",
    **dict.fromkeys(["Master", "PhD"], "PostGrad"),
    **dict.fromkeys(["Basic", "2n Cycle"], "UnderGrad"),
}
clean_customerdata_df["Education_map"] = clean_customerdata_df["Education"].replace(
    to_replace=map_edu
)

In [None]:
# Frequency of each category after the remapping
marital_map_counts = clean_customerdata_df["Marital_Status_map"].value_counts()
education_map_counts = clean_customerdata_df["Education_map"].value_counts()

print("Count per mapped category in Marital_Status:\n", marital_map_counts, "\n")
print("Count per mapped category in Education:\n", education_map_counts)

In [None]:
# Children = young children + teenagers living in the household
clean_customerdata_df["Children"] = clean_customerdata_df["Children_Per_Household"].add(
    clean_customerdata_df["Teens_Per_Household"]
)

In [None]:
# Family_Size = adults in the household (1 for "Single", 2 for "Partner") + children.
# Series.map() is used instead of Series.replace(): replacing strings with integers
# relies on pandas implicitly downcasting the object column to int, which recent
# pandas versions deprecate. Any status other than "Single"/"Partner" maps to NaN,
# which makes Family_Size NaN for that row.
adults_in_household = clean_customerdata_df["Marital_Status_map"].map({"Single": 1, "Partner": 2})
clean_customerdata_df["Family_Size"] = adults_in_household + clean_customerdata_df["Children"]

# has_kids: 1 when at least one child or teen lives in the household, otherwise 0
clean_customerdata_df["has_kids"] = np.where(clean_customerdata_df["Children"] > 0, 1, 0)


# Other Stats

In [None]:
# Each derived column is the row-wise sum of the listed source columns:
#   TotalNumPurchases - purchases made per customer across all channels
#   Total_Acc_Cmp     - accepted campaigns per customer (including Response)
#   Total_Spent       - spending across all product categories
derived_totals = {
    'TotalNumPurchases': ['Web_Purchases', 'Catalog_Purchases', 'Store_Purchases', 'Deal_Purchases'],
    'Total_Acc_Cmp': ['AcceptedCmp1', 'AcceptedCmp2', 'AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'Response'],
    "Total_Spent": ["Wines", "Fruits", "Meat", "Fish", "Sweets", "Gold"],
}

for total_col, parts in derived_totals.items():
    # Add the columns left to right, one at a time
    running = clean_customerdata_df[parts[0]]
    for part in parts[1:]:
        running = running + clean_customerdata_df[part]
    clean_customerdata_df[total_col] = running

In [None]:
# Re-check column dtypes and non-null counts now that the derived columns exist
clean_customerdata_df.info()

In [None]:
# Pairwise relationships between the selected features, coloured by has_kids.
# sns.pairplot builds its own figure, so no plt.figure() call is made beforehand;
# one would only leave an extra, empty figure in the output.
To_Plot = [ "Income", "Recency", "Cust_Days", "Age", "Total_Spent", "has_kids", "Family_Size"]
sns.pairplot(clean_customerdata_df[To_Plot], hue = "has_kids")
plt.show()

In [None]:
# Correlation matrix of the numeric (and boolean) columns only.
# The frame also holds text columns such as Education and Marital_Status, and
# DataFrame.corr() raises on non-numeric data in pandas >= 2.0, so the columns
# are selected explicitly first.
corrmat = clean_customerdata_df.select_dtypes(include=["number", "bool"]).corr()

plt.figure(figsize=(20,20))
sns.heatmap(corrmat, annot=True, center=0)
plt.title("Correlation Matrix of Numeric Features")
plt.show()

In [None]:
def plot_linear_regression(x_values, y_values, title, y_label, text_coordinates,
                           x_label='Income (<$600k)'):
    """Scatter-plot ``y_values`` against ``x_values`` with a fitted regression line.

    The line is fitted with ``scipy.stats.linregress``. Its equation is annotated
    on the plot, and the p-value and r-value of the fit are printed afterwards.

    Parameters
    ----------
    x_values, y_values : array-like
        Paired observations. ``x_values`` must support multiplication by a
        scalar (e.g. a pandas Series or a NumPy array).
    title : str
        Plot title.
    y_label : str
        Label of the y axis.
    text_coordinates : tuple
        Position handed to ``plt.annotate`` for the equation text.
    x_label : str, optional
        Label of the x axis. Defaults to ``'Income (<$600k)'``, the label this
        function previously hard-coded.

    Returns
    -------
    None
    """
    # Fit the regression of y_values on x_values.
    fit = linregress(x_values, y_values)

    # Points on the fitted line for every observed x.
    regress_values = x_values * fit.slope + fit.intercept

    # Equation text; a negative intercept is rendered as "- 3.2" rather than "+ -3.2".
    sign = "-" if fit.intercept < 0 else "+"
    line_eq = f"y = {round(fit.slope, 2)}x {sign} {round(abs(fit.intercept), 2)}"

    # Scatter of the observations with the regression line drawn in red.
    plt.scatter(x_values, y_values)
    plt.plot(x_values, regress_values, "r")

    # Place the equation at text_coordinates and label the plot.
    plt.annotate(line_eq, text_coordinates, fontsize=15, color="red")
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()

    print(f"The p-value is: {fit.pvalue}")
    print(f"The r-value is: {fit.rvalue:.3f}")

In [None]:
# Restrict to incomes below $600,000 once, then take both series from that subset
income_capped = clean_customerdata_df.query("Income < 600000")
x_values = income_capped['Income']
y_values = income_capped['Total_Spent']

# Regress total spending on income and plot the result
plot_linear_regression(
    x_values,
    y_values,
    'Correlation on the Income (<$600k) & Total Spent',
    'Total Spent ($)',
    (10, 0),
)